I am trying to implement the k-fold cross-validation algorithm in python. I know SKLearn provides an implementation but still... This is my code as of right now.
我正在尝试在python中实现k-fold交叉验证算法。我知道SKLearn提供了一个实现,但是…这是我现在的代码。
from sklearn import metrics
import numpy as np
class Cross_Validation:
@staticmethod
def partition(vector, fold, k):
size = vector.shape[0]
start = (size/k)*fold
end = (size/k)*(fold+1)
validation = vector[start:end]
if str(type(vector)) == "":
indices = range(start, end)
mask = np.ones(vector.shape[0], dtype=bool)
mask[indices] = False
training = vector[mask]
elif str(type(vector)) == "":
training = np.concatenate((vector[:start], vector[end:]))
return training, validation
@staticmethod
def Cross_Validation(learner, k, examples, labels):
train_folds_score = []
validation_folds_score = []
for fold in range(0, k):
training_set, validation_set = Cross_Validation.partition(examples, fold, k)
training_labels, validation_labels = Cross_Validation.partition(labels, fold, k)
learner.fit(training_set, training_labels)
training_predicted = learner.predict(training_set)
validation_predicted = learner.predict(validation_set)
train_folds_score.append(metrics.accuracy_score(training_labels, training_predicted))
validation_folds_score.append(metrics.accuracy_score(validation_labels, validation_predicted))
return train_folds_score, validation_folds_score
The learner parameter is a classifier from SKlearn library, k is the number of folds, examples is a sparse matrix produced by the CountVectorizer (again SKlearn) that is the representation of the bag of words. For example:
学习者参数是SKlearn库中的分类器,k是折叠数,例子是由CountVectorizer(再次SKlearn)制作的稀疏矩阵,它是单词包的表示形式。例如:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from Cross_Validation import Cross_Validation as cv
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform("""textual data""")
clfMNB = MultinomialNB(alpha=.0001)
score = cv.Cross_Validation(clfMNB, 10, data, labels)
print "Train score" + str(score[0])
print "Test score" + str(score[1])
I'm assuming there is some logic error somewhere since the scores are 95% on the training set (as expected) but practically 0 on the test test, but I can't find it.
我假设有一些逻辑上的错误,因为在训练集上的分数是95%(如预期的),但是在测试测试中几乎是0,但是我找不到。
I hope I was clear. Thanks in advance.
我希望我是清白的。提前谢谢。
________________________________EDIT___________________________________
________________________________EDIT___________________________________
This is the code that loads the text into the vector that can be passed to the vectorizer. It also returns the label vector.
这是将文本加载到可以传递给vectorizer的向量的代码。它还返回标签向量。
from nltk.tokenize import word_tokenize
from Categories_Data import categories
import numpy as np
import codecs
import glob
import os
import re
class Data_Preprocessor:
def tokenize(self, text):
tokens = word_tokenize(text)
alpha = [t for t in tokens if unicode(t).isalpha()]
return alpha
def header_not_fully_removed(self, text):
if ":" in text.splitlines()[0]:
return len(text.splitlines()[0].split(":")[0].split()) == 1
else:
return False
def strip_newsgroup_header(self, text):
_before, _blankline, after = text.partition('\n\n')
if len(after) > 0 and self.header_not_fully_removed(after):
after = self.strip_newsgroup_header(after)
return after
def strip_newsgroup_quoting(self, text):
_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'r'|^In article|^Quoted from|^\||^>)')
good_lines = [line for line in text.split('\n')
if not _QUOTE_RE.search(line)]
return '\n'.join(good_lines)
def strip_newsgroup_footer(self, text):
lines = text.strip().split('\n')
for line_num in range(len(lines) - 1, -1, -1):
line = lines[line_num]
if line.strip().strip('-') == '':
break
if line_num > 0:
return '\n'.join(lines[:line_num])
else:
return text
def raw_to_vector(self, path, to_be_stripped=["header", "footer", "quoting"], noise_threshold=-1):
base_dir = os.getcwd()
train_data = []
label_data = []
for category in categories:
os.chdir(base_dir)
os.chdir(path+"/"+category[0])
for filename in glob.glob("*"):
with codecs.open(filename, 'r', encoding='utf-8', errors='replace') as target:
data = target.read()
if "quoting" in to_be_stripped:
data = self.strip_newsgroup_quoting(data)
if "header" in to_be_stripped:
data = self.strip_newsgroup_header(data)
if "footer" in to_be_stripped:
data = self.strip_newsgroup_footer(data)
if len(data) > noise_threshold:
train_data.append(data)
label_data.append(category[1])
os.chdir(base_dir)
return np.array(train_data), np.array(label_data)
This is what "from Categories_Data import categories" imports...
这是“从分类数据导入类别”导入的内容……
categories = [
('alt.atheism',0),
('comp.graphics',1),
('comp.os.ms-windows.misc',2),
('comp.sys.ibm.pc.hardware',3),
('comp.sys.mac.hardware',4),
('comp.windows.x',5),
('misc.forsale',6),
('rec.autos',7),
('rec.motorcycles',8),
('rec.sport.baseball',9),
('rec.sport.hockey',10),
('sci.crypt',11),
('sci.electronics',12),
('sci.med',13),
('sci.space',14),
('soc.religion.christian',15),
('talk.politics.guns',16),
('talk.politics.mideast',17),
('talk.politics.misc',18),
('talk.religion.misc',19)
]
2
The reason why your validation score is low is subtle.
你的验证分数低的原因很微妙。
The issue is how you have partitioned the dataset. Remember, when doing cross-validation you should randomly split the dataset. It is the randomness that you are missing.
问题是如何划分数据集。记住,在进行交叉验证时,应该随机地分割数据集。这就是你缺少的随机性。
Your data is loaded category by category, which means in your input dataset, class labels and examples follow one after the other. By not doing the random split, you have completely removed a class which your model never sees during the training phase and hence you get a bad result on your test/validation phase.
您的数据是按类别加载的,这意味着在您的输入数据集中,类标签和示例会跟随一个接着一个。通过不进行随机分割,您已经完全删除了在训练阶段中您的模型从未看到的类,因此您将在测试/验证阶段得到一个糟糕的结果。
You can solve this by doing a random shuffle. So, do this:
你可以通过随机洗牌来解决这个问题。所以,这样做:
from sklearn.utils import shuffle
processor = Data_Preprocessor()
td, tl = processor.raw_to_vector(path="C:/Users/Pankaj/Downloads/ng/")
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform(td)
# Shuffle the data and labels
data, tl = shuffle(data, tl, random_state=0)
clfMNB = MultinomialNB(alpha=.0001)
score = Cross_Validation.Cross_Validation(clfMNB, 10, data, tl)
print("Train score" + str(score[0]))
print("Test score" + str(score[1]))