import csv
import numpy as np
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
# Maps the raw label string found in the second CSV column to the class id
# stored in the labels list; load_data() looks every row's label up here.
# NOTE(review): empty as shown, so any data row would raise KeyError --
# presumably populated elsewhere or stripped from this copy; confirm.
LABELS_MAP = {}
# Display names of the classes, indexed by the class ids produced through
# LABELS_MAP; main() uses it for the report's target names and when printing
# misclassified examples.
# NOTE(review): empty as shown -- confirm where it is filled in.
LABELS_LIST = []
def load_data(path='data.csv'):
    """
    Loads data from csv file.

    The first row of the file is skipped (treated as a header). For every
    following non-blank row, column 0 is taken as the description and
    column 1 is translated through LABELS_MAP.

    Args:
        path: Location of the CSV file to read. Defaults to 'data.csv', the
            file this function always read before the parameter existed.

    Returns:
        A list of descriptions (features) and a list of corresponding labels.
        Both lists are empty when the file holds only a header, only blank
        lines, or nothing at all.

    Raises:
        KeyError: If a row's label is not a key of LABELS_MAP.
        IndexError: If a non-blank row has fewer than two columns.
    """
    features = []
    labels = []
    with open(path, 'r') as infile:
        reader = csv.reader(infile)
        # Skip the header row. The default keeps a completely empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. stray newlines at
            # the end of the file); they carry no example, so skip them
            # instead of failing on row[0].
            if not row:
                continue
            features.append(row[0])
            labels.append(LABELS_MAP[row[1]])
    return features, labels
def most_common(lst):
    """
    Gets the classification that occurs most in list.

    Ties are broken deterministically in favour of the tied value that
    appears first in ``lst``, so the order in which votes are passed acts as
    a priority order.

    Args:
        lst: A non-empty list of hashable votes.

    Returns:
        The element of ``lst`` with the highest number of occurrences.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    # Tally every vote in a single pass rather than calling lst.count() once
    # per candidate.
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    # max() keeps the first maximal item it meets; scanning the list (not a
    # set) therefore makes the tie-break follow the list order instead of
    # arbitrary set iteration order.
    return max(lst, key=counts.__getitem__)
def main():
    """
    Trains three text classifiers, combines them by vote, and prints a report.

    Steps:
        1. Load descriptions and labels with load_data().
        2. Hold out 10% of the examples as a test set.
        3. Fit a logistic regression, a multinomial naive Bayes and an RBF
           SVC pipeline (vectorizer -> feature selector -> classifier) on
           the training portion.
        4. For each test example take the most common of the three
           predictions via most_common().
        5. Print every misclassified test example, then the accuracy, the
           classification report and the confusion matrix of the vote.

    Returns:
        None. All results are written to standard output.
    """
    # Load data from csv file
    features, labels = load_data()

    # Split 90% of data into training data and 10% into test data
    features_train, features_test, labels_train, labels_test = (
        cross_validation.train_test_split(features, labels, test_size=0.1))

    # Create logistic regression classifier, fit (train), and predict
    clf_logistic_regression = Pipeline([
        ('vect', TfidfVectorizer(sublinear_tf=True, max_df=.95, min_df=.001,
                                 stop_words='english')),
        ('selector', SelectPercentile(f_classif, percentile=100)),
        ('clf', LogisticRegression()),
    ])
    clf_logistic_regression.fit(features_train, labels_train)
    pred_logistic_regression = clf_logistic_regression.predict(features_test)

    # Create multinomial naive bayes classifier, fit (train), and predict
    clf_nb = Pipeline([
        ('vect', CountVectorizer(stop_words='english', max_df=0.3,
                                 ngram_range=(1, 4))),
        ('selector', SelectPercentile(f_classif, percentile=100)),
        ('clf', MultinomialNB()),
    ])
    clf_nb.fit(features_train, labels_train)
    pred_nb = clf_nb.predict(features_test)

    # Create SVC classifier, fit (train), and predict
    clf_svc = Pipeline([
        # A float max_df is a proportion of documents and must lie in
        # [0.0, 1.0]. The previous value of 5.0 was out of range: releases
        # that validate parameters reject it, and releases that do not
        # simply never filter any term. 1.0 is the valid spelling of that
        # same "no upper document-frequency limit" behaviour.
        ('vect', TfidfVectorizer(sublinear_tf=True, max_df=1.0,
                                 stop_words='english')),
        ('selector', SelectPercentile(f_classif, percentile=85)),
        ('clf', SVC(kernel='rbf', C=6000, gamma=.0002)),
    ])
    clf_svc.fit(features_train, labels_train)
    pred_svc = clf_svc.predict(features_test)

    # Vote for the best classification from the three algorithms
    pred = [
        most_common([vote_lr, vote_svc, vote_nb])
        for vote_lr, vote_svc, vote_nb
        in zip(pred_logistic_regression, pred_svc, pred_nb)
    ]

    # For debugging, print out which examples the algorithm got wrong.
    # Single-argument print(...) calls emit the same text under Python 2's
    # print statement and are also valid Python 3.
    print("")
    print("Incorrect Classifications:")
    print("--------------------------")
    print("")
    for text, predicted, actual in zip(features_test, pred, labels_test):
        if predicted != actual:
            # Predicted class name first, then the true class name.
            print("%s %s" % (LABELS_LIST[predicted], LABELS_LIST[actual]))
            print(text)
            print("")

    # Print metrics, accuracy, confusion for "ensemble" algorithm
    print(accuracy_score(labels_test, pred))
    print(classification_report(labels_test, pred, target_names=LABELS_LIST))
    print(confusion_matrix(labels_test, pred))
# Run the experiment only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()