richard-to
12/6/2014 - 12:28 AM

sklearn_example2.py

import csv
import numpy as np
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


LABELS_MAP = {}   # label name -> integer id, built up in load_data()
LABELS_LIST = []  # integer id -> label name, used when printing results


def load_data():
    """
    Loads data from csv file.

    Returns:
        A list of descriptions (features) and a list of corresponding labels
    """
    features = []
    labels = []
    with open('data.csv', 'r') as infile:
        reader = csv.reader(infile)
        next(reader)
        for row in reader:
            features.append(row[0])
            labels.append(LABELS_MAP[row[1]])
    return features, labels


def most_common(lst):
    """
    Gets the classification that occurs most in list.
    """
    return max(set(lst), key=lst.count)


def main():

    # Load data from csv file
    features, labels = load_data()

    # Split 90% of data into training data and 10% into test data
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        features, labels, test_size=0.1)
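    # Note: no random_state is passed, so the split (and therefore the
    # scores printed below) will differ from run to run.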


    # Create logistic regression classifier, fit (train), and predict
    clf_logistic_regressions = Pipeline([
        # Convert raw text into TF-IDF weighted term vectors
        ('vect', TfidfVectorizer(sublinear_tf=True, max_df=.95, min_df=.001, stop_words='english')),
        # percentile=100 keeps every feature, so this stage is a pass-through
        ('selector', SelectPercentile(f_classif, percentile=100)),
        ('clf', LogisticRegression())
    ])
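    # fit() learns the vocabulary, feature selection and model weights from
    # the training set only; predict() reuses those fitted transforms on the
    # held-out test set.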
    clf_logistic_regressions.fit(features_train, labels_train)
    pred_logistic_regression = clf_logistic_regressions.predict(features_test)


    # Create multinomial naive bayes classifier, fit (train), and predict
    clf_nb = Pipeline([
        # Raw term counts (not TF-IDF) over unigrams through 4-grams
        ('vect', CountVectorizer(stop_words='english', max_df=0.3, ngram_range=(1, 4))),
        # percentile=100 keeps every feature, so this stage is a pass-through
        ('selector', SelectPercentile(f_classif, percentile=100)),
        ('clf', MultinomialNB())
    ])
    clf_nb.fit(features_train, labels_train)
    pred_nb = clf_nb.predict(features_test)


    # Create SVC classifier, fit (train), and predict
    clf_svc = Pipeline([
        # A float max_df must lie in [0.0, 1.0]; 1.0 keeps every term
        ('vect', TfidfVectorizer(sublinear_tf=True, max_df=1.0, stop_words='english')),
        # Keep only the top 85% of features ranked by the ANOVA F-score
        ('selector', SelectPercentile(f_classif, percentile=85)),
        ('clf', SVC(kernel='rbf', C=6000, gamma=.0002))
    ])
    clf_svc.fit(features_train, labels_train)
    pred_svc = clf_svc.predict(features_test)


    # Vote for the best classification from the three algorithms
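    # With three classifiers a 2-1 majority wins; if all three disagree,
    # most_common() just picks one of the tied predictions arbitrarily.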
    pred = []
    for i in xrange(len(pred_logistic_regression)):
        pred.append(most_common([pred_logistic_regression[i], pred_svc[i], pred_nb[i]]))


    # For debugging, print out which examples the algorithm got wrong
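    # Each mismatch is printed as: predicted label, actual label, then the
    # original description text.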
    print
    print "Incorrect Classifications:"
    print "--------------------------"
    print
    for i in xrange(len(pred)):
        if pred[i] != labels_test[i]:
            print LABELS_LIST[pred[i]], LABELS_LIST[labels_test[i]]
            print features_test[i]
            print


    # Print metrics, accuracy, confusion for "ensemble" algorithm
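    # classification_report shows per-class precision/recall/F1; in the
    # confusion matrix, rows are actual classes and columns are predictions.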
    print accuracy_score(labels_test, pred)
    print classification_report(labels_test, pred, target_names=LABELS_LIST)
    print confusion_matrix(labels_test, pred)


if __name__ == '__main__':
    main()