liuyenting
5/19/2016 - 5:47 PM

Task 2 - SVM

import jieba
import os
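# train.tsv appears to be tab-separated; the other snippets below index log[0] as
# an id and log[1] as an emoticon label, while this script only segments the text
# column (log[2]).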

def main():
    jieba.set_dictionary('dict.txt.big')
    jieba.load_userdict('userDict.txt')
    with open("train.tsv") as train:
        for line in train:
            log = line.split('\t')
            print(log[2])
            # Segment the text column and join the tokens with " / " for inspection.
            segm = jieba.cut(log[2], cut_all=False)
            segm_result = ""
            for word in segm:
                segm_result += str(word) + " / "
            print(segm_result)
        

if __name__ == '__main__':
    main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Multiclass Naive Bayes SVM (NB-SVM)
https://github.com/lrei/nbsvm

Luis Rei <luis.rei@ijs.si> 
@lmrei
http://luisrei.com

Learns a multiclass (OneVsRest) classifier based on word ngrams.
Uses scikit-learn. Reads input from TSV files.

Licensed under a Creative Commons Attribution-NonCommercial 4.0 
International License.

Based on a work at https://github.com/mesnilgr/nbsvm:
Naive Bayes SVM by Grégoire Mesnil
"""

import sys
import os
import pandas as pd
import argparse
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

import six
from abc import ABCMeta
import numpy as np
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
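# Rough sketch of what the class below does (as read from this particular
# implementation, not necessarily the canonical NB-SVM formulation):
#   1. For every class c, count feature occurrences with additive smoothing
#      alpha, L1-normalize to get p_c, and keep r_c = log(p_c / (1 - p_c)).
#   2. Scale each document's (binary) n-gram vector elementwise by r_c and fit
#      a LinearSVC for class c (one-vs-rest).
#   3. predict() returns the class whose SVM yields the largest decision value;
#      decision_function() returns the classes ordered by that value.
#
# Example invocation (the file name is a placeholder), based on the argparse
# flags defined in __main__ at the bottom of this file:
#   python nbsvm.py --train train.tsv --test test.tsv --ngrams 1,3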

class NBSVM(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)):

    def __init__(self, alpha=1.0, C=1.0, max_iter=10000):
        self.alpha = alpha
        self.max_iter = max_iter
        self.C = C
        self.svm_ = []  # one LinearSVC per class, populated in fit()
        self.savedir = "/tmp2/b03902035/"

    def fit(self, X, y):
        X, y = check_X_y(X, y, 'csr')
        _, n_features = X.shape

        print("preparing labels...")
        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # LabelBinarizer().fit_transform() returns an integer-typed array; casting
        # the (much smaller) label matrix Y to float64 avoids having to cast X.
        Y = Y.astype(np.float64)

        # Count raw events from data
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                                 dtype=np.float64)
        
        print("running compute_ratios")
        self._compute_ratios(X, Y)

        # Fit one LinearSVC per class on the ratio-scaled features (one-vs-rest).
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            svm = LinearSVC(C=self.C, max_iter=self.max_iter)
            Y_i = Y[:,i]
            print("fitting data to svm %d" % i)
            svm.fit(X_i, Y_i)
            self.svm_.append(svm) 

        return self

    def predict(self, X):
        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        D = np.zeros((n_effective_classes, n_examples))

        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            D[i] = self.svm_[i].decision_function(X_i)
        
        return self.classes_[np.argmax(D, axis=0)]
       
    def decision_function(self, X):
        """Return, for every example, the class labels ordered by ascending SVM
        score (so the last row holds the most likely class)."""
        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        D = np.zeros((n_effective_classes, n_examples))

        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            D[i] = self.svm_[i].decision_function(X_i)

        return self.classes_[np.argsort(D, axis=0)]

    def _compute_ratios(self, X, Y):
        """Count feature occurrences and compute ratios."""
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative")

        self.ratios_ += safe_sparse_dot(Y.T, X)  # alpha + per-class feature occurrence counts
        normalize(self.ratios_, norm='l1', axis=1, copy=False)
        # Elementwise log-ratio log(p / (1 - p)) of the smoothed, normalized counts.
        self.ratios_ = np.log(self.ratios_ / (1 - self.ratios_))
        check_array(self.ratios_)
        self.ratios_ = sparse.csr_matrix(self.ratios_)

        #p_c /= np.linalg.norm(p_c, ord=1)
        #ratios[c] = np.log(p_c / (1 - p_c))

    def save(self):
        for i in range(len(self.svm_)):
            joblib.dump(self.svm_[i], self.savedir + 'nbsvm-svm-trigram-' + str(i))

def main(train_file, test_file, ngram=(1, 3)):
    print('loading...')
    train = pd.read_csv(train_file, delimiter='\t', encoding='utf-8', header=None, names=['id', 'label', 'text'])

    # to shuffle:
    #train.iloc[np.random.permutation(len(df))]

    test = pd.read_csv(test_file, delimiter='\t', encoding='utf-8', header=None, names=['id', 'label', 'text'])

    print('vectorizing...')
    vect = CountVectorizer()
    classifier = NBSVM()

    # create pipeline
    clf = Pipeline([('vect', vect), ('nbsvm', classifier)])
    params = {
        'vect__token_pattern': r"\S+",
        'vect__ngram_range': ngram, 
        'vect__binary': True
    }
    clf.set_params(**params)

    #X_train = vect.fit_transform(train['text'])
    #X_test = vect.transform(test['text'])

    print('fitting...')
    clf.fit(train['text'], train['label'])

    print('classifying...')
    #pred = clf.predict(test['text'])
    decision = clf.decision_function(test['text'])
    # decision_function returns classes ordered by ascending score; reverse so
    # that decision[0] holds the best class for every example.
    decision = decision[::-1]
    print('testing...')
    '''
    score = [1, 0.5, 0.333]
    n = len(test['label'])
    tc = 0
    for i in range(n):
        for rank in range(3):
            if decision[rank][i] == test['label'][i]:
                tc += score[rank]
    
    print('acc=%f/%f: %f' % (tc, n, tc/n))
    '''
    fw = open("nbsvm-submit-trigram", "w")
    fw.write("Id,Emoticon\n")
    for i in range(len(test['label'])):
        fw.write(str(i+261955) + ',')
        for rank in range(3):
            fw.write(str(decision[rank][i]) + ' ')
        fw.write('\n')

    fw.close()

    classifier.save()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run NBSVM.')
    parser.add_argument('--train', help='path of the train tsv')
    parser.add_argument('--test', help='path of the test tsv')
    parser.add_argument('--ngrams', help='N-grams considered e.g. 1,3 is uni+bi+tri-grams')
    args = parser.parse_args()

    if args.ngrams:
        ngrams = tuple([int(x) for x in args.ngrams.split(',')])
    else:
        ngrams = (1, 3)

    if not args.train or not args.test:
        print('try --help')
        sys.exit(1)

    main(args.train, args.test, ngrams)
import json
import os
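# Mapping from Chinese part-of-speech names (as they appear in dict-merged.json)
# to jieba-style POS tag codes, e.g. 及物動詞 (transitive verb) -> "v" and
# 名詞 (noun) -> "n"; names without an obvious counterpart map to "".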

tagdict = {
            "不及": "vi",
            "不及物動詞": "vi",
            "介詞": "p",
            "副詞": "d",
            "及物動詞": "v",
            "名詞": "n",
            "外來語": "",
            "定詞": "",
            "形容詞": "a",
            "後置詞": "k",
            "時態詞": "t",
            "語助詞": "u",
            "連接詞": "c",
            "量詞": "q"
}

def main():
    with open("dict-merged.json") as merged:
        obj = json.load(merged)
    posToEng(obj)

    # Write a jieba user dictionary: one "word frequency pos_tag" entry per line.
    with open("userDict.txt", "w") as output:
        for idx in range(len(obj)):
            entry = obj[idx]['word'] + " " + obj[idx]['frequency'] + " " + obj[idx]['pos'] + "\n"
            output.write(entry)

def posToEng(obj):
    # Replace each Chinese POS name with its jieba tag code; unknown names become "".
    for idx in range(len(obj)):
        obj[idx]['pos'] = tagdict.get(obj[idx]['pos'], "")

if __name__ == "__main__":
    main()
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import jieba
import os
import sys
import logging
from random import shuffle

stop_words = [',', '.', '(', ')', '-', '=', '[', ']', '{', '}', '\n', '/', ':', ';', '\'', '\"']
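
# generate_iter streams gensim TaggedDocument objects: each TSV line is segmented
# with jieba (after stripping the literal "EMOTICON" placeholder); rows whose
# label column is '0' are treated as test sentences and tagged TEST_<running
# index>, other rows are tagged TRAIN_<id> and EMOTICON_<label>.  to_array()
# materializes the documents into a list so that perm() can reshuffle them
# between epochs.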

class generate_iter(object):
    def __init__(self, filenames):
        self.fns = filenames

    def __iter__(self):
        id = 0
        for f in self.fns:
            for line in open(f, 'r'):
                log = line.split('\t')
                log[2] = log[2].replace("EMOTICON", "")
                temp = list(w for w in jieba.cut(log[2], cut_all=False) if w != ' ')
                #temp = pynlpir.segment(log[2], pos_tagging=False)
                label = []
                if log[1] == '0':
                    label.append("TEST_" + str(id))
                    id += 1
                else:
                    label.append("TRAIN_" + log[0])
                    label.append("EMOTICON_" + log[1])
                yield TaggedDocument(words=temp, tags=label)

    def to_array(self):
        self.sentence = []
        id = 0
        for f in self.fns:
            for line in open(f, 'r'):
                log = line.split('\t')
                log[2] = log[2].replace("EMOTICON", "")
                temp = list(w for w in jieba.cut(log[2], cut_all=False) if w != ' ')
                #temp = pynlpir.segment(log[2], pos_tagging=False)
                label = []
                if log[1] == '0':
                    label.append("TEST_" + str(id))
                    id += 1
                else:
                    label.append("TRAIN_" + log[0])
                    label.append("EMOTICON_" + log[1])
                self.sentence.append(TaggedDocument(words=temp, tags=label))
            print(len(self.sentence))
        return self.sentence

    def perm(self):
        shuffle(self.sentence)
        return self.sentence
        
def main():
    jieba.set_dictionary('dict.txt.big')
    jieba.load_userdict('userDict.txt')

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    
    '''
    for line in open("corpus.txt"):
        if line == "\n":
            continue
        temp = []
        temp += jieba.cut(line, cut_all=False)
        sentences.append(temp)
    '''
    filenames = ["train.tsv", "test.tsv"]
    sentences = generate_iter(filenames)

    logger.info("building model...")
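    # dm=0 selects gensim's PV-DBOW architecture; the loop below reshuffles the
    # documents and lowers the learning rate alpha by 0.001 on every pass.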

    model = Doc2Vec(dm=0, size=100, workers=4, min_count=2, hs=1, negative=5, alpha=0.025, window=10, sample=1e-3)
    model.build_vocab(sentences.to_array())
    for epoch in range(20):
        logger.info("training number " + '%s' % str(epoch))
        model.train(sentences.perm())
        model.alpha -= 0.001

    model.save("/tmp2/b03902035/d2v-model-100d-20iter-2mincount-dbow")

    #print(model_dbow.similarity('男孩', '女孩'))
    #print(model_dbow.most_similar(positive=['男孩', '帥'], negative=['女孩']))

if __name__ == "__main__":
    main()
from gensim.models.doc2vec import Doc2Vec
import numpy as np
import logging
import os
import sys

n = 200             # number of nearest neighbours fetched per test sentence
testcases = 68733   # number of test sentences
counts = [0 for i in range(41)]  # per-emoticon counts from train.tsv (filled by getCounts)
totalcounts = 261954             # number of training sentences

def writeSubmit(logger, res):
    logger.info("writing submission file")
    with open("d2v-submit-most3_n200_multiple", "w") as f:
        f.write("Id,Emoticon\n")
        for i in range(testcases):
            f.write(str(i+261955) + ',')
            for j in range(3):
                f.write(str(res[i][j][1]) + ' ')
            f.write('\n')
            if i % 5000 == 0:
                logger.info("processed %s sentences" % str(i))
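
# For each test sentence, the function below fetches the n most similar tagged
# documents from the Doc2Vec model; every training neighbour at rank j votes
# (n - j) * counts[label] points for its emoticon label, and the three
# highest-scoring labels are kept.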

def getHighest3EMOs(logger, model, train_label):
    res = []
    top = []
    logger.info("storing most_similar top %s sentences" % str(n))
    for i in range(testcases):
        top.append(model.docvecs.most_similar('TEST_' + str(i), topn=n))
        if i % 100 == 0:
            logger.info("processed %s sentences" % str(i))

    logger.info("storing complete")
    logger.info("start computing highest 3 EMOTICONs")	
    for i in range(testcases):
        reslist = [[0, k] for k in range(41)]
        for j in range(n):
            if "TRAIN_" in top[i][j][0]:
                idx = int(top[i][j][0].replace("TRAIN_", ""))
                print("j: " + str(j) + ", idx: " + str(idx))
                reslist[train_label[idx-1]][0] += (n - j) * counts[train_label[idx-1]]
        print(str(reslist))
        reslist_sort = sorted(reslist, key=lambda x : x[0], reverse=True)
        print(str(reslist_sort))
        res.append(reslist_sort[0:3])
        if i % 5000 == 0:
            logger.info("processed %s sentences" % str(i))
        
    return res

def getCounts():
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        counts[int(log[1])] += 1

def getLabels():
    arr = []
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        arr.append(int(log[1]))
    return arr

def main():

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("loading model")
    model = Doc2Vec.load('d2v-model-0508')
	
    logger.info("getting labels from training data")
    train_label = getLabels()

    logger.info("getting counts of emoticons from training data")
    getCounts()
	
    logger.info("getting highest probable EMOTICONs")
    res = getHighest3EMOs(logger, model, train_label)

    writeSubmit(logger, res)

if __name__ == "__main__":
    main()
from gensim.models.doc2vec import Doc2Vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from sklearn.linear_model import LogisticRegression, SGDClassifier, LogisticRegressionCV
from sklearn import svm
from sklearn.externals import joblib
import jieba
import numpy as np
import logging
import os
import sys

testcases = range(261954)        # indices of the training documents (this script scores the training set)
n = 200
counts = [0 for i in range(41)]  # per-emoticon counts from train.tsv (filled by getCounts)
totalcounts = 261954             # number of training sentences
train_doc = []                   # segmented training texts (filled by getDoc)
model_name = ['dbow', 'dmm']

def writeSubmit(logger, res):
    logger.info("writing submission file")
    with open("d2v-submit-most3", "w") as f:
        f.write("Id,Emoticon\n")
        for i in range(68733):
            f.write(str(i+261955) + ',')
            for j in range(3):
                f.write(str(res[i][j]) + ' ')
            f.write('\n')
            if i % 5000 == 0:
                logger.info("processed %s sentences" % str(i))
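
# Scores predictions with what appears to be the competition metric: 1, 0.5 or
# 0.333 points when the true label is the first, second or third guess
# respectively, averaged over all sentences.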

def evaluate(res, labels):
    for result in res:
        point = 0
        total = len(result)
        for i in range(total):
            if(result[i][0][1] == labels[i]):
                point += 1
            elif(result[i][1][1] == labels[i]):
                point += 0.5
            elif(result[i][2][1] == labels[i]):
                point += 0.333
        print("point/total = %s" % str(point) + "/%s" % str(total))
        print("average point: %s" % str(point/total))
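
# For each Doc2Vec model, getHighest3EMOs below fits a LogisticRegression on the
# training document vectors and their emoticon labels, then ranks the 40 classes
# by predicted probability.  Note that it scores the 'TRAIN_' vectors themselves,
# so evaluate() reports a training-set score rather than held-out accuracy.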

def getHighest3EMOs(logger, models, train_label):
    res = []
    n_model = 0
    for model in models:
        restmp = []
        train_vec = list(model.docvecs['TRAIN_' + str(i+1)] for i in range(261954))
    
        logreg = LogisticRegression(n_jobs=4, solver='liblinear')
        logger.info("fitting data to classifier")
        logreg.fit(train_vec, train_label)
        #joblib.dump(logreg, 'logregs/' + model_name[n_model] + '-all-100d') 
    
        #logreg = joblib.load('logregs/dbow-all')
        logger.info("start computing highest 3 EMOTICONs")	
        for i in range(len(testcases)):
            proba = logreg.predict_proba(model.docvecs['TRAIN_' + str(i+1)].reshape(1, -1))
            #proba = logreg.predict_proba(model.infer_vector(train_doc[testcases[i]]).reshape(1, -1))
            reslist = [[proba[0][j], j+1] for j in range(40)]
            reslist_sort = sorted(reslist, key=lambda x : x[0], reverse=True)
            #print(reslist)
            #print(reslist_sort)
        
            restmp.append(reslist_sort[0:3])
            if i % 1000 == 0:
                logger.info("processed %s sentences" % str(i))
        res.append(restmp)
        n_model += 1
    
    return res

def getDoc():
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        log[2] = log[2].replace("EMOTICON", "")
        train_doc.append([w for w in jieba.cut(log[2], cut_all=False) if w != ' '])

def getCounts():
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        counts[int(log[1])] += 1

def getLabels():
    arr = []
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        arr.append(int(log[1]))
    return arr

def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    jieba.set_dictionary('dict.txt.big')
    jieba.load_userdict('userDict.txt')
    logger.info("loading training doc")
    getDoc()

    logger.info("loading model")
    model1 = Doc2Vec.load('/tmp2/b03902035/d2v-model-100d-20iter-2mincount-dbow')
    model2 = Doc2Vec.load('/tmp2/b03902035/d2v-model-100d-20iter-2mincount-dmm')
    modelconcat = ConcatenatedDoc2Vec([model1, model2])
    models = [modelconcat]

    logger.info("getting labels from training data")
    train_label = getLabels()
    
    logger.info("getting counts of emoticons from training data")
    getCounts()
    
    logger.info("getting highest probable EMOTICONs")
    res = getHighest3EMOs(logger, models, train_label)
    
    evaluate(res, train_label)
    #writeSubmit(logger, res)

if __name__ == "__main__":
    main()