Task 2 - SVM
import jieba


def main():
    jieba.set_dictionary('dict.txt.big')
    jieba.load_userdict('userDict.txt')
    with open("train.tsv") as train:
        for line in train:
            log = line.split('\t')
            print(log[2])
            segm = jieba.cut(log[2], cut_all=False)
            segm_result = ""
            for word in segm:
                segm_result += str(word) + " / "
            print(segm_result)


if __name__ == '__main__':
    main()
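All of the scripts in this task read the same three-column TSV: an id, an emoticon label (0 marks a test row), and the raw text. A minimal sketch of that layout with a made-up row:

# Made-up sample row in the assumed train.tsv layout: id, label, text.
sample = "1\t5\t今天天氣很好 EMOTICON\n"
log = sample.split('\t')
print(log[1])  # emoticon label column
print(log[2])  # raw text column, fed to jieba.cut above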
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Multiclass Naive Bayes SVM (NB-SVM)
https://github.com/lrei/nbsvm
Luis Rei <luis.rei@ijs.si>
@lmrei
http://luisrei.com

Learns a multiclass (OneVsRest) classifier based on word ngrams.
Uses scikit-learn. Reads input from TSV files.

Licensed under a Creative Commons Attribution-NonCommercial 4.0
International License.
Based on a work at https://github.com/mesnilgr/nbsvm:
Naive Bayes SVM by Grégoire Mesnil
"""
import sys
import argparse
import numpy as np
import pandas as pd
import six
from abc import ABCMeta
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize, LabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot


class NBSVM(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)):

    def __init__(self, alpha=1.0, C=1.0, max_iter=10000):
        self.alpha = alpha
        self.max_iter = max_iter
        self.C = C
        self.svm_ = []  # one LinearSVC per class (OneVsRest)
        self.savedir = "/tmp2/b03902035/"

    def fit(self, X, y):
        X, y = check_X_y(X, y, 'csr')
        _, n_features = X.shape

        print("preparing labels...")
        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64;
        # converting Y to float64 means X does not have to be cast as well.
        Y = Y.astype(np.float64)

        # Count raw events from data
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                               dtype=np.float64)
        print("running compute_ratios")
        self._compute_ratios(X, Y)

        # Fit one SVM per class on the ratio-scaled features
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            svm = LinearSVC(C=self.C, max_iter=self.max_iter)
            Y_i = Y[:, i]
            print("fitting data to svm %d" % i)
            svm.fit(X_i, Y_i)
            self.svm_.append(svm)

        return self

    def predict(self, X):
        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        D = np.zeros((n_effective_classes, n_examples))
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            D[i] = self.svm_[i].decision_function(X_i)

        return self.classes_[np.argmax(D, axis=0)]

    def decision_function(self, X):
        """Return classes ranked per example, worst first (argsort ascending)."""
        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        D = np.zeros((n_effective_classes, n_examples))
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            D[i] = self.svm_[i].decision_function(X_i)

        return self.classes_[np.argsort(D, axis=0)]

    def _compute_ratios(self, X, Y):
        """Count feature occurrences and compute ratios."""
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative")

        self.ratios_ += safe_sparse_dot(Y.T, X)  # alpha + per-class counts
        normalize(self.ratios_, norm='l1', axis=1, copy=False)
        row_calc = lambda r: np.log(np.divide(r, (1 - r)))
        self.ratios_ = np.apply_along_axis(row_calc, axis=1, arr=self.ratios_)
        check_array(self.ratios_)
        self.ratios_ = sparse.csr_matrix(self.ratios_)
        #p_c /= np.linalg.norm(p_c, ord=1)
        #ratios[c] = np.log(p_c / (1 - p_c))

    def save(self):
        for i in range(len(self.svm_)):
            joblib.dump(self.svm_[i],
                        self.savedir + 'nbsvm-svm-trigram-' + str(i))


def main(train_file, test_file, ngram=(1, 3)):
    print('loading...')
    train = pd.read_csv(train_file, delimiter='\t', encoding='utf-8',
                        header=None, names=['id', 'label', 'text'])
    # to shuffle:
    #train.iloc[np.random.permutation(len(df))]
    test = pd.read_csv(test_file, delimiter='\t', encoding='utf-8',
                       header=None, names=['id', 'label', 'text'])

    print('vectorizing...')
    vect = CountVectorizer()
    classifier = NBSVM()

    # create pipeline
    clf = Pipeline([('vect', vect), ('nbsvm', classifier)])
    params = {
        'vect__token_pattern': r"\S+",
        'vect__ngram_range': ngram,
        'vect__binary': True
    }
    clf.set_params(**params)
    #X_train = vect.fit_transform(train['text'])
    #X_test = vect.transform(test['text'])

    print('fitting...')
    clf.fit(train['text'], train['label'])

    print('classifying...')
    #pred = clf.predict(test['text'])
    decision = clf.decision_function(test['text'])
    # argsort is ascending, so flip to put the best-scoring class first
    decision = decision[::-1]

    print('testing...')
    '''
    score = [1, 0.5, 0.333]
    n = len(test['label'])
    tc = 0
    for i in range(n):
        for rank in range(3):
            if decision[rank][i] == test['label'][i]:
                tc += score[rank]
    print('acc=%f/%f: %f' % (tc, n, tc / n))
    '''

    with open("nbsvm-submit-trigram", "w") as fw:
        fw.write("Id,Emoticon\n")
        for i in range(len(test['label'])):
            fw.write(str(i + 261955) + ',')
            for rank in range(3):
                fw.write(str(decision[rank][i]) + ' ')
            fw.write('\n')

    classifier.save()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run NBSVM.')
    parser.add_argument('--train', help='path of the train tsv')
    parser.add_argument('--test', help='path of the test tsv')
    parser.add_argument('--ngrams',
                        help='N-grams considered e.g. 1,3 is uni+bi+tri-grams')
    args = parser.parse_args()

    if args.ngrams:
        ngrams = tuple(int(x) for x in args.ngrams.split(','))
    else:
        ngrams = (1, 3)

    if not args.train or not args.test:
        print('try --help')
        sys.exit(1)

    main(args.train, args.test, ngrams)
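The heart of NB-SVM is the log-count ratio built in _compute_ratios: add-alpha smoothed per-class feature counts are L1-normalized into r, and each feature is weighted by log(r / (1 - r)) before one LinearSVC per class is fit on the rescaled counts. A minimal NumPy sketch of that same computation on a toy 2-class, 3-feature count matrix (all numbers made up):

import numpy as np
from sklearn.preprocessing import normalize

alpha = 1.0
X = np.array([[1, 0, 1],     # one document of class 0
              [0, 1, 1]],    # one document of class 1
             dtype=np.float64)
Y = np.array([[1, 0],
              [0, 1]], dtype=np.float64)  # one-hot labels

ratios = np.full((2, 3), alpha) + Y.T.dot(X)  # alpha + per-class counts
normalize(ratios, norm='l1', axis=1, copy=False)
ratios = np.log(ratios / (1 - ratios))        # the NB log-count ratios
print(ratios)  # row i = feature weights applied to X before svm_[i]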
import json

# Chinese POS labels from the merged dictionary, mapped to the one-letter
# tags jieba understands.
tagdict = {
    "不及": "vi",        # intransitive verb (truncated label)
    "不及物動詞": "vi",  # intransitive verb
    "介詞": "p",         # preposition
    "副詞": "d",         # adverb
    "及物動詞": "v",     # transitive verb
    "名詞": "n",         # noun
    "外來語": "",        # loanword
    "定詞": "",          # determiner
    "形容詞": "a",       # adjective
    "後置詞": "k",       # postposition
    "時態詞": "t",       # tense/aspect word
    "語助詞": "u",       # particle
    "連接詞": "c",       # conjunction
    "量詞": "q"          # measure word
}


def main():
    with open("dict-merged.json") as merged:
        obj = json.load(merged)
    posToEng(obj)
    with open("userDict.txt", "w") as output:
        for idx in range(len(obj)):
            line = (obj[idx]['word'] + " " + obj[idx]['frequency'] + " " +
                    obj[idx]['pos'] + "\n")
            output.write(line)


def posToEng(obj):
    for idx in range(len(obj)):
        if obj[idx]['pos'] in tagdict:
            obj[idx]['pos'] = tagdict[obj[idx]['pos']]
        else:
            obj[idx]['pos'] = ""


if __name__ == "__main__":
    main()
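For clarity, one made-up entry walked through the conversion; the output format is jieba's user-dictionary "word frequency pos" line:

# Made-up entry in the shape the script expects from dict-merged.json:
entry = {"word": "便當", "frequency": "100", "pos": "名詞"}
# posToEng maps "名詞" to "n" via tagdict, so the emitted line is:
print(entry["word"] + " " + entry["frequency"] + " " + "n")
# -> 便當 100 n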
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import jieba
import os
import sys
import logging
from random import shuffle

stop_words = [',', '.', '(', ')', '-', '=', '[', ']', '{', '}', '\n', '/',
              ':', ';', '\'', '\"']


class generate_iter(object):
    def __init__(self, filenames):
        self.fns = filenames

    def __iter__(self):
        id = 0
        for f in self.fns:
            for line in open(f, 'r'):
                log = line.split('\t')
                log[2] = log[2].replace("EMOTICON", "")
                temp = list(w for w in jieba.cut(log[2], cut_all=False)
                            if w != ' ')
                #temp = pynlpir.segment(log[2], pos_tagging=False)
                label = []
                if log[1] == '0':
                    label.append("TEST_" + str(id))
                    id += 1
                else:
                    label.append("TRAIN_" + log[0])
                    label.append("EMOTICON_" + log[1])
                yield TaggedDocument(words=temp, tags=label)

    def to_array(self):
        self.sentence = []
        id = 0
        for f in self.fns:
            for line in open(f, 'r'):
                log = line.split('\t')
                log[2] = log[2].replace("EMOTICON", "")
                temp = list(w for w in jieba.cut(log[2], cut_all=False)
                            if w != ' ')
                #temp = pynlpir.segment(log[2], pos_tagging=False)
                label = []
                if log[1] == '0':
                    label.append("TEST_" + str(id))
                    id += 1
                else:
                    label.append("TRAIN_" + log[0])
                    label.append("EMOTICON_" + log[1])
                self.sentence.append(TaggedDocument(words=temp, tags=label))
        print(len(self.sentence))
        return self.sentence

    def perm(self):
        shuffle(self.sentence)
        return self.sentence


def main():
    jieba.set_dictionary('dict.txt.big')
    jieba.load_userdict('userDict.txt')

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    '''
    for line in open("corpus.txt"):
        if line == "\n":
            continue
        temp = []
        temp += jieba.cut(line, cut_all=False)
        sentences.append(temp)
    '''

    filenames = ["train.tsv", "test.tsv"]
    sentences = generate_iter(filenames)

    logger.info("building model...")
    # pre-1.0 gensim API: Doc2Vec(size=...) and train() without
    # total_examples/epochs arguments
    model = Doc2Vec(dm=0, size=100, workers=4, min_count=2, hs=1, negative=5,
                    alpha=0.025, window=10, sample=1e-3)
    model.build_vocab(sentences.to_array())
    for epoch in range(20):
        logger.info("training number " + '%s' % str(epoch))
        model.train(sentences.perm())
        model.alpha -= 0.001
    model.save("/tmp2/b03902035/d2v-model-100d-20iter-2mincount-dbow")
    #print(model_dbow.similarity('男孩', '女孩'))
    #print(model_dbow.most_similar(positive=['男孩', '帥'], negative=['女孩']))


if __name__ == "__main__":
    main()
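Once saved, the model can be queried by the tags generate_iter assigned (TRAIN_<id>, TEST_<i>, EMOTICON_<label>). A minimal sketch using the same pre-1.0 gensim docvecs API as the scripts below, assuming the model file saved above exists:

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("/tmp2/b03902035/d2v-model-100d-20iter-2mincount-dbow")
vec = model.docvecs['TRAIN_1']                       # stored document vector
print(model.docvecs.most_similar('TEST_0', topn=5))  # nearest tagged docs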
from gensim.models.doc2vec import Doc2Vec
import logging
import os
import sys

n = 200
testcases = 68733
counts = [0 for i in range(41)]
totalcounts = 261954


def writeSubmit(logger, res):
    logger.info("writing submission file")
    with open("d2v-submit-most3_n200_multiple", "w") as f:
        f.write("Id,Emoticon\n")
        for i in range(testcases):
            f.write(str(i + 261955) + ',')
            for j in range(3):
                f.write(str(res[i][j][1]) + ' ')
            f.write('\n')
            if i % 5000 == 0:
                logger.info("processed %s sentences" % str(i))


def getHighest3EMOs(logger, model, train_label):
    res = []
    top = []
    logger.info("storing most_similar top %s sentences" % str(n))
    for i in range(testcases):
        top.append(model.docvecs.most_similar('TEST_' + str(i), topn=n))
        if i % 100 == 0:
            logger.info("processed %s sentences" % str(i))
    logger.info("storing complete")

    logger.info("start computing highest 3 EMOTICONs")
    for i in range(testcases):
        reslist = [[0, k] for k in range(41)]
        for j in range(n):
            if "TRAIN_" in top[i][j][0]:
                idx = int(top[i][j][0].replace("TRAIN_", ""))
                print("j: " + str(j) + ", idx: " + str(idx))
                reslist[train_label[idx - 1]][0] += (n - j) * counts[train_label[idx - 1]]
        print(str(reslist))
        reslist_sort = sorted(reslist, key=lambda x: x[0], reverse=True)
        print(str(reslist_sort))
        res.append(reslist_sort[0:3])
        if i % 5000 == 0:
            logger.info("processed %s sentences" % str(i))
    return res


def getCounts():
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        counts[int(log[1])] += 1


def getLabels():
    arr = []
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        arr.append(int(log[1]))
    return arr


def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("loading model")
    model = Doc2Vec.load('d2v-model-0508')

    logger.info("getting labels from training data")
    train_label = getLabels()
    logger.info("getting counts of emoticons from training data")
    getCounts()

    logger.info("getting highest probable EMOTICONs")
    res = getHighest3EMOs(logger, model, train_label)
    writeSubmit(logger, res)


if __name__ == "__main__":
    main()
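The vote in getHighest3EMOs weights each TRAIN_ neighbor by its similarity rank and by the global frequency of its label: label c gains (n - rank) * counts[c] per neighbor. A toy walk-through with made-up numbers (dicts stand in for the script's arrays):

n = 5
counts = {3: 100, 7: 40}                  # made-up label frequencies
train_label = {10: 3, 11: 7}              # made-up labels keyed by train id
neighbors = [('TRAIN_10', 0.9), ('TEST_2', 0.8), ('TRAIN_11', 0.7)]

scores = {}
for rank, (tag, _) in enumerate(neighbors):
    if tag.startswith('TRAIN_'):
        label = train_label[int(tag.replace('TRAIN_', ''))]
        scores[label] = scores.get(label, 0) + (n - rank) * counts[label]
print(sorted(scores.items(), key=lambda x: x[1], reverse=True))
# -> [(3, 500), (7, 120)]: label 3 gets (5-0)*100, label 7 gets (5-2)*40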
from gensim.models.doc2vec import Doc2Vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
import jieba
import logging
import os
import sys

testcases = range(261954)
n = 200
counts = [0 for i in range(41)]
totalcounts = 261954
train_doc = []
model_name = ['dbow', 'dmm']


def writeSubmit(logger, res):
    # expects a single model's result list, e.g. res[0] from getHighest3EMOs
    logger.info("writing submission file")
    with open("d2v-submit-most3", "w") as f:
        f.write("Id,Emoticon\n")
        for i in range(68733):
            f.write(str(i + 261955) + ',')
            for j in range(3):
                f.write(str(res[i][j]) + ' ')
            f.write('\n')
            if i % 5000 == 0:
                logger.info("processed %s sentences" % str(i))


def evaluate(res, labels):
    for result in res:
        point = 0
        total = len(result)
        for i in range(total):
            if result[i][0][1] == labels[i]:
                point += 1
            elif result[i][1][1] == labels[i]:
                point += 0.5
            elif result[i][2][1] == labels[i]:
                point += 0.333
        print("point/total = %s/%s" % (str(point), str(total)))
        print("average point: %s" % str(point / total))


def getHighest3EMOs(logger, models, train_label):
    res = []
    n_model = 0
    for model in models:
        restmp = []
        train_vec = list(model.docvecs['TRAIN_' + str(i + 1)]
                         for i in range(261954))
        logreg = LogisticRegression(n_jobs=4, solver='liblinear')
        logger.info("fitting data to classifier")
        logreg.fit(train_vec, train_label)
        #joblib.dump(logreg, 'logregs/' + model_name[n_model] + '-all-100d')
        #logreg = joblib.load('logregs/dbow-all')

        logger.info("start computing highest 3 EMOTICONs")
        for i in range(len(testcases)):
            # scores the stored training vectors themselves (a sanity check),
            # not freshly inferred vectors
            proba = logreg.predict_proba(
                model.docvecs['TRAIN_' + str(i + 1)].reshape(1, -1))
            #proba = logreg.predict_proba(model.infer_vector(train_doc[testcases[i]]).reshape(1, -1))
            # assumes the labels are exactly 1..40, so column j maps to j + 1
            reslist = [[proba[0][j], j + 1] for j in range(40)]
            reslist_sort = sorted(reslist, key=lambda x: x[0], reverse=True)
            restmp.append(reslist_sort[0:3])
            if i % 1000 == 0:
                logger.info("processed %s sentences" % str(i))
        res.append(restmp)
        n_model += 1
    return res


def getDoc():
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        log[2] = log[2].replace("EMOTICON", "")
        train_doc.append([w for w in jieba.cut(log[2], cut_all=False)
                          if w != ' '])


def getCounts():
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        counts[int(log[1])] += 1


def getLabels():
    arr = []
    for line in open("train.tsv", "r"):
        log = line.split('\t')
        arr.append(int(log[1]))
    return arr


def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    jieba.set_dictionary('dict.txt.big')
    jieba.load_userdict('userDict.txt')

    logger.info("loading training doc")
    getDoc()

    logger.info("loading model")
    model1 = Doc2Vec.load('/tmp2/b03902035/d2v-model-100d-20iter-2mincount-dbow')
    model2 = Doc2Vec.load('/tmp2/b03902035/d2v-model-100d-20iter-2mincount-dmm')
    modelconcat = ConcatenatedDoc2Vec([model1, model2])
    models = [modelconcat]

    logger.info("getting labels from training data")
    train_label = getLabels()
    logger.info("getting counts of emoticons from training data")
    getCounts()

    logger.info("getting highest probable EMOTICONs")
    res = getHighest3EMOs(logger, models, train_label)
    evaluate(res, train_label)
    #writeSubmit(logger, res[0])


if __name__ == "__main__":
    main()
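evaluate implements the same top-3 scoring rule as the commented block in the NB-SVM script: 1, 0.5, or 0.333 points when the true label appears at rank 1, 2, or 3. A toy check with made-up predictions:

preds = [
    [[0.6, 4], [0.3, 9], [0.1, 2]],   # true label 4 at rank 1 -> 1 point
    [[0.5, 7], [0.4, 1], [0.1, 5]],   # true label 1 at rank 2 -> 0.5 points
]
labels = [4, 1]

point = 0
for i, label in enumerate(labels):
    for rank, weight in enumerate([1, 0.5, 0.333]):
        if preds[i][rank][1] == label:
            point += weight
            break
print("point/total = %s/%s" % (point, len(labels)))  # -> 1.5/2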