pedro-pinho
8/20/2018 - 3:24 PM

Python

Python and Machine Learning

def alert(msg):
    '''return a javascript alert window with the given message'''
    return '''<script type="text/javascript" charset="utf-8">
    alert("%s");
</script>''' % msg


from os.path import dirname
from os.path import join as pathjoin

def getPath(suffix=""):
    '''get the absolute path of the current dir'''
    path = dirname(__file__)
    try:
        # keep everything before a ".." component, if present
        path = path[:path.index("..")]
    except ValueError:
        pass
    return pathjoin(path, suffix).replace('\\', '/')

#force utf8
import sys
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
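# (Python 2 only: setdefaultencoding was removed in Python 3, where UTF-8
# is already the default.)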


from hashlib import md5
def md5ize(s):
    '''return the md5 hex digest of a string'''
    return md5(s).hexdigest()
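
# In Python 3, hashlib expects bytes rather than str; a minimal variant
# (assuming UTF-8 input):
def md5ize_py3(s):
    return md5(s.encode('utf-8')).hexdigest()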


#integer file size to string
def format_size(size):
    if size < 1024:
        return "%s Byte" % size
    elif size < 1024*1024:
        size /= 1024
        return "%s Kb" % size
    elif size < 1024*1024*1024:
        size /= (1024*1024)
        return "%s Mb" % size
    else:
        size /= (1024*1024*1024)
        return "%s Gb" % size


#get parameter from url:
def get_V(p, regex=None):
    """return a cgi GET parameter, stripped of whitespace at both ends;
    if a verify pattern is provided, do a match test and only return
    matching values. Note: it uses re.match, not re.search.
    """
    import cgi
    form = cgi.FieldStorage()
    value = form.getfirst(p)

    if not value:
        return None
    value = value.strip()

    if regex is not None:
        import re
        if re.match(regex + "$", value):
            return value
        return None
    return value

# capture a download in memory (assumes c is an existing pycurl handle)
def stringio(c):
    import StringIO
    buffer = StringIO.StringIO()
    c.setopt(c.WRITEFUNCTION, buffer.write)
    c.perform()
    return buffer.getvalue()

def Do(cmdstr):
    from commands import getoutput
    try:
        return getoutput(cmdstr)
    except Exception, e:
        return str(e)
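
# In Python 3 the commands module is gone; subprocess provides the same
# helper:
def Do_py3(cmdstr):
    from subprocess import getoutput
    try:
        return getoutput(cmdstr)
    except Exception as e:
        return str(e)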

#sending email via the local shell.
def Notify(content, receiver):
    from commands import getoutput as Do
    if not content:
        return

    subject = "fanfou @ v2ex update"
    print content
    # note: the <<< here-string requires bash; getoutput runs /bin/sh
    command = """mail -a "From: no-reply@fanfou.com" -s "%s" "%s" <<< "%s" """ % (subject, receiver, content)

    try:
        Do(command)
    except Exception, e:
        print str(e), "error!"



import urllib

def getHeader(url):
    '''get header information of a url'''
    remotefile=urllib.urlopen(url)
    return remotefile.headers.dict

def getRemoteFileLength(url, unit='k'):
    '''get the length of a remote file, without downloading it.'''
    remotefile = urllib.urlopen(url)
    unit = unit.upper()
    units = {
        'B': 1,
        'K': 1024,
        'M': 1024*1024,
        'G': 1024*1024*1024,
    }

    try:
        length = remotefile.headers.dict['content-length']
    except KeyError:
        print 'no length info; loading the complete file to calculate its length'
        length = len(remotefile.read())

    reallen = float(length) / units[unit]

    formattedLength = "%.2f%s" % (reallen, unit)
    return formattedLength
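
# A HEAD request avoids transferring the body at all; a minimal Python 2
# sketch for plain http URLs (getRemoteFileLengthHEAD is a new helper, not
# part of the original):
import httplib, urlparse

def getRemoteFileLengthHEAD(url):
    '''content-length via a HEAD request; None if the header is missing'''
    parts = urlparse.urlparse(url)
    conn = httplib.HTTPConnection(parts.netloc)
    conn.request("HEAD", parts.path or "/")
    return conn.getresponse().getheader('content-length')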

#escaping html
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
    ">": "&gt;",
    "<": "&lt;",
    }

def html_escape(text):
    """Produce entities within text."""
    return "".join(html_escape_table.get(c,c) for c in text)
    
    
# enumerate gives you access to each element's index within a for loop.
>>> l = ['a','b','c','d','e','f']
>>> for (index,value) in enumerate(l):
...     print index, value
... 
0 a
1 b
2 c
3 d
4 e
5 f


# Using any and a generator:
if any(pred(x.item) for x in sequence):
    ...
# instead of code written like this:
found = False
for x in sequence:
    if pred(x.item):
        found = True
if found:
    ...
    
    
# Initializing a 2D list

While this can be done safely to initialize a list:

lst = [0] * 3
The same trick won’t work for a 2D list (list of lists):

>>> lst_2d = [[0] * 3] * 3
>>> lst_2d
[[0, 0, 0], [0, 0, 0], [0, 0, 0]]
>>> lst_2d[0][0] = 5
>>> lst_2d
[[5, 0, 0], [5, 0, 0], [5, 0, 0]]
The * operator repeats its operand, but the copies are references to the same inner list, so mutating one row mutates them all. The correct way to do this is:

>>> lst_2d = [[0] * 3 for i in xrange(3)]
>>> lst_2d
[[0, 0, 0], [0, 0, 0], [0, 0, 0]]
>>> lst_2d[0][0] = 5
>>> lst_2d
[[5, 0, 0], [0, 0, 0], [0, 0, 0]]


# Fire up a simple web server for files in the current directory:
python -m SimpleHTTPServer 8008
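
# In Python 3 the module was renamed:
python3 -m http.server 8008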


# zip(*iterable) transposes a nested iterable (rows become columns).
>>> a=[[1,2,3],[4,5,6]]
>>> zip(*a)
[(1, 4), (2, 5), (3, 6)]
#It's also useful with dicts.
>>> d={"a":1,"b":2,"c":3}
>>> zip(*d.iteritems())
[('a', 'c', 'b'), (1, 3, 2)]
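#Transposing twice round-trips, so the dict can be rebuilt:
>>> dict(zip(*zip(*d.iteritems())))
{'a': 1, 'c': 3, 'b': 2}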


# To flatten a list of lists, such as
[['a', 'b'], ['c'], ['d', 'e', 'f']]
#into
['a', 'b', 'c', 'd', 'e', 'f']
#use
[inner for outer in the_list for inner in outer]
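
# Equivalently, with itertools:
>>> import itertools
>>> the_list = [['a', 'b'], ['c'], ['d', 'e', 'f']]
>>> list(itertools.chain.from_iterable(the_list))
['a', 'b', 'c', 'd', 'e', 'f']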
        
        
# Suppose you have a list of items, and you want a dictionary with these items as the keys. Use fromkeys:
>>> items = ['a', 'b', 'c', 'd']
>>> idict = dict.fromkeys(items, 0)
>>> idict
{'a': 0, 'c': 0, 'b': 0, 'd': 0}
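
# Careful: fromkeys stores the *same* object under every key, so a mutable
# default is shared (the same pitfall as the 2D-list example above):
>>> d = dict.fromkeys(items, [])
>>> d['a'].append(1)
>>> d['b']
[1]
# A dict comprehension builds independent values instead:
>>> {k: [] for k in items}
{'a': [], 'c': [], 'b': [], 'd': []}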


# To find out if line is empty (i.e. either size 0 or contains only whitespace),
# use the string method strip in a condition, as follows:

if not line.strip():    # if line is empty
    continue            # skip it
  
  
# Emulating a switch statement. For example switch(x) {..}:
# (apply() is deprecated; calling the looked-up function directly is equivalent)
def a():
    print "a"
def b():
    print "b"
def default():
    print "default"
{1: a, 2: b}.get(x, default)()
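
# The handlers can take arguments too, since the lookup just returns a
# callable (a hypothetical extension of the snippet above; x selects the
# handler as before):
def double(n):
    return n * 2
def halve(n):
    return n // 2
print {1: double, 2: halve}.get(x, double)(10)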

# A Flask app that serves the classifiers trained below; the model objects
# (count_vect, tfidf_transformer, clf_nb, clf_sgd, clf_svm, category and
# the accuracies) are assumed to be in scope.
from flask import Flask, render_template, request, send_from_directory

app = Flask(__name__)

@app.route('/')
def index():
    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    q = [request.form.get('q', '')]
    nb = {}
    sgd = {}
    svm = {}

    X_new_counts = count_vect.transform(q)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    #1
    nb["predict"] = category[clf_nb.predict(X_new_tfidf)[0]].replace("_", " ")
    nb["accuracy"] = accuracy_nb
    #2
    sgd["predict"] = category[clf_sgd.predict(X_new_tfidf)[0]].replace("_", " ")
    sgd["accuracy"] = accuracy_sgd
    #3
    svm["predict"] = category[clf_svm.predict(X_new_tfidf)[0]].replace("_", " ")
    svm["accuracy"] = accuracy_svm

    return render_template('results.html', nb=nb, sgd=sgd, svm=svm)

@app.route('/js/<path:path>')
def js(path):
    return send_from_directory('csv-to-html-table/js', path)

@app.route('/csv/<path:path>')
def csv(path):
    return send_from_directory('', path)

@app.route('/css/<path:path>')
def css(path):
    return send_from_directory('csv-to-html-table/css', path)

@app.route('/fonts/<path:path>')
def fonts(path):
    return send_from_directory('csv-to-html-table/fonts', path)

if __name__ == '__main__':
    app.run(port=5000,host='0.0.0.0',debug=True)

# A simple classifier of music genres; Naive Bayes, SGD, and SVM are used.
import os
import re
import numpy as np
import pandas as pd
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import svm, metrics
from sklearn.externals import joblib

def classification_report_csv(report, label):
    report_data = []
    lines = report.split('\n')
    for line in lines[2:-3]:
        m = re.search(r' *(\w{4,10}) *(\d\.\d{2}) *(\d\.\d{2}) *(\d\.\d{2}) *(\d+)', line)
        if m is None:
            return
        row = {}
        row['class'] = m.group(1).replace("_", " ")
        row['precision'] = float(m.group(2))
        row['recall'] = float(m.group(3))
        row['f1_score'] = float(m.group(4))
        row['support'] = int(m.group(5))
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    dataframe.to_csv(label + '.csv', index=False)
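
# With scikit-learn >= 0.20 the regex parsing can be skipped entirely;
# a minimal sketch (assuming that version is available; "predictions"
# stands for the predicted labels):
# report_dict = metrics.classification_report(y_test, predictions, output_dict=True)
# pd.DataFrame(report_dict).transpose().to_csv(label + '.csv')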

#### Data pre-processing ####
#
category = ['bossa_nova','funk','gospel','sertanejo']

for c in category:
    i = 0
    for filename in os.listdir(os.path.join(os.path.dirname(__file__), 'data', c)):
        i += 1
    ## if this category has never been pre-processed
    if i <= 4:
        for filename in os.listdir(os.path.join(os.path.dirname(__file__), 'data', c)):
            # open each (huge) file
            f_in = open(os.path.join(os.path.dirname(__file__), 'data', c, filename), 'r')
            fileContent = f_in.read()
            f_in.close()
            # split it using a regex
            myregex = re.compile('\"\n^(" \n)', re.M)
            lyricList = myregex.split(fileContent)
            # save the chopped-up files
            j = 0
            for lyric in lyricList:
                lyric = lyric.replace("lyric", "", 3)
                lyric = lyric.replace('"', '', 3)
                if len(lyric) > 2:
                    j += 1
                    f = open("data/" + c + "/" + str(j) + ".txt", "w+")
                    f.write(lyric)
                    f.close()
            os.remove(os.path.join(os.path.dirname(__file__), 'data', c, filename))

#### Loading the files ####
# mydata = np.genfromtxt(filename, delimiter=",")
dataset = load_files('data', encoding='ISO-8859-1', load_content=True, categories=category)

# 80% train
test_size = 0.2
docs_train, docs_to_split, y_train, y_to_split = train_test_split(
    dataset.data, dataset.target, test_size = test_size, random_state=1)

# 10% test, 10% validation (half of the remaining 20%)
validation_size = 0.5
docs_test, docs_validation, y_test, y_validation = train_test_split(
    docs_to_split, y_to_split, test_size = validation_size, random_state=1)

# Tokenizer
count_vect = CountVectorizer()

# tf-idf approach
tfidf_transformer = TfidfTransformer()

#### Extracting features: training set ####
# Question: why must these tokenizers be fitted on the training set?
X_train_counts = count_vect.fit_transform(docs_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

#### Extracting features: validation set ####
X_validation_counts = count_vect.transform(docs_validation)
X_validation_tfidf = tfidf_transformer.transform(X_validation_counts)

#### Extracting features: test set ####
X_test_counts = count_vect.transform(docs_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

#### Training the models ####
print "Calibrating Naive Bayes..."
# Alternative 1: Naive Bayes
# Find the best alpha value
alpha_nb = 0
best_accuracy_nb = 0
for x in np.arange(0.1, 1.0, 0.3):
    clf_nb = MultinomialNB(alpha=x).fit(X_validation_tfidf, y_validation)
    
    predict_validation_nb = clf_nb.predict(X_validation_tfidf)
    accuracy_nb = np.mean(predict_validation_nb == y_validation)

    # if this is the best accuracy so far, keep this alpha
    if accuracy_nb > best_accuracy_nb:  
        alpha_nb = x
        best_accuracy_nb = accuracy_nb

# Retrain on the training set with the alpha found
clf_nb = MultinomialNB(alpha=alpha_nb).fit(X_train_tfidf, y_train)

#### Evaluating the algorithm ####
predict_test_nb = clf_nb.predict(X_test_tfidf)
accuracy_nb = np.mean(predict_test_nb == y_test)

report_nb = metrics.classification_report(y_test, predict_test_nb, target_names=category)
classification_report_csv(report_nb, "nb")
#print(metrics.confusion_matrix(y_test, predict_test_nb))

# Alternative 2: SGD
# #
# sklearn's SGDClassifier has many parameters:
# loss=hinge,
# penalty=l2
# alpha=0.0001
# l1_ratio=0.15
# fit_intercept=True
# max_iter=None
# tol=None
# shuffle=True
# verbose=0
# epsilon=0.1,
# n_jobs=1,
# random_state=None,
# learning_rate=optimal,
# eta0=0.0,
# power_t=0.5,
# class_weight=None,
# warm_start=False,
# average=False,
# n_iter=None
# # 
print "Calibrando SGD..."

# Finding the best loss value
best_accuracy_sgd = 0
loss = ["hinge", "log", "modified_huber", "squared_hinge", "perceptron", "squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"]
for l in loss:
    sgd = SGDClassifier(loss=l)
    clf_sgd = sgd.fit(X_validation_tfidf, y_validation)

    # if this is the best accuracy so far, keep this loss
    predict_validation_sgd = clf_sgd.predict(X_validation_tfidf)
    accuracy_sgd = np.mean(predict_validation_sgd == y_validation)
    
    if accuracy_sgd > best_accuracy_sgd:  
        loss_sgd = l
        best_accuracy_sgd = accuracy_sgd

# Find the best alpha value
alpha_sgd = 0
best_accuracy_sgd = 0
for x in np.arange(0.0001, 1.0, 0.3):
    sgd = SGDClassifier(alpha=x)
    clf_sgd = sgd.fit(X_validation_tfidf, y_validation)

    # if this is the best accuracy so far, keep this alpha
    predict_validation_sgd = clf_sgd.predict(X_validation_tfidf)
    accuracy_sgd = np.mean(predict_validation_sgd == y_validation)
    
    if accuracy_sgd > best_accuracy_sgd:  
        alpha_sgd = x
        best_accuracy_sgd = accuracy_sgd


# Finding the best penalty value
best_accuracy_sgd = 0
penalty = ["none", "l2", "l1", "elasticnet"]
for p in penalty:
    sgd = SGDClassifier(penalty=p)
    clf_sgd = sgd.fit(X_validation_tfidf, y_validation)
    # if this is the best accuracy so far, keep this penalty
    predict_validation_sgd = clf_sgd.predict(X_validation_tfidf)
    accuracy_sgd = np.mean(predict_validation_sgd == y_validation)
    
    if accuracy_sgd > best_accuracy_sgd:  
        penalty_sgd = p
        best_accuracy_sgd = accuracy_sgd

# Finding the best learning_rate value
best_accuracy_sgd = 0
learning_rate = ["constant","optimal","invscaling"]
for lr in learning_rate:
    eta0_sgd = 1  # guessing an eta0 value
    sgd = SGDClassifier(learning_rate=lr, eta0=eta0_sgd)
    clf_sgd = sgd.fit(X_validation_tfidf, y_validation)
    # if this is the best accuracy so far, save it
    predict_validation_sgd = clf_sgd.predict(X_validation_tfidf)
    accuracy_sgd = np.mean(predict_validation_sgd == y_validation)

    if accuracy_sgd > best_accuracy_sgd:  
        learning_rate_sgd = lr
        best_accuracy_sgd = accuracy_sgd
if learning_rate_sgd != "optimal":
    # need to tune a better eta0 value
    best_accuracy_sgd = 0
    for x in np.arange(0.01, 1.0, 0.3):
        sgd = SGDClassifier(learning_rate=learning_rate_sgd,
                            eta0=x)
        clf_sgd = sgd.fit(X_validation_tfidf, y_validation)

        # if this is the best accuracy so far, save it
        predict_validation_sgd = clf_sgd.predict(X_validation_tfidf)
        accuracy_sgd = np.mean(predict_validation_sgd == y_validation)
        
        if accuracy_sgd > best_accuracy_sgd:  
            eta0_sgd = x
            best_accuracy_sgd = accuracy_sgd

# Finding the best tol value (stopping criterion)
best_accuracy_sgd = 0
for x in np.arange(0.001, 2.1, 0.01):
    sgd = SGDClassifier(tol=x)
    clf_sgd = sgd.fit(X_validation_tfidf, y_validation)
    # if this is the best accuracy so far, save it
    predict_validation_sgd = clf_sgd.predict(X_validation_tfidf)
    accuracy_sgd = np.mean(predict_validation_sgd == y_validation)
    if accuracy_sgd > best_accuracy_sgd:
        tol_sgd = x
        best_accuracy_sgd = accuracy_sgd

# Searching for the best max_iter value does not make much difference
# best_accuracy_sgd = 0
# for x in range(5, 100, 1):
#     sgd = SGDClassifier(loss=loss_sgd, penalty=penalty_sgd,
#                         alpha=alpha_sgd, random_state=42,
#                         max_iter=x,learning_rate=learning_rate_sgd,
#                         eta0=eta0_sgd,tol=tol_sgd)
#     clf_sgd = sgd.fit(X_validation_tfidf, y_validation)
#     # if this is the best accuracy so far, save it
#     predict_validation_sgd = clf_sgd.predict(X_validation_tfidf)
#     accuracy_sgd = np.mean(predict_validation_sgd == y_validation)
#     if accuracy_sgd >= best_accuracy_sgd:
#         max_iter_sgd = x
#         best_accuracy_sgd = accuracy_sgd
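
# The one-parameter-at-a-time search above could also be done jointly with
# GridSearchCV; a minimal sketch (grid values chosen only for illustration):
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     "loss": ["hinge", "log", "modified_huber"],
#     "penalty": ["none", "l2", "l1", "elasticnet"],
#     "alpha": [0.0001, 0.001, 0.01, 0.1],
# }
# search = GridSearchCV(SGDClassifier(random_state=42), param_grid, cv=3)
# search.fit(X_validation_tfidf, y_validation)
# print search.best_params_, search.best_score_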

sgd = SGDClassifier(loss=loss_sgd,penalty=penalty_sgd,
                    alpha=alpha_sgd, random_state=42,
                    max_iter=5, tol=tol_sgd,
                    learning_rate=learning_rate_sgd, eta0=eta0_sgd)

clf_sgd = sgd.fit(X_train_tfidf, y_train)

#### Evaluating the algorithm ####
predict_test_sgd = clf_sgd.predict(X_test_tfidf)
accuracy_sgd = np.mean(predict_test_sgd == y_test)

report_sgd = metrics.classification_report(y_test, predict_test_sgd, target_names=category)
classification_report_csv(report_sgd, "sgd")
#print(metrics.confusion_matrix(y_test, predict_test_sgd))

# Alternative 3: SVM
# LinearSVC is another implementation of Support Vector Classification
# for the linear-kernel case.
print "Calibrating SVM..."
# Finding the best penalty value
best_accuracy_svm = 0
penalty = ["l2", "l1"]
for p in penalty:
    svm_lin = svm.LinearSVC(penalty=p, dual=False)
    clf_svm = svm_lin.fit(X_validation_tfidf, y_validation)
    # if this is the best accuracy so far, keep this penalty
    predict_validation_svm = clf_svm.predict(X_validation_tfidf)
    accuracy_svm = np.mean(predict_validation_svm == y_validation)
    
    if accuracy_svm > best_accuracy_svm:  
        penalty_svm = p
        best_accuracy_svm = accuracy_svm

best_accuracy_svm = 0
loss = ["hinge", "squared_hinge"]
for l in loss:
    svm_lin = svm.LinearSVC(loss=l)
    clf_svm = svm_lin.fit(X_validation_tfidf, y_validation)
    # if this is the best accuracy so far, keep this loss
    predict_validation_svm = clf_svm.predict(X_validation_tfidf)
    accuracy_svm = np.mean(predict_validation_svm == y_validation)
    
    if accuracy_svm > best_accuracy_svm:
        loss_svm = l
        best_accuracy_svm = accuracy_svm

best_accuracy_svm = 0
multi_class = ["ovr", "crammer_singer"]
for mc in multi_class:
    svm_lin = svm.LinearSVC(multi_class=mc)
    clf_svm = svm_lin.fit(X_validation_tfidf, y_validation)
    # if this is the best accuracy so far, keep this multi_class
    predict_validation_svm = clf_svm.predict(X_validation_tfidf)
    accuracy_svm = np.mean(predict_validation_svm == y_validation)
    
    if accuracy_svm > best_accuracy_svm:
        mc_svm = mc
        best_accuracy_svm = accuracy_svm

best_accuracy_svm = 0
dual = [True, False]
for d in dual:
    svm_lin = svm.LinearSVC(dual=d)
    clf_svm = svm_lin.fit(X_validation_tfidf, y_validation)
    # if this is the best accuracy so far, save it
    predict_validation_svm = clf_svm.predict(X_validation_tfidf)
    accuracy_svm = np.mean(predict_validation_svm == y_validation)
    
    if accuracy_svm > best_accuracy_svm:
        dual_svm = d
        best_accuracy_svm = accuracy_svm

clf_svm = svm.LinearSVC(dual=dual_svm, loss=loss_svm,
     multi_class=mc_svm, penalty=penalty_svm, verbose=0, max_iter=3000)
clf_svm.fit(X_train_tfidf, y_train) 

predict_test_svm = clf_svm.predict(X_test_tfidf)
accuracy_svm = np.mean(predict_test_svm == y_test)

report_svm = metrics.classification_report(y_test, predict_test_svm, target_names=category)
classification_report_csv(report_svm, "svm")

#### Saving the models ####
joblib.dump(clf_nb, 'model_nb.pkl') 
joblib.dump(clf_sgd, 'model_sgd.pkl') 
joblib.dump(clf_svm, 'model_svm.pkl')
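
# The Flask app above presumably restores these at startup; a minimal sketch
# (assuming the fitted vectorizers are persisted alongside the models):
# joblib.dump(count_vect, 'count_vect.pkl')
# joblib.dump(tfidf_transformer, 'tfidf_transformer.pkl')
# clf_nb = joblib.load('model_nb.pkl')
# count_vect = joblib.load('count_vect.pkl')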