jmquintana79
4/25/2017 - 8:40 AM

Get tags collecting only one type of word

Extract the most frequent words of a text as tags, keeping only one type of word (part of speech; nouns by default).

def _count_tagged_lemmas(text, stype):
    """Count the lemmas of *text* and keep those whose POS tag contains *stype*.

    Returns a list of ``(lemma, count)`` tuples ordered by descending count,
    as produced by ``nltk.FreqDist.most_common``.
    """
    # NOTE: the lemmatizer and the tagger rely on NLTK data packages
    # (WordNet, POS tagger model) being installed -- see the NLTK docs.
    import nltk

    # input: lower-case and flatten line breaks
    sinput = text.lower().replace('\n', ' ')

    ## TOKENIZE, SINGULAR/PLURAL VALIDATION and CALCULATE common index
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokenizer.tokenize(sinput)]
    common = nltk.FreqDist(lemmas).most_common()

    ## FILTER BY TYPE OF WORD for len(word)>1
    # The length test must look at the word itself, not at the
    # (word, count) tuple, whose length is always 2.
    candidates = [word for word, _ in common if len(word) > 1]
    if not candidates:
        return []
    tagged = nltk.pos_tag(candidates)
    # substring match on the tag, so 'NN' also selects 'NNS', 'NNP', ...
    selected = {word for word, tag in tagged if stype in tag}
    return [(word, count) for word, count in common if word in selected]


def get_tags(text, limit_common, stype='NN'):
    """Return the most frequent words of *text* restricted to one word type.

    The text is lower-cased, split into alphanumeric tokens, lemmatized and
    counted. The distinct lemmas longer than one character are POS-tagged
    with ``nltk.pos_tag`` and only those whose tag contains ``stype`` are kept.

    Args:
        text: input text. Empty, blank or ``None`` input yields an empty frame.
        limit_common: maximum number of rows to return (applied as a slice).
        stype: substring looked up in each POS tag; the default ``'NN'``
            keeps every tag containing ``NN``.

    Returns:
        pandas.DataFrame with columns ``'tag'`` (the lemma) and ``'rank'``
        (its integer occurrence count), sorted by ``'rank'`` descending with
        a fresh 0-based index, truncated to the first ``limit_common`` rows.
    """
    import pandas as pd

    # Empty input has nothing to tag; skip the NLTK pipeline altogether.
    common = _count_tagged_lemmas(text, stype) if text and text.strip() else []

    ## store into df and return
    # Explicit column names keep the frame well-formed even when no word
    # survived the filtering.
    final = pd.DataFrame(common, columns=['tag', 'rank'])
    final['rank'] = final['rank'].astype(int)
    # stable sort: words with equal counts keep their frequency-list order
    final = final.sort_values('rank', ascending=False, kind='mergesort')
    final = final.reset_index(drop=True)
    return final[:limit_common]