jmquintana79
4/25/2017 - 8:40 AM

Word type selector

Extract tags from a text, keeping only words of one grammatical type (e.g. nouns)

import nltk
import numpy as np
import pandas as pd
import copy

def get_tags_text(text:str, limit_common:int, stype:str = 'NN') -> pd.DataFrame:
  """Return the most frequent words of one grammatical type found in *text*.

  The text is lower-cased, split into ``\\w+`` tokens, lemmatized with the
  WordNet lemmatizer and counted. The distinct lemmas longer than one
  character are then POS-tagged, and only those whose tag contains *stype*
  are kept.

  Args:
    text: raw input text; newlines are treated as spaces.
    limit_common: maximum number of rows to return (applied as a slice).
    stype: POS-tag fragment to keep. It is matched as a substring of the
      tag, so 'NN' also accepts tags such as 'NNS'.

  Returns:
    DataFrame with columns 'tag' (the lemma) and 'rank' (its number of
    occurrences, as int), ordered from most to least frequent. The frame
    is empty (same columns) when the text is blank or nothing matches.

  NOTE: the lemmas are tagged as one frequency-ordered list, not in their
  original sentence context. The NLTK calls need the corresponding NLTK
  data packages (WordNet, POS tagger) to be available.
  """
  columns = ['tag', 'rank']

  # input (str methods already return new strings, no defensive copy needed)
  sinput = text.lower().replace('\n', ' ')
  if not sinput.strip():
    # nothing to tokenize: skip the NLTK calls and return an empty typed frame
    return pd.DataFrame([], columns=columns).astype({'rank': int})

  ## TOKENIZE, SINGULAR/PLURAL VALIDATION and CALCULATE common index
  tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
  lemmatizer = nltk.stem.WordNetLemmatizer()
  lemmas = [lemmatizer.lemmatize(token) for token in tokenizer.tokenize(sinput)]
  # most_common() without argument returns every (lemma, count) pair,
  # already ordered from most to least frequent
  common = nltk.FreqDist(lemmas).most_common()

  ## FILTER BY TYPE OF WORD for len(word)>1
  # test the length of the word itself, not of the (word, count) tuple
  words = [word for word, _ in common if len(word) > 1]
  tagged = nltk.pos_tag(words) if words else []
  # set gives O(1) membership when filtering the frequency list below
  selected = {word for word, tag in tagged if stype in tag}
  # update common (order of most_common is preserved, so no re-sort is needed)
  common = [(word, count) for word, count in common if word in selected]

  ## store into df and return
  # passing the column names to the constructor also works for an empty list
  FINAL = pd.DataFrame(common, columns=columns).astype({'rank': int})
  return FINAL[:limit_common]
  
# demo: print the five most frequent nouns of a sample sentence
if __name__ == "__main__":
  sample_text = """It is very interesting the possibility to extrack words filtering by type. Give me a lot of possibilites. """
  top_nouns = get_tags_text(sample_text, 5, stype='NN')
  print(top_nouns)