# Get tags in a text, selecting only one type of word
import nltk
import numpy as np
import pandas as pd
import copy
def get_tags_text(text: str, limit_common: int, stype: str = 'NN'):
    """Return the most frequent words of one word type found in *text*.

    The text is lower-cased, newlines are replaced by spaces, and the
    result is split into runs of word characters. Each token is lemmatized
    with the WordNet lemmatizer and the lemmas are counted. The distinct
    lemmas longer than one character are then POS-tagged with
    ``nltk.pos_tag`` and only those whose tag contains *stype* are kept.

    Note that tagging is done on the list of distinct lemmas (ordered by
    frequency), not on the original sentences.

    Args:
        text: Raw input text. Empty or whitespace-only text yields an
            empty result.
        limit_common: Maximum number of rows to return.
        stype: Substring searched for in each POS tag. Because this is a
            substring match, ``'NN'`` also selects tags such as ``'NNS'``.

    Returns:
        A ``pandas.DataFrame`` with columns ``'tag'`` (the lemma) and
        ``'rank'`` (its integer occurrence count), sorted by descending
        ``'rank'`` with a fresh 0-based index, truncated to
        *limit_common* rows.
    """
    columns = ['tag', 'rank']

    # Nothing to tokenize: hand back an empty, correctly-typed frame.
    if not text or not text.strip():
        return pd.DataFrame(columns=columns).astype({'rank': int})

    # Strings are immutable, so no defensive copy is needed here.
    normalized = text.lower().replace('\n', ' ')

    # Tokenize, fold singular/plural forms together, and count lemmas.
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokenizer.tokenize(normalized)]
    # most_common() without an argument returns every (lemma, count) pair,
    # ordered from most to least frequent.
    common = nltk.FreqDist(lemmas).most_common()

    # Tag only words longer than one character; the length check applies
    # to the word itself, not to the (word, count) tuple.
    words = [word for word, _ in common if len(word) > 1]
    tagged = nltk.pos_tag(words) if words else []

    # A set gives O(1) membership tests when filtering the counts below.
    selected = {word for word, tag in tagged if stype in tag}
    rows = [(word, count) for word, count in common if word in selected]

    # Passing the column names up front also works when `rows` is empty.
    final = pd.DataFrame(rows, columns=columns)
    final['rank'] = final['rank'].astype(int)
    # Stable sort so that ties keep the order produced by most_common().
    final = final.sort_values('rank', ascending=False, kind='mergesort')
    final = final.reset_index(drop=True)

    return final.iloc[:limit_common]
# Demo: print the five most frequent 'NN' tags of a sample sentence.
if __name__ == "__main__":
    sample = """It is very interesting the possibility to extrack words filtering by type. Give me a lot of possibilites. """
    top_tags = get_tags_text(sample, 5, stype='NN')
    print(top_tags)