from string import punctuation
doc1Txt = 'some text here'
# remove numeric digits
txt = ''.join(c for c in doc1Txt if not c.isdigit())
# remove punctuation and make lower case
txt = ''.join(c for c in txt if c not in punctuation).lower()
import nltk
import pandas as pd
from nltk.probability import FreqDist
nltk.download("punkt")
# tokenize the text into individual words
words = nltk.tokenize.word_tokenize(txt)
# get the frequency distribution of the words
fdist = FreqDist(words)
count_frame = pd.DataFrame(fdist, index=[0]).T
count_frame.columns = ['Count']
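# illustrative only: one way to inspect the result is to sort the frame by count
# (the head(10) cut-off is an arbitrary choice for a quick look)
print(count_frame.sort_values('Count', ascending=False).head(10))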
# remove stopwords
nltk.download("stopwords")
from nltk.corpus import stopwords
txt = ' '.join([word for word in txt.split() if word not in (stopwords.words("english"))])
# frequency distribution of remaining words
words = nltk.tokenize.word_tokenize(txt)
fdist = FreqDist(words)
count_frame = pd.DataFrame(fdist, index=[0]).T
count_frame.columns = ['Count']
# term frequency
# functions to work with a text
!pip install -U textblob
import math
from textblob import TextBlob as tb
def tf(word, doc):
    # term frequency: proportion of the document's words that match 'word'
    return doc.words.count(word) / len(doc.words)

def contains(word, docs):
    # number of documents in the collection that contain 'word'
    return sum(1 for doc in docs if word in doc.words)

def idf(word, docs):
    # inverse document frequency; the +1 avoids division by zero
    return math.log(len(docs) / (1 + contains(word, docs)))

def tfidf(word, doc, docs):
    return tf(word, doc) * idf(word, docs)
# create TextBlob documents (txt2 and txt3 are the other two documents,
# normalised in the same way as txt)
doc1 = tb(txt)
doc2 = tb(txt2)
doc3 = tb(txt3)
docs = [doc1, doc2, doc3]
for i, doc in enumerate(docs):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, doc, docs) for word in doc.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
# stemming (identify words with a common root)
# txt - normalised text
# calculate frequency distribution (as above)
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
stems = [ps.stem(word) for word in words]
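# as above, get the frequency distribution of the stems
# (a minimal sketch; assumes 'words' still holds the tokens produced earlier)
stem_fdist = FreqDist(stems)
stem_frame = pd.DataFrame(stem_fdist, index=[0]).T
stem_frame.columns = ['Count']
print(stem_frame.sort_values('Count', ascending=False).head(10))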