Ajasra
11/5/2019 - 2:34 PM

text processing

from string import punctuation

doc1Txt = 'some text here'

# remove numeric digits
txt = ''.join(c for c in doc1Txt if not c.isdigit())

# remove punctuation and make lower case
txt = ''.join(c for c in txt if c not in punctuation).lower()
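a quick check of the two cleaning steps on a made-up sample string:

sample = 'Sample TEXT, with 42 digits!'
cleaned = ''.join(c for c in sample if not c.isdigit())                # digits gone
cleaned = ''.join(c for c in cleaned if c not in punctuation).lower()  # punctuation gone, lower case
print(cleaned)  # 'sample text with  digits'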

import nltk
import pandas as pd
from nltk.probability import FreqDist
nltk.download("punkt")

# tokenize the text into individual words
words = nltk.tokenize.word_tokenize(txt)

# get the frequency distribution of the words
fdist = FreqDist(words)
count_frame = pd.DataFrame(fdist, index=[0]).T
count_frame.columns = ['Count']
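to eyeball the result, sort the frame by the Count column (top 10 is an arbitrary cut-off):

print(count_frame.sort_values('Count', ascending=False).head(10))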

# remove stopwords
nltk.download("stopwords")

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
txt = ' '.join(word for word in txt.split() if word not in stop_words)
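a quick sanity check that the filter worked, using the stop_words set from above:

assert not any(word in stop_words for word in txt.split())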

# frequency distribution of remaining words
words = nltk.tokenize.word_tokenize(txt)
fdist = FreqDist(words)
count_frame = pd.DataFrame(fdist, index=[0]).T
count_frame.columns = ['Count']

# term frequency
# functions to work with a text
!pip install -U textblob

import math
from textblob import TextBlob as tb

def tf(word, doc):
  # term frequency: how often the word appears in the document
  return doc.words.count(word) / len(doc.words)

def contains(word, docs):
  # number of documents that contain the word
  return sum(1 for doc in docs if word in doc.words)

def idf(word, docs):
  # inverse document frequency; the +1 avoids division by zero for unseen words
  return math.log(len(docs) / (1 + contains(word, docs)))

def tfidf(word, doc, docs):
  return tf(word, doc) * idf(word, docs)
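a tiny worked example with three made-up blobs, to sanity-check the +1 smoothing: 'cat' appears in 1 of 3 docs, so idf = log(3 / (1 + 1)) ≈ 0.405

demo_docs = [tb('the cat sat'), tb('a dog ran'), tb('birds fly')]
print(idf('cat', demo_docs))  # ~0.405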
  
# txt2 and txt3 are assumed to be two more documents normalised the same way as txt
doc1 = tb(txt)
doc2 = tb(txt2)
doc3 = tb(txt3)
docs = [doc1, doc2, doc3]

for i, doc in enumerate(docs):
  print("Top words in document {}".format(i + 1))
  scores = {word: tfidf(word, doc, docs) for word in doc.words}
  sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
  for word, score in sorted_words[:3]:
    print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
    
    
# stemming (identify words with a common root)
# txt - normalised text
# calculate frequency distribution (as above)

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
stems = [ps.stem(word) for word in words]
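the note above says to redo the frequency distribution on the stems, so a minimal sketch mirroring the earlier cells:

fdist = FreqDist(stems)
count_frame = pd.DataFrame(fdist, index=[0]).T
count_frame.columns = ['Count']
print(count_frame.sort_values('Count', ascending=False).head(10))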