NLTK package
import nltk
md = nltk.corpus.gutenberg.words("melville-moby_dick.txt")
md.count("boat") #counts the number of times "boat" appears in md
md_sents = nltk.corpus.gutenberg.sents("melville-moby_dick.txt") #sentences as lists of word tokens
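#a quick sketch of poking at the corpus objects above (outputs shown are examples, not exact)
nltk.corpus.gutenberg.fileids() #all Gutenberg texts bundled with NLTK, e.g. 'austen-emma.txt', 'melville-moby_dick.txt'
len(md) #total number of word tokens in Moby Dick
md_sents[0] #the first sentence, as a list of word tokens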
#frequency distribution
fd = nltk.FreqDist(md)
fd.most_common(100) #the 100 most common words in md
#conditional frequency distribution: counts the second element of each pair under its condition (the first element)
names = [("Group A", "Paul"), ("Group A", "Mike"), ("Group A", "Katy"), ("Group B", "Amy"), ("Group B", "Joe"), ("Group B", "Amy")]
nltk.ConditionalFreqDist(names)
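#what ConditionalFreqDist gives back: one FreqDist per condition (a sketch using the names list above)
cfd = nltk.ConditionalFreqDist(names)
cfd.conditions() #['Group A', 'Group B']
cfd["Group B"]["Amy"] #2, since "Amy" appears twice under "Group B"
cfd.tabulate() #prints a condition-by-value table of counts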
#tokenization (raw is any plain Python string of text)
tokens = nltk.word_tokenize(raw) #list of word tokens
token_sent = nltk.sent_tokenize(raw) #list of sentence strings
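#a quick sketch on a sample string (may require nltk.download("punkt") for the tokenizer models)
sample = "NLTK is a Python library. It ships with many corpora."
nltk.sent_tokenize(sample) #['NLTK is a Python library.', 'It ships with many corpora.']
nltk.word_tokenize(sample) #['NLTK', 'is', 'a', 'Python', 'library', '.', 'It', ...]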
nltk.bigrams(tokens) #iterator over adjacent word pairs
nltk.trigrams(tokens) #iterator over adjacent word triples
nltk.ngrams(tokens, 2) #equivalent to bigrams
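#these return iterators, so wrap them in list() to inspect the n-grams
list(nltk.bigrams(["the", "white", "whale"])) #[('the', 'white'), ('white', 'whale')]
list(nltk.ngrams(["the", "white", "whale"], 3)) #[('the', 'white', 'whale')]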
#stemming and lemmatization
nltk.PorterStemmer().stem("cats") #gives "cat"
nltk.LancasterStemmer().stem("cats") #gives "cat"
nltk.WordNetLemmatizer().lemmatize("cats") #gives "cat" (requires nltk.download("wordnet"))
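#the stemmers and the lemmatizer can disagree; a small comparison sketch (typical outputs shown)
nltk.PorterStemmer().stem("running") #'run'
nltk.LancasterStemmer().stem("maximum") #'maxim' (Lancaster is the more aggressive stemmer)
nltk.WordNetLemmatizer().lemmatize("running") #'running' (nouns are assumed by default)
nltk.WordNetLemmatizer().lemmatize("running", pos="v") #'run' (lemmatize as a verb)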
sent_tag = nltk.pos_tag(tokens) #part-of-speech tag for each token (noun, verb, etc.)
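#example of the (token, tag) pairs pos_tag returns (may require nltk.download("averaged_perceptron_tagger"))
nltk.pos_tag(["Herman", "Melville", "wrote", "a", "novel"])
#e.g. [('Herman', 'NNP'), ('Melville', 'NNP'), ('wrote', 'VBD'), ('a', 'DT'), ('novel', 'NN')]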
#Through chunking, we can prevent two-word (or longer) entities from being split apart.
sequence = '''
CHUNK: {<NNP>+} #chunk one or more proper nouns
       {<NN>+} #or one or more common nouns
'''
nltk.RegexpParser(sequence).parse(sent_tag)
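#parse() returns an nltk.Tree; a sketch of pulling out just the CHUNK subtrees
tree = nltk.RegexpParser(sequence).parse(sent_tag)
for subtree in tree.subtrees(filter=lambda t: t.label() == "CHUNK"):
    print(" ".join(word for word, tag in subtree.leaves()))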
nltk.corpus.words.words() #wordlist
nltk.corpus.stopwords.words("english") #stopwords
#remove stopwords from a word list (common is assumed to be a list of words, e.g. the most common words above)
list(set(common) - set(nltk.corpus.stopwords.words("english")))
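#a minimal end-to-end sketch, assuming fd is the FreqDist built above: keep frequent Moby Dick words that are not stopwords
common = [w.lower() for w, _ in fd.most_common(200) if w.isalpha()]
stop = set(nltk.corpus.stopwords.words("english"))
content_words = [w for w in common if w not in stop]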