# nltk_book.py
import nltk
from nltk.corpus import brown as bw
from nltk.corpus import gutenberg as gt
import re
# 3- Tokenize and tag a sentence
sent = "They wind back the clock, while we chase after the wind"
tokens = nltk.word_tokenize(sent)
pos = nltk.pos_tag(tokens)
# 5- Creating and updating dict(). What happens if I look up a key that's not in dict?
tel = dict()
tel.update({'cat':'jengibre', 'dog': 'mateo'})
# -> print(tel['xyz']) KeyError.
# 6- Deleting items from dict()
tel = dict()
tel.update({'cat':'jengibre', 'dog': 'mateo'})
del tel['cat']
# -> {'dog': 'mateo'}
# 7- Merging two dictionaries with d1.update(d2).
tel = dict()
loo = dict()
tel.update({'cat':'jengibre', 'dog': 'mateo'})
loo.update({'mouse':'spleen', 'duck':'boxer'})
tel.update(loo)
print(loo, tel)
# ->{'mouse': 'spleen', 'cat': 'jengibre', 'dog': 'mateo', 'duck': 'boxer'}
# 8- Create a dict() e.
e = {'headword':'head', 'pos':'noun', 'example':'my head hurts'}
# 10- Train a UnigramTagger and use it on some new text
bw_tagged = bw.tagged_sents(categories='news') # For training
bw_sent = bw.sents(categories='news') # For testing the tagger on the training data
gt_text = gt.sents('carroll-alice.txt') # New text
uni_tagger = nltk.UnigramTagger(bw_tagged) # Training the tagger
bw_gettaged = uni_tagger.tag(bw_sent[98]) # Testing on a sentence of the trained data (shouldn't be done)
gt_gettaged = uni_tagger.tag(gt_text[101]) # Testing it on a new text
# -> [('Oh', None), ('dear', None), (',', ','), ('what', 'WDT'), ('nonsense', 'NN'),
# ('I', 'PPSS'), ("'", "'"), ('m', None), ('talking', 'VBG'), ("!'", None)]. These words were
# tagged as None because they never came up during training, so the tagger has no
# information about how to tag them.
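# A minimal follow-up sketch (my addition, not part of the exercise): give the tagger a
# DefaultTagger backoff so unseen words get a default 'NN' tag instead of None.
uni_tagger_bk = nltk.UnigramTagger(bw_tagged, backoff=nltk.DefaultTagger('NN'))
print(uni_tagger_bk.tag(gt_text[101]))  # the previously untagged words now fall back to 'NN'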
# 12- Train a bigram tagger without backoff tagger and run it on training data:
bw_new_text = bw.sents(categories='fiction') # new text to test tagger
bw_tagged = bw.tagged_sents(categories='news') # For training
bw_sent = bw.sents(categories='news') # For testing the tagger
bi_tagger = nltk.BigramTagger(bw_tagged) # Training the tagger
bw_gettaged = bi_tagger.tag(bw_sent[107]) # Testing on a sentence of the trained data (shouldn't be done)
bw_new_gettaged = bi_tagger.tag(bw_new_text[100]) # Testing it on a new text
print(bw_new_gettaged)
# -> ~0.78 accuracy when scored on the training data, but without a backoff this tagger
# fails on new data: many words in the new text are tagged as None. Notice that the bigram
# tagger manages to tag every word in a sentence it saw during training, but does badly on
# an unseen sentence. As soon as it encounters a new word (e.g., 13.5) it is unable to
# assign a tag. It then cannot tag the following word (e.g., million) either, even though
# that word was seen during training, simply because it never saw it preceded by a
# None-tagged word during training. Consequently, the tagger fails to tag the rest of
# the sentence.
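# Hedged sketch of the usual remedy (my addition): chain the bigram tagger to a unigram
# tagger (and a default tagger) so unseen contexts fall back instead of yielding None.
uni_backoff = nltk.UnigramTagger(bw_tagged, backoff=nltk.DefaultTagger('NN'))
bi_backoff = nltk.BigramTagger(bw_tagged, backoff=uni_backoff)
print(bi_backoff.tag(bw_new_text[100]))  # no more None tags on the unseen sentence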
# 16- Model of a lookup tagger:
words = bw.words(categories="news")
tagged_words = bw.tagged_words(categories="news")
most_common = nltk.FreqDist(words).most_common(100)
cfd = nltk.ConditionalFreqDist(tagged_words)
most_likely_tag = dict((word, cfd[word].max()) for word,_ in most_common)
tagger = nltk.UnigramTagger(model=most_likely_tag, backoff=nltk.DefaultTagger('NN'))
# -> What if the backoff is omitted? The words that didn't come up during training are
# tagged as None, since there is no general model the tagger can fall back on when it
# sees a new word.
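# Quick check of the claim above (my addition, reusing most_likely_tag from exercise 16):
tagger_no_backoff = nltk.UnigramTagger(model=most_likely_tag)
print(tagger_no_backoff.tag(['the', 'aardvark']))  # likely [('the', 'AT'), ('aardvark', None)]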
# 19 -> (a) We train a UnigramTagger by specifying tagged sentence data as a
# parameter when we initialize the tagger. The training process involves inspecting
# the tag of each word and storing the most likely tag for any word in a dictionary
# that is stored inside the tagger. The evaluate() method must be given a held-out test
# set as a parameter, otherwise the tagger gets an inflated, near-perfect score.
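# Hedged sketch of the point above, reusing the Brown news sentences from exercise 12:
# score the tagger on held-out data and compare with the (inflated) training-data score.
size = int(len(bw_tagged) * 0.9)
train_part, held_out = bw_tagged[:size], bw_tagged[size:]
uni_check = nltk.UnigramTagger(train_part)
print(uni_check.evaluate(held_out))    # realistic accuracy on unseen sentences
print(uni_check.evaluate(train_part))  # much higher score when evaluated on its own training data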
# 22 -> Create a RegexpTagger:
patterns = [
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # simple past
(r'.*es$', 'VBZ'), # 3rd singular present
]
regexp_tagger = nltk.RegexpTagger(patterns)
tagging = regexp_tagger.tag(bw.sents()[5])
regexp_tagger.evaluate(bw.tagged_sents())
# Create a tagger workflow including backoff
alice = gt.words('carroll-alice.txt') # Text for final test
bw_words = bw.words()
bw_tagged_wd = bw.tagged_words()
bw_tagged_st = bw.tagged_sents()
# size_words = int(len(bw_tagged_wd) * 0.9)
# training_wd = bw_tagged_wd[:size_words]
# test_wd = bw_tagged_wd[size_words:]
# Separating train from test data
size_sents = int(len(bw_tagged_st) * 0.9)
training_sents = bw_tagged_st[:size_sents]
test_sents = bw_tagged_st[size_sents:]
# Regexp Tagger
patterns = [
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # simple past
(r'.*es$', 'VBZ'), # 3rd singular present
]
t0 = nltk.RegexpTagger(patterns)
# Lookup Tagger
fd = nltk.FreqDist(bw_words)
cfd = nltk.ConditionalFreqDist(bw_tagged_wd)
most_common = fd.most_common(100)
most_likely_tag = dict((word, cfd[word].max()) for word,_ in most_common)
t1 = nltk.UnigramTagger(model=most_likely_tag, backoff=t0)
# Unigram Tagger
t2 = nltk.UnigramTagger(training_sents, backoff=t1)
# Bigram Tagger
t3 = nltk.BigramTagger(training_sents, cutoff=2, backoff=t2) # 'cutoff' will discard contexts that have only been seen once or twice.
# Trigram Tagger
t4 = nltk.TrigramTagger(training_sents, cutoff=2, backoff=t3)
# Trying it on an unseen text
tagged_text = t4.tag(alice)
print(tagged_text)
import nltk
import requests
from nltk.corpus import gutenberg as gut
from bs4 import BeautifulSoup as bs
# 1- Creating a string 's' and changing it using only slice and concatenation:
s = 'colorless'
index = s.find('rless')
output_line = s[:index] + 'u' + s[index:]
print(output_line)
# 2- Removing morphological endings through slice notation
words = ['dishes', 'running', 'nationality', 'undo', 'preheat']
dish = words[0][:-2]
run = words[1][:-4]
nation = words[2][:-5]
do = words[3][-2:]
heat = words[4][3:]
print(dish,run,nation,do,heat)
# 3- Is it possible to create an index going beyond the start of the string?
word = 'dishes'
dish = word[-10:]
# -> Yes, and it doesn't produce any error: the slice is simply clipped to the start
# of the string, so the whole string 'dishes' comes back.
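# Follow-up demonstration (my addition): the clipping only applies to slices;
# an out-of-range *index* still raises an IndexError.
print(word[-10:])   # -> 'dishes'
# print(word[-10])  # IndexError: string index out of range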
# 4- The step size of the slice can also be negative. What happens in this case?
monty = 'Monty Python'
print(monty[::-2])
# -> It steps through the string backwards, taking every second character: 'nhy to'
# 5- What happens if you ask the interpreter to evaluate monty[::-1]?
monty = 'Monty Python'
print(monty[::-1])
# -> nohtyP ytnoM
# 6- Describe the class of strings matched by the following regular expressions:
# [a-zA-Z]+                  -> one or more ASCII letters, i.e. a purely alphabetic word
# [A-Z][a-z]*                -> an uppercase letter followed by zero or more lowercase
#                               letters, e.g. a capitalized word like 'Ascii'
# p[aeiou]{,2}t              -> 'p', then at most two vowels, then 't', e.g. 'pt', 'pat', 'peat'
# \d+(\.\d+)?                -> an integer or decimal number, e.g. '12' or '17.2'
# ([^aeiou][aeiou][^aeiou])* -> zero or more non-vowel/vowel/non-vowel triples, e.g. 'bat'
# \w+|[^\w\s]+               -> a run of word characters (a word) or a run of characters
#                               that are neither word characters nor whitespace (punctuation)
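# A few quick spot-checks of the patterns above with re.findall (example strings are my own;
# the group in the number pattern is made non-capturing so findall returns whole matches):
print(re.findall(r'p[aeiou]{,2}t', 'pt pat peat pout sprout'))            # -> ['pt', 'pat', 'peat', 'pout']
print(re.findall(r'\d+(?:\.\d+)?', 'pi is 3.14159, about 3; e is 2.72'))  # -> ['3.14159', '3', '2.72']
print(re.findall(r'\w+|[^\w\s]+', "Why, that's 'nonsense'!"))             # -> ['Why', ',', 'that', "'", 's', "'", 'nonsense', "'!"]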
# 8 Write a function that takes a URL as its argument and returns the content of
# the URL without the HTML markup.
def html_webpage(link):
    response = requests.get(link)
    soup = bs(response.content, "lxml")
    text = soup.get_text()
    text = re.sub(r'<.*?>', ' ', text)  # strip any leftover HTML tags
    text = re.sub(r'\s+', ' ', text)    # normalize whitespace
    return text
print(html_webpage("http://www.presidentialrhetoric.com/historicspeeches/index.html"))
# 9- Define a function load(f) to load a file from your computer
def load(f):
    # The old "rU" mode is gone in Python 3; universal newlines are the default.
    with open(f, "r", encoding="utf-8") as file_in:
        return file_in.readlines()
# 10- Rewrite the following loop as a list comprehension
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
result = []
for word in sent:
    word_len = (word, len(word))
    result.append(word_len)
print(result)
# As a list comprehension:
result = [(word, len(word)) for word in sent]
# 11- Define a string and split it on the character 's'
raw = 'Write regular expressions to match the following classes of strings'.split('s')
print(raw)
#12 Write a for loop to print out all the characters of a string. One per line
raw = 'Write regular expressions to match the following classes of strings'
for i in raw:
    print(i)
#13- What's the difference between calling split() on a string with no argument and
# with ' '?
raw = 'Write regular expressions to match the following classes of strings'.split()
raw1 = 'Write regular expressions to match the following classes "\n" of strings'.split(' ')
#-> ['Write', 'regular', 'expressions', 'to', 'match', 'the', 'following', 'classes', 'of', 'strings']
#-> ['Write', 'regular', 'expressions', 'to', 'match', 'the', 'following', 'classes', '"\n"', 'of', 'strings']
# split() with no argument splits on any run of whitespace (spaces, tabs, newlines) and drops
# empty strings; split(' ') splits only on single space characters, so the newline stays inside
# its token and consecutive spaces would produce empty strings.
#14- Difference between sort and sorted in Python:
# list.sort() sorts the list in place and returns None.
# sorted(list) returns a new sorted list and leaves the original list unchanged.
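# Small demonstration of the difference (the example list is my own):
nums = [3, 1, 2]
print(sorted(nums), nums)  # -> [1, 2, 3] [3, 1, 2]   (original list unchanged)
print(nums.sort(), nums)   # -> None [1, 2, 3]        (sorted in place, returns None)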
#30- Using Porter and Lancaster Stemmer to normalize text and checking differences
text = gut.words('carroll-alice.txt')
words = [w for w in text if w.isalpha()]
words = words[:30]
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
port = [porter.stem(t) for t in words]
lancas = [lancaster.stem(t) for t in words]
print(port)
print()
print(lancas)
# Porter: -> ['Alic', 's', 'Adventur', 'in', 'Wonderland', 'by', 'Lewi',
# 'Carrol', 'CHAPTER', 'I']
# Lancaster -> ['al', 's', 'adv', 'in', 'wonderland', 'by', 'lew', 'carrol',
# 'chapt', 'i']
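# Optional side-by-side view of the two stemmers' output (the formatting is my own addition):
for original, p_stem, l_stem in zip(words, port, lancas):
    print('{:15} {:15} {:15}'.format(original, p_stem, l_stem))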
import nltk
from nltk.corpus import gutenberg as gut
from nltk.corpus import brown as bw
from nltk.corpus import state_union as union
from nltk.corpus import wordnet as wn
from nltk.corpus import names as nm
from tabulate import tabulate
from nltk.corpus import stopwords
# 2- General information about Austen's book:
words = gut.words('austen-persuasion.txt')
print('tokens = ', len(words))
print('vocabulary = ', len(set(words)))
print("lexical diversity = ", len(set(words))/len(words))
print("lexical redundancy = ", len(words)/len(set(words)))
# 3- Accessing sample texts on two different categories
# Accessing 'webtext' through specific fileids:
print(nltk.corpus.webtext.words(fileids=["firefox.txt", 'singles.txt']))
# Accessing 'brown' through its categories:
print(bw.words(categories=["hobbies", "government"]))
# Getting to know which fileids are in brown
print(bw.fileids())
# Getting to know which categories are in brown
print(bw.categories())
# 4- Counting the words 'men', 'women', 'people' in different presidential speeches
cfd = nltk.ConditionalFreqDist(
(fileid, word)
for fileid in union.fileids()
for word in union.words(fileid))
cfd.tabulate(conditions=union.fileids(), samples=['men', 'women', 'people'])  # tabulate() prints the table itself
# 5- Wordnet and the holonym-meronym relation
print(wn.synset('cloud.n.01').member_meronyms())
print(wn.synset('cloud.n.01').part_meronyms())
print(wn.synset('cloud.n.01').substance_meronyms())
print(wn.synset('cloud.n.01').member_holonyms())
print(wn.synset('cloud.n.01').part_holonyms())
print(wn.synset('cloud.n.01').substance_holonyms())
# 6- Getting the concordance of the word 'however' in different texts.
words1 = nltk.Text(gut.words('austen-sense.txt'))
words1.concordance('However')
print()
words2 = nltk.Text(bw.words('ca01'))
words2.concordance('However')
print()
words3 = nltk.Text(union.words('1980-Carter.txt'))
words3.concordance('However')
# 7- ConditionalFreqDist to discover which initial letters in names are more common
# on men and women.
cfd = nltk.ConditionalFreqDist(
(fileid, name[0])
for fileid in nm.fileids()
for name in nm.words(fileid))
cfd.plot()
# 13- Getting the percentage of all noun synsets that have no hyponyms
count = 0
alle = list(wn.all_synsets('n'))
for hyp in alle:
    if len(hyp.hyponyms()) == 0:
        count += 1
print(count/len(alle))
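# The same count as a one-liner (my own equivalent of the loop above):
print(sum(1 for s in alle if not s.hyponyms()) / len(alle))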
# 14- Definition of s and the definitions of its hyponyms and hypernyms
def supergloss(s):
    res = s.definition() + "\n"
    for w in s.hyponyms():
        res += 'Hyponyms ->>' + str(w) + ' ' + w.definition() + " \n"
    for w in s.hypernyms():
        res += 'Hypernym ->> ' + str(w) + ' ' + w.definition() + " \n"
    return res
print(supergloss(wn.synset('cloud.n.01')))
# 15- Print the words that appear more than three times in a text
words = bw.words()
fd = nltk.FreqDist(words)  # counting once is far faster than calling words.count() in a loop
for word in fd:
    if fd[word] > 3:
        print(word)
# 16- Generate a lexical diversity table for each category in the Brown corpus
def lexical_div_tabulation():
    lex_diversity_list = list()
    for cat in bw.categories():
        words_in_cat = bw.words(categories=cat)
        words_in_cat = [w.lower() for w in words_in_cat]
        lex_diversity = len(words_in_cat)/len(set(words_in_cat))
        lex_diversity_list.append([cat, lex_diversity])
    print(tabulate(lex_diversity_list, headers=["category", "lexical diversity"]))
# Another way of getting roughly the same information, using a ConditionalFreqDist:
def lexical_diversity(text):
    return len(text)/len(set(text))
cfd = nltk.ConditionalFreqDist(
(category, lexical_diversity(nltk.Text(bw.words(categories=category))))
for category in bw.categories())
cfd.tabulate()
# 17- Getting the 50 most frequent words in a text without stopwords and punctuation
text = gt.words('carroll-alice.txt')
stops = stopwords.words('english')  # keep the module name 'stopwords' unshadowed
content = [w for w in text if w.lower() not in stops]
content = [w for w in content if w.isalpha()]
fd_words = nltk.FreqDist(content).most_common(50)
# 18- Getting the 5 most frequent bigrams in a text without stopwords and punctuation
text = gt.words('carroll-alice.txt')
stops = stopwords.words('english')
content = [w for w in text if w.lower() not in stops]
content = [w for w in content if w.isalpha()]
bigrams = list(nltk.bigrams(content))
fd_bigrams = nltk.FreqDist(bigrams).most_common(5)
# 19- Table of word frequencies by genre
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in bw.categories()
for word in bw.words(categories=genre))
cfd.tabulate(conditions=bw.categories(),samples=['death', 'murder', 'kill', 'violence'])
# 20- Function taking a word and category of the brown corpus and computes the
# number of times that word appears in the given category
def word_freq(w, cat):
    words_in_cat = bw.words(categories=cat)
    return words_in_cat.count(w)
print(word_freq('would', 'adventure'))
# 21- Define a function that inserts the word 'like' after every third word of a text.
def hedge(words):
    words = list(words)
    n = 3
    while n <= len(words):
        words.insert(n, 'LIKE')
        n += 4  # step 4 because the list grows by one at every insertion
    return words
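# Example call (the sentence is my own):
print(hedge(['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy']))
# -> ['all', 'work', 'and', 'LIKE', 'no', 'play', 'makes', 'LIKE', 'jack', 'a', 'dull', 'LIKE', 'boy']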
from nltk.corpus import brown
from nltk.book import *
from tabulate import tabulate
from Premier import count
# Getting a table with tokens, types and lexical diversity for each category in the brown corpus:
def lexical_div_tabulation():
    list_lexical_div = []
    for cat in brown.categories():
        words_in_cat = brown.words(categories=cat)
        len_words = len(words_in_cat)           # Number of tokens per category
        len_set_words = len(set(words_in_cat))  # Number of types per category
        lexical_div = len_words / len_set_words
        list_lexical_div.append([cat, len_words, len_set_words, lexical_div])
    print(tabulate(list_lexical_div, headers=['Category', 'Tokens', 'Types', 'Lexical_Diversity']))
# Produce a dispersion plot of the main protagonists of Sense and Sensibility. What can you
# observe about the different roles played by the males and females in this novel? can you identify
# the couples?
text2.dispersion_plot(["Elinor", "Marianne", "Edward", "Willoughby"])
# Find the collocations in text5
text5.collocations()  # collocations() prints its results and returns None
# Take the first two sentences of a text and compare the length of their concatenation
# with the sum of their lengths
print(len(sent1), len(sent2), len(sent3))
print(len(sent1 + sent2))
print(len(sent1) + len(sent2))
# What does the second index do in this case? -- It gets the seventh character of the third word ('l' in 'Ishmael')
print(sent1[2][6])
# Find all the four letter words in the Chat corpus. With FreqDist show these words in decreasing
# order of frequency
def reversed_order():
    fours = [w.lower() for w in text5 if len(w) == 4 and w.isalpha()]
    fd = FreqDist(fours)
    reversed_pairs = [(v, k) for (k, v) in fd.items()]
    print(sorted(reversed_pairs, reverse=True))
# Use a combination of for and if statements to loop over the words of the movie script for
# Monty Python and the Holy Grail (text6) and print all the uppercase words, one per line.
upper_words = set([w for w in text6 if w.isupper()])
for i in upper_words:
    print(i)
# Write expressions for finding all words in text6 that meet the following conditions.
# The result should be in the form of a list of words: ['word1', 'word2', ...].
# a. Ending in ize
def multitask():
    # a. Ending in 'ize' (other conditions, e.g. word.istitle(), can be swapped in here)
    return [word for word in text6 if word.endswith('ize')]
sent = ['she', 'sells', 'sea', 'shells', 'by', 'the', 'sea', 'shore']
# Average word length of a text (sum the length of every word and divide by the number of words)
print(sum(len(w) for w in sent)/len(sent))
# Define a function that returns the vocabulary size of the text:
def vocab_size(text):
    return len(set(text))  # vocabulary size = number of distinct word types
# Define a function that finds out how often a word occurs in a text and what percentage
# of the text it takes up
def percent(word, text):
    times = text.count(word)
    return 100 * times / len(text)
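# Example calls (my own; text2 is loaded above by 'from nltk.book import *'):
print(vocab_size(text2))
print(percent('the', text2))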