MissOgra
2/22/2017 - 5:25 PM

nltk_book.py

import nltk
from nltk.corpus import brown as bw
from nltk.corpus import gutenberg as gt
import re

# 3- Tokenize and tag a sentence
sent = "They wind back the clock, while we chase after the wind"
tokens = nltk.word_tokenize(sent)
pos = nltk.pos_tag(tokens)

# 5- Creating and updating dict(). What happens if I look up a key that's not in dict?

tel = dict()
tel.update({'cat':'jengibre', 'dog': 'mateo'})
# -> print(tel['xyz']) KeyError.
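# Illustration (not part of the exercise): dict.get() avoids the KeyError by
# returning None, or a default value you supply, for a missing key.
print(tel.get('xyz'))         # -> None
print(tel.get('xyz', 'n/a'))  # -> 'n/a'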

# 6- Deleting items from dict()

tel = dict()
tel.update({'cat':'jengibre', 'dog': 'mateo'})
del tel['cat']
# -> {'dog': 'mateo'}

# 7- Update d1.update(d2).

tel = dict()
loo = dict()
tel.update({'cat':'jengibre', 'dog': 'mateo'})
loo.update({'mouse':'spleen', 'duck':'boxer'})
tel.update(loo)
print(tel)
# ->{'mouse': 'spleen', 'cat': 'jengibre', 'dog': 'mateo', 'duck': 'boxer'}

# 8- Create a dict() e.

e = {'headword':'head', 'pos':'noun', 'example':'my head hurts'}

# 10- Train a UnigramTagger and use it on some new text

bw_tagged = bw.tagged_sents(categories='news') # For training
bw_sent = bw.sents(categories='news') # For testing the tagger on the training data
gt_text = gt.sents('carroll-alice.txt') # New text
uni_tagger = nltk.UnigramTagger(bw_tagged) # Training the tagger
bw_gettaged = uni_tagger.tag(bw_sent[98]) # Testing on a sentence of the trained data (shouldn't be done)
gt_gettaged = uni_tagger.tag(gt_text[101]) # Testing it on a new text

# -> [('Oh', None), ('dear', None), (',', ','), ('what', 'WDT'), ('nonsense', 'NN'), 
# ('I', 'PPSS'), ("'", "'"), ('m', None), ('talking', 'VBG'), ("!'", None)]. They were tagged 
# as None because these words didn't come up during training. So the Tagger doesn't know
# how to tag them
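# Sketch (my own addition): giving the UnigramTagger a DefaultTagger as backoff
# avoids the None tags -- words unseen during training fall back to 'NN'.
uni_backoff = nltk.UnigramTagger(bw_tagged, backoff=nltk.DefaultTagger('NN'))
print(uni_backoff.tag(gt_text[101]))  # the words tagged None above should now come out as 'NN'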


# 12- Train a bigram tagger without backoff tagger and run it on training data:

bw_new_text = bw.sents(categories='fiction') # new text to test tagger
bw_tagged = bw.tagged_sents(categories='news') # For training
bw_sent = bw.sents(categories='news') # For testing the tagger
bi_tagger = nltk.BigramTagger(bw_tagged) # Training the tagger
bw_gettaged = bi_tagger.tag(bw_sent[107]) # Testing on a sentence of the trained data (shouldn't be done)
bw_new_gettaged = bi_tagger.tag(bw_new_text[100]) # Testing it on a new text
print(bw_new_gettaged)
# -> 0.78 score; however, without a backoff this tagger doesn't work with new data:
# -> Many words in the new text are tagged with None. Notice that the bigram tagger 
# manages to tag every word in a sentence it saw during training, but does badly
# on an unseen sentence. As soon as it encounters a new word (i.e., 13.5), 
# it is unable to assign a tag. It cannot tag the following word (i.e., million), 
# even if it was seen during training, simply because it never saw it during 
# training with a None tag on the previous word. Consequently, the tagger fails
# to tag the rest of the sentence
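# Sketch (my own addition): the usual fix is a backoff chain bigram -> unigram -> default,
# so the bigram tagger only handles contexts it actually saw during training.
default_t = nltk.DefaultTagger('NN')
uni_t = nltk.UnigramTagger(bw_tagged, backoff=default_t)
bi_t = nltk.BigramTagger(bw_tagged, backoff=uni_t)
print(bi_t.tag(bw_new_text[100]))  # no word should be left with a None tag now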

# 16- Model of a look up tagger:

words = bw.words(categories="news")
tagged_words = bw.tagged_words(categories="news")
most_common = nltk.FreqDist(words).most_common(100)
cfd = nltk.ConditionalFreqDist(tagged_words)
most_likely_tag = dict((word, cfd[word].max()) for word,_ in most_common)
tagger = nltk.UnigramTagger(model=most_likely_tag, backoff=nltk.DefaultTagger('NN'))

# -> What if the backoff is omitted? Words that didn't come up during training are
# tagged as None, since there is no general model the tagger can fall back on
# when it sees a new word.
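# Quick check of the claim above (my own addition): the same lookup model without
# a backoff leaves every word outside the 100-word model tagged as None.
tagger_nobackoff = nltk.UnigramTagger(model=most_likely_tag)
print(tagger_nobackoff.tag(bw.sents(categories='news')[3]))  # words not in the model -> None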

# 19 -> (a) We train a UnigramTagger by specifying tagged sentence data as a 
# parameter when we initialize the tagger. The training process involves inspecting 
# the tag of each word and storing the most likely tag for any word in a dictionary 
# that is stored inside the tagger. The evaluate() method must be given a separate
# test set as a parameter, otherwise the score is misleadingly high because the
# tagger is being tested on its own training data.
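# Sketch of that point (my own addition, assuming the usual 90/10 split): the score
# on held-out sentences is realistic, the score on the training sentences is inflated.
news_tagged = bw.tagged_sents(categories='news')
split = int(len(news_tagged) * 0.9)
uni_19 = nltk.UnigramTagger(news_tagged[:split])
print(uni_19.evaluate(news_tagged[split:]))  # accuracy on unseen sentences
print(uni_19.evaluate(news_tagged[:split]))  # much higher: tested on its own training data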

# 22 -> Create a RegexpTagger:

patterns = [
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # simple past
(r'.*es$', 'VBZ'), # 3rd singular present
]

regexp_tagger = nltk.RegexpTagger(patterns)
tagging = regexp_tagger.tag(bw.sents()[5])
print(regexp_tagger.evaluate(bw.tagged_sents()))  # note: evaluating on the whole corpus is slow

# Create a tagger workflow including backoff

alice = gt.words('carroll-alice.txt') # Text for final test

bw_words = bw.words()
bw_tagged_wd = bw.tagged_words()
bw_tagged_st = bw.tagged_sents()

# size_words = int(len(bw_tagged_wd) * 0.9)
# training_wd = bw_tagged_wd[:size_words] 
# test_wd = bw_tagged_wd[size_words:]

# Separating train from test data
size_sents = int(len(bw_tagged_st) * 0.9)
training_sents = bw_tagged_st[:size_sents] 
test_sents = bw_tagged_st[size_sents:]

# Regexp Tagger
patterns = [
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # simple past
(r'.*es$', 'VBZ'), # 3rd singular present
]

t0 = nltk.RegexpTagger(patterns)

# Lookup Tagger

fd = nltk.FreqDist(bw_words)
cfd = nltk.ConditionalFreqDist(bw_tagged_wd)
most_common = fd.most_common(100)
most_likely_tag = dict((word, cfd[word].max()) for word,_ in most_common)
t1 = nltk.UnigramTagger(model=most_likely_tag, backoff=t0)

# Unigram Tagger

t2 = nltk.UnigramTagger(training_sents, backoff=t1) 


# 2-Gramm Tagger

t3 = nltk.BigramTagger(training_sents, cutoff=2, backoff=t2) # cutoff=2 discards contexts whose most likely tag was seen only once or twice.

# 3-Gramm Tagger

t4 = nltk.TrigramTagger(training_sents, cutoff=2, backoff=t3)

# Trying it on an unseen text 
tagged_text = t4.tag(alice)

print(tagged_text)
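# My own addition: the backoff chain can also be scored against the held-out
# Brown sentences (evaluate() expects gold-standard tagged sentences; it is
# renamed accuracy() in newer NLTK releases).
print(t4.evaluate(test_sents))
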
import nltk
import re
import requests
from nltk.corpus import gutenberg as gut
from bs4 import BeautifulSoup as bs

# 1- Creating a string 's' and changing it using only slice and concatenation:

s = 'colorless'
index = s.find('rless')
output_line = s[:index] + 'u' + s[index:]
print(output_line)

# 2- Removing morphological endings through slice notation

words = ['dishes', 'running', 'nationality', 'undo', 'preheat']
dish = words[0][:-2]
run = words[1][:-4]
nation = words[2][:-5]
do = words[3][-2:]
heat = words[4][3:]
print(dish,run,nation,do,heat)

# 3- Is it possible to create an Index going beyond the start of the string?

word = 'dishes'
dish = word[-10:]
# -> Yes, it doesn't produce any error: the slice is simply clamped to the start
# of the string, so dish == 'dishes'.

# 4- The step size of the slice can also be negative. What happens in this case?

monty = 'Monty Python'
print(monty[::-2])
# It prints the string backwards, taking every second character: 'nhy to'

# 5- What happens if you ask the interpreter to evaluate monty[::-1]?

monty = 'Monty Python'
print(monty[::-1])
# -> nohtyP ytnoM

# 6- Describe the class of strings matched by the following regular expressions:

[a-zA-Z]+ # {whole}
[A-Z][a-z]* # -> {Ascii}
p[aeiou]{,2}t # -> {Paet}
\d+(\.\d+)? # -> {12} {17.2}
([^aeiou][aeiou][^aeiou])* # bat
\w+|[^\w\s]+ # it matches every word or what it doesn't start with a word but 
# a whitespace
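# Quick illustrative checks with re.findall (my own examples):
print(re.findall(r'[a-zA-Z]+', 'the 2 cats'))           # -> ['the', 'cats']
print(re.findall(r'[A-Z][a-z]*', 'Ascii Art x'))        # -> ['Ascii', 'Art']
print(re.findall(r'p[aeiou]{,2}t', 'pt pat peat pit'))  # -> ['pt', 'pat', 'peat', 'pit']
print(re.findall(r'\d+(?:\.\d+)?', '12 and 17.2'))      # -> ['12', '17.2'] (group made non-capturing for findall)
print(re.findall(r'\w+|[^\w\s]+', "can't stop!"))       # -> ['can', "'", 't', 'stop', '!']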

# 8 Write a function that takes a URL as its argument and returns the content of
# the URL without the HTML mark up.

def html_webpage(link):
    response = requests.get(link)
    soup = bs(response.content, "lxml")
    text = soup.get_text()
    text = re.sub(r'\s+', ' ', text)   # normalizing whitespace
    text = re.sub(r'<.*?>', ' ', text) # removing any leftover HTML tags
    return text

print(html_webpage("http://www.presidentialrhetoric.com/historicspeeches/index.html"))

# 9- Define a function load(f) to load a file from your computer

def load(f):
    # "rU" (universal newlines) is deprecated; plain "r" is enough in Python 3
    with open(f, "r") as file_handle:
        return file_handle.readlines()
    
# 10- Rewrite the following loop as a list comprehension

sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
result = []
for word in sent:
  word_len = (word, len(word))
  result.append(word_len)
print(result)

# As a list comprehension:
result = [(word, len(word)) for word in sent]

# 11- Define a string and split it on the character 's'

raw = 'Write regular expressions to match the following classes of strings'.split('s')
print(raw)

#12 Write a for loop to print out all the characters of a string. One per line

raw = 'Write regular expressions to match the following classes of strings'
for i in raw:
    print(i)
    
#13- What's the difference between calling split() on a string with no argument and 
# with ' '?

raw = 'Write regular expressions to match the following classes of strings'.split()
raw1 = 'Write regular expressions to match the following classes "\n" of strings'.split(' ')
#-> ['Write', 'regular', 'expressions', 'to', 'match', 'the', 'following', 'classes', 'of', 'strings'] 
#->['Write', 'regular', 'expressions', 'to', 'match', 'the', 'following', 'classes', '"\n"', 'of', 'strings']
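# The actual difference (my own illustration): split() splits on any run of whitespace
# and drops empty strings; split(' ') splits on every single space and keeps empties.
print('a  b\tc'.split())     # -> ['a', 'b', 'c']
print('a  b\tc'.split(' '))  # -> ['a', '', 'b\tc']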

#14- Difference between sort and sorted in Python:

# list.sort() sorts the list in place and returns None.
# sorted(list) returns a new sorted list and leaves the original list unchanged.
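# Minimal illustration (my own addition):
nums = [3, 1, 2]
print(sorted(nums), nums)  # -> [1, 2, 3] [3, 1, 2]  (original list unchanged)
print(nums.sort(), nums)   # -> None [1, 2, 3]       (sorted in place, returns None)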

#30- Using Porter and Lancaster Stemmer to normalize text and checking differences

text = gut.words('carroll-alice.txt')
words = [w for w in text if w.isalpha()]
words = words[:30]
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
port = [porter.stem(t) for t in words]
lancas = [lancaster.stem(t) for t in words]
print(port)
print()
print(lancas)
# Porter: -> ['Alic', 's', 'Adventur', 'in', 'Wonderland', 'by', 'Lewi', 
# 'Carrol', 'CHAPTER', 'I']
# Lancaster -> ['al', 's', 'adv', 'in', 'wonderland', 'by', 'lew', 'carrol', 
# 'chapt', 'i']
import nltk
from nltk.corpus import gutenberg as gut
from nltk.corpus import brown as bw
from nltk.corpus import state_union as union
from nltk.corpus import wordnet as wn
from nltk.corpus import names as nm
from tabulate import tabulate
from nltk.corpus import stopwords

# 2- General information about Austen's book:
words = gut.words('austen-persuasion.txt')
print('tokens = ', len(words))
print('vocabulary = ', len(set(words)))
print("lexical diversity = ", len(set(words))/len(words))
print("lexical redundancy = ", len(words)/len(set(words)))

# 3- Accessing sample texts on two different categories

# Accessing 'webtext' through specific fileids:
print(nltk.corpus.webtext.words(fileids=["firefox.txt", 'singles.txt']))
# Accessing 'brown' through its categories:
print(bw.words(categories=["hobbies", "government"]))
# Getting to know which fileids are in brown
print(bw.fileids())
# Getting to know which categories are in brown
print(bw.categories())

# 4- Counting the words 'men', 'women', 'people' in different presidential speeches

cfd = nltk.ConditionalFreqDist(
    (fileid, word)
    for fileid in union.fileids()
    for word in union.words(fileid))

cfd.tabulate(conditions=union.fileids(), samples=['men', 'women', 'people'])

# 5- Wordnet and the holonym-meronym relation

print(wn.synset('cloud.n.01').member_meronyms())
print(wn.synset('cloud.n.01').part_meronyms())
print(wn.synset('cloud.n.01').substance_meronyms())
print(wn.synset('cloud.n.01').member_holonyms())
print(wn.synset('cloud.n.01').part_holonyms())
print(wn.synset('cloud.n.01').substance_holonyms())

# 6- Getting the concordance of the word 'however' in different texts.

words1 = nltk.Text(gut.words('austen-sense.txt'))
words1.concordance('However')
print()
words2 = nltk.Text(bw.words('ca01'))
words2.concordance('However')
print()
words3 = nltk.Text(union.words('1980-Carter.txt'))
words3.concordance('However')

# 7- ConditionalFreqDist to discover which initial letters in names are more common
# on men and women.

cfd = nltk.ConditionalFreqDist(
    (fileid, name[0])
    for fileid in nm.fileids()
    for name in nm.words(fileid))

cfd.plot()

# 13- Getting the percentage of all noun synsets that have no hyponyms

count = 0
alle = list(wn.all_synsets('n'))
for hyp in alle:
    if len(hyp.hyponyms()) == 0:
        count += 1
print(count/len(alle))

# 14- Definition of s and the definitions of its hyponyms and hypernyms

def supergloss(s):
    res = s.definition() +"\n"
    for w in s.hyponyms():
        res += 'Hyponyms ->>'+ str(w) + ' '+ w.definition() + " \n"
    for w in s.hypernyms():
        res += 'Hypernym ->> '+ str(w) + ' '+ w.definition() + " \n"
    return res


print(supergloss(wn.synset('cloud.n.01')))

# 15- Print the words that appear more than three times in a text

words = bw.words()
fd = nltk.FreqDist(words)
for word in fd:
    if fd[word] > 3:
        print(word)
        
# 16- Generate a lexical diversity table for each category in the Brown corpus

def lexical_div_tabulation():
    lex_diversity_list = list()
    for cat in bw.categories():
        words_in_cat = bw.words(categories=cat)
        words_in_cat = [w.lower() for w in words_in_cat]
        lex_diversity = len(words_in_cat)/len(set(words_in_cat))
        lex_diversity_list.append([cat,lex_diversity])
    print(tabulate(lex_diversity_list, headers=["category", "lexical diversity"]))

# Another way of producing a table, with ConditionalFreqDist (note: each category just
# gets a count of 1 under its own diversity value, so this is more of a curiosity):

def lexical_diversity(text):
  return len(text)/len(set(text))
  
cfd = nltk.ConditionalFreqDist(
    (category, lexical_diversity(nltk.Text(bw.words(categories=category))))
    for category in bw.categories())
cfd.tabulate()

# 17- Getting the 50 most frequent words in a text without stopwords and punctuation

text = gut.words('carroll-alice.txt')
stops = stopwords.words('english')
content = [w for w in text if w.lower() not in stops]
content = [w for w in content if w.isalpha()]
fd_words = nltk.FreqDist(content).most_common(50)

# 18- Getting the 5 most frequent bigrams in a text without stopwords and punctuation

text = gut.words('carroll-alice.txt')
stops = stopwords.words('english')
content = [w for w in text if w.lower() not in stops]
content = [w for w in content if w.isalpha()]
bigrams = list(nltk.bigrams(content))
fd_bigrams = nltk.FreqDist(bigrams).most_common(5)

# 19- Table of word frequencies by genre

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in bw.categories()
    for word in bw.words(categories=genre))

cfd.tabulate(conditions=bw.categories(),samples=['death', 'murder', 'kill', 'violence'])

# 20- Function taking a word and category of the brown corpus and computes the 
# number of times that word appears in the given category

def word_freq(w, cat):
    cat = bw.words(categories=cat)
    w = cat.count(w)
    return w

print(word_freq('would', 'adventure'))

# 21- Define a function that inserts the word 'like' after every third word of the text.

def hedge(words):
    words = list(words)
    n = 3
    while n <= len(words):
        words.insert(n, 'LIKE')
        n += 4
    return words
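# Example call (my own addition) with a short made-up sentence:
print(hedge(['life', 'is', 'very', 'short', 'and', 'anxious']))
# -> ['life', 'is', 'very', 'LIKE', 'short', 'and', 'anxious', 'LIKE']
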
from  nltk.corpus import brown
from nltk.book import *
from tabulate import tabulate
from Premier import count

# Getting a table with tokens, types and lexical diversity for each category in the brown corpus:

def lexical_div_tabulation():
    list_lexical_div = []
    for cat in brown.categories():
        words_in_cat = brown.words(categories=cat)
        len_words = len(words_in_cat)  # Number of words per category
        len_set_words = len(set(words_in_cat)) # Number of types per category
        lexical_div = len_words / len_set_words 
        list_lexical_div.append([cat, len_words, len_set_words, lexical_div])
    print(tabulate(list_lexical_div, headers=['Category', 'Tokens', 'Types', 'Lexical_Diversity']))

# Produce a dispersion plot of the main protagonists of Sense and Sensibility. What can you
# observe about the different roles played by the males and females in this novel? Can you
# identify the couples?

text2.dispersion_plot(["Elinor", "Marianne", "Edward", "Willoughby"])

# Find the collocations in text5
text5.collocations()

# Take the first two sentences of a text and add their lengths

print(len(sent1), len(sent2), len(sent3))
print(len(sent1 + sent2))
print(len(sent1) + len(sent2))

# What does the second index do in this case? -- It gets the seventh character (index 6) of the third word
print(sent1[2][6])

# Find all the four letter words in the Chat corpus. With FreqDist show these words in decreasing 
# order of frequency

def reversed_order():
    fours = [w.lower() for w in text5 if len(w) == 4 and w.isalpha()]
    fd = FreqDist(fours)
    reversed_pairs = [(v,k) for (k,v) in fd.items()]
    print(sorted(reversed_pairs, reverse=True))

# Use a combination of for and if statements to loop over the words of the movie script for 
# Monty Python and the Holy Grail (text6) and print all the uppercase words, one per line.

upper_words = set([w for w in text6 if w.isupper()])
for i in upper_words:
    print(i)

# Write expressions for finding all words in text6 that meet the following conditions. 
# The result should be in the form of a list of words: ['word1', 'word2', ...].

# a. Ending in ize (swap in the commented condition to test titlecase words instead)

def multitask():
    for word in text6:
        # if word.istitle():
        if len(word) > 4 and word[-3:] == "ize":
            print(word)
sent = ['she', 'sells', 'sea', 'shells', 'by', 'the', 'sea', 'shore']

# Average word length of a text (it sums the length of each word and divides it through the 
# length of the whole text)
print(sum(len(w) for w in sent)/len(sent))

# Define a function that returns the vocabulary size of the text:
def vocab_size(text):
    return len(set(text))

# Define a function that finds out how often a word occurs in a text and what percentage
# of the text it takes up

def percent(word, text):
    times = text.count(word)
    return 100 * times / len(text)
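# Example call (my own addition): 'sea' occurs twice in the 8-word sentence above
print(percent('sea', sent))  # -> 25.0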