import os.path
# Build the path to the Shakespeare corpus relative to the data directory.
baseDir = os.path.join('data')
inputPath = os.path.join('cs100', 'lab1', 'shakespeare.txt')
fileName = os.path.join(baseDir, inputPath)

# NOTE: this script uses `sc`, `removePunctuation` and `wordCount`; they must
# already be defined when these statements run (the latter two appear further
# down in this file).

# Load the text with 8 as the second argument to textFile (the minimum number
# of partitions in the PySpark API) and clean every line.
# NOTE(review): the original statement was cut off right after textFile(...)
# and left its parenthesis open. The cleaning step is restored here because
# the recorded top-3 output in this file shows lower-cased words - confirm.
shakespeareRDD = (sc
                  .textFile(fileName, 8)
                  .map(removePunctuation))

# Split every line into words on single spaces.
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda line: line.split(' '))

# Drop the empty strings that splitting on consecutive spaces produces.
shakeWordsRDD = shakespeareWordsRDD.filter(lambda word: len(word) > 0)

# Count the words and keep the three most frequent ones. Negating the count
# makes takeOrdered (which sorts ascending) return the largest counts first.
# Indexing the pair instead of unpacking it in the lambda signature keeps the
# code valid on both Python 2 and Python 3.
top3WordsAndCounts = wordCount(shakeWordsRDD).takeOrdered(
    3, key=lambda wordAndCount: -wordAndCount[1])

# A single parenthesised argument prints identically on Python 2 and 3.
print('\n'.join('{0}: {1}'.format(w, c) for w, c in top3WordsAndCounts))
# Recorded output of the top-3 word count:
# the: 27361
# and: 26028
# i: 20681
import re
def removePunctuation(text):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers are retained. Every other character
        is eliminated (e.g. it's becomes its); this includes underscores, tabs
        and newlines, which are deleted rather than replaced by a space.
        Leading and trailing whitespace is removed after punctuation is
        removed.

    Args:
        text (str): A string.

    Returns:
        str: The cleaned up string.
    """
    # Delete everything that is not an ASCII letter, a digit or a space,
    # then normalise the case and trim the ends.
    cleaned = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    return cleaned.lower().strip()
def wordCount(wordListRDD):
    """Creates a pair RDD with word counts from an RDD of words.

    Args:
        wordListRDD (RDD of str): An RDD consisting of words.

    Returns:
        RDD of (str, int): An RDD consisting of (word, count) tuples.
    """
    # Pair every word with an initial count of 1, then add up the counts
    # that share the same word.
    return (wordListRDD
            .map(lambda word: (word, 1))
            .reduceByKey(lambda left, right: left + right))