import os.path
# Build the path to the Shakespeare corpus: data/cs100/lab1/shakespeare.txt
baseDir = os.path.join('data')
inputPath = os.path.join('cs100', 'lab1', 'shakespeare.txt')
fileName = os.path.join(baseDir, inputPath)

# NOTE(review): `sc` is not defined in this file -- presumably the SparkContext
# supplied by the pyspark shell / notebook; confirm before running standalone.
# NOTE(review): removePunctuation and wordCount are defined further down in
# this file, so they must already exist (e.g. from an earlier notebook cell)
# when this statement runs.
# Read the text file (second argument 8 -- presumably the minimum partition
# count; confirm against the textFile API) and clean up every line.
shakespeareRDD = (sc
                  .textFile(fileName, 8)
                  .map(removePunctuation))

# Break every cleaned line into words on single spaces.
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda line: line.split(' '))

# Splitting on ' ' yields empty strings for blank lines and for runs of
# consecutive spaces; drop them so they are not counted as words.
shakeWordsRDD = shakespeareWordsRDD.filter(lambda word: len(word) > 0)

# Order by negated count so that the three most frequent words come first.
# Index access replaces tuple-parameter lambdas, which are Python-2-only syntax.
top3WordsAndCounts = wordCount(shakeWordsRDD).takeOrdered(3, key=lambda wc: -wc[1])
print('\n'.join('{0}: {1}'.format(word, count) for word, count in top3WordsAndCounts))

# Expected output, as recorded in the original source:
#   the: 27361
#   and: 26028
#   i: 20681
import re
# Matches every character that is not an ASCII letter, a digit, or a space.
# Compiled once at import time instead of on every call.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9 ]')


def removePunctuation(text):
    """Remove punctuation, lower-case the text, and strip surrounding whitespace.

    Note:
        Only spaces, ASCII letters, and digits are retained; every other
        character is eliminated (e.g. "it's" becomes "its"). Leading and
        trailing whitespace is stripped after the unwanted characters have
        been removed.

    Args:
        text (str): A string.

    Returns:
        str: The cleaned-up string.
    """
    return _DISALLOWED_CHARS.sub('', text).lower().strip()
def wordCount(wordListRDD):
    """Build a pair RDD of per-word occurrence counts.

    Args:
        wordListRDD (RDD of str): An RDD whose elements are words.

    Returns:
        RDD of (str, int): An RDD of (word, count) tuples.
    """
    # Tag every word with an initial count of one ...
    pairedWithOne = wordListRDD.map(lambda word: (word, 1))
    # ... then add up the ones that share the same word.
    return pairedWithOne.reduceByKey(lambda left, right: left + right)