w22116972
7/31/2015 - 1:54 PM

main.py

import os.path
# Build the path to the input text: data/cs100/lab1/shakespeare.txt
baseDir = os.path.join('data')
inputPath = os.path.join('cs100', 'lab1', 'shakespeare.txt')
fileName = os.path.join(baseDir, inputPath)

# Read the file as an RDD of lines (8 is passed as textFile's second
# argument) and clean every line with removePunctuation.
shakespeareRDD = (sc
                  .textFile(fileName, 8)
                  .map(removePunctuation))
# Split each cleaned line on single spaces so every element is one word.
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda x: x.split(' '))
# Consecutive spaces produce empty strings when splitting; drop them.
shakeWordsRDD = shakespeareWordsRDD.filter(lambda x: len(x) > 0)
# Count the words and keep the three with the highest counts. The key negates
# the count so that takeOrdered's ascending order yields the largest first.
# Indexing replaces the tuple-parameter lambda, which Python 3 rejects.
top3WordsAndCounts = wordCount(shakeWordsRDD).takeOrdered(3, key=lambda wc: -wc[1])
# print(...) with one parenthesised argument behaves the same on Python 2 and 3.
print('\n'.join('{0}: {1}'.format(w, c) for w, c in top3WordsAndCounts))
'''
Expected output:
the: 27361
and: 26028
i: 20681
'''
import re
def removePunctuation(text):
    """Normalize a line of text for word counting.

    Every character that is not an ASCII letter, a digit, or a space is
    deleted (so "it's" becomes "its"). The result is then lower-cased and
    its leading and trailing whitespace is stripped.

    Args:
        text (str): The string to clean.

    Returns:
        str: The cleaned, lower-cased, stripped string.
    """
    # Delete disallowed characters first so that spaces exposed at the ends
    # by removed punctuation are also stripped.
    keptChars = re.sub('[^a-zA-Z0-9 ]', '', text)
    lowered = keptChars.lower()
    return lowered.strip()
def wordCount(wordListRDD):
    """Count how many times each word occurs in an RDD of words.

    Args:
        wordListRDD (RDD of str): An RDD whose elements are individual words.

    Returns:
        RDD of (str, int): One (word, count) tuple per distinct word.
    """
    # Pair every word with an initial count of one ...
    wordOnes = wordListRDD.map(lambda word: (word, 1))
    # ... then add up the ones that share the same word.
    return wordOnes.reduceByKey(lambda total, one: total + one)