dgadiraju
7/30/2017 - 6:18 AM

pyspark-word-count.py

# Input text file for the word count. Two alternatives were given; the first
# is active. The second looks like a cluster-side path - swap the two lines
# to use it instead (presumably when running against HDFS; confirm).
inputPath = "/Users/itversity/Research/data/wordcount.txt"
# inputPath = "/public/randomtextwriter/part-m-00000"

# Output directory for the saved word counts; same two alternatives as above.
outputPath = "/Users/itversity/Research/data/wordcount"
# outputPath = "/user/dgadiraju/wordcount"

# Make sure outputPath does not exist for this example

# Preview the word counts: split every line of inputPath on single spaces,
# pair each token with 1, sum the 1s per token, and print the 100 pairs
# returned by take(100).
# NOTE(review): `sc` is assumed to be the SparkContext supplied by the
# pyspark shell - it is not created in this script.
for i in (
    sc.textFile(inputPath)
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda total, count: total + count)
    .take(100)
):
    print(i)

# Saving to file
# Runs the word-count pipeline (split each line on single spaces, pair each
# token with 1, sum the 1s per token) over inputPath and writes the
# resulting (word, count) pairs to outputPath.
# NOTE(review): `sc` is assumed to be the SparkContext supplied by the
# pyspark shell - it is not created in this script.
(
    sc.textFile(inputPath)
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda total, count: total + count)
    .saveAsTextFile(outputPath)
)