# dgadiraju
# 8/2/2017 - 6:44 AM
#
# pyspark-wordcount-numtasks.py

# Input file pattern and output directory for the word count job.
inputPath = "/public/randomtextwriter/part-m-0000*"
outputPath = "/user/dgadiraju/wordcount"

# Word count over every file matching inputPath: split each line on single
# spaces, pair every token with 1, and sum the ones per token.
# The reduce step is explicitly given 10 partitions (tasks); the original note
# suggests about 4 could be ideal when only 1 file is processed.
# `sc` is not defined in this file (presumably the SparkContext provided by
# the pyspark shell -- confirm before running as a standalone script).
(
    sc.textFile(inputPath)
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right, numPartitions=10)
    .saveAsTextFile(outputPath)
)