# dgadiraju
# 8/2/2017 - 7:29 AM
#
# pyspark-wordcount-coalesce.py

# Word count that uses coalesce() to control the number of tasks per stage.
#
# Relies on `sc`, which is not defined in this script -- presumably the
# SparkContext provided by the pyspark shell; confirm before running it
# any other way.
#
# NOTE: this script only prints a sample of the result. If you extend it to
# save the output, make sure the output directory does not already exist.

# Input path: keep the one that matches your environment active.
path = "/Users/itversity/Research/data/wordcount.txt"
# path = "/public/randomtextwriter/part-m-00000"

lines = sc.textFile(path)
# Without coalesce it will try to use 9 tasks in the first stage.
lines_coalesce = lines.coalesce(5)
# Build on lines_coalesce (not lines); otherwise the coalesce above is
# never part of the lineage and has no effect.
words = lines_coalesce.flatMap(lambda rec: rec.split(" "))
tuples = words.map(lambda rec: (rec, 1))
wordByCount = tuples.reduceByKey(lambda total, agg: total + agg)
wbcCoalesce = wordByCount.coalesce(2)  # second stage will use only 2 tasks

# Print up to 100 (word, count) pairs as a preview.
for i in wbcCoalesce.take(100):
    print(i)