# dgadiraju
# 7/30/2017 - 12:51 PM
#
# pyspark-transformations-mapping.py

# Load the orders data set as an RDD of text lines.
# NOTE: `sc` is not defined in this script; presumably it is the SparkContext
# that the pyspark shell provides -- confirm for your environment.
#
# Change to a valid path as per your preference. Make sure the directory
# `orders` exists at that path (locally or on HDFS).
orders = sc.textFile("/public/retail_db/orders")  # on the lab, accessing HDFS
# The assignment below rebinds `orders`, so when both lines are left in place
# the local path is the one that takes effect. Keep only the line you need.
orders = sc.textFile("/Users/itversity/Research/data/retail_db/orders")  # accessing locally on the PC

# Preview the first 10 records.
for i in orders.take(10):
    print(i)

def _status_is_complete(rec):
    """Return True when the 4th comma-separated field equals "COMPLETE"."""
    return rec.split(",")[3] == "COMPLETE"


def _pending_or_processing_in_2013_08(o):
    """Return True for records that pass both checks below.

    1. The 4th comma-separated field contains "PENDING" (substring match) or
       is exactly "PROCESSING".
    2. The 2nd comma-separated field contains "2013-08".
    """
    fields = o.split(",")
    status = fields[3]
    # Guard clause: reject on status first, so the 2nd field is only examined
    # for records whose status already qualifies.
    if "PENDING" not in status and status != "PROCESSING":
        return False
    return "2013-08" in fields[1]


# Records whose status field is exactly COMPLETE.
completedOrders = orders.filter(_status_is_complete)

# Records with a PENDING-like or PROCESSING status and "2013-08" in field 2.
pendingOrders = orders.filter(_pending_or_processing_in_2013_08)

def _first_field_as_int_with_second(rec):
    """Return (int(first field), second field) of a comma-separated record.

    Presumably the first field is the order id and the second the order
    date -- verify against the data set.
    """
    fields = rec.split(",")
    return int(fields[0]), fields[1]


# Pair RDD built from the completed orders: (int key, second field as string).
orderDates = completedOrders.map(_first_field_as_int_with_second)

# Sample sentences used as input for the flatMap demonstration; each list
# element is one record.
lines = [
    "Hello World",
    "In this case we are trying to understand",
    "the purpose of flatMap",
    "flatMap is a function which will apply transformation",
    "if the transformation results in array, it will flatten out array as individual records",
    "let us also understand difference between map and flatMap",
    "in case of map, it take one record and return one record after applying transformation",
    "even if the transformation result in an array",
    "where as in case of flatMap, it might return one or more records",
    "if the transformation of 1 record result an array of 1000 records, ",
    "then flatMap returns 1000 records",
]
def _split_on_single_space(rec):
    """Split a record on the literal single-space separator.

    Unlike a bare split(), splitting on " " keeps empty strings, so a record
    ending in a space contributes a trailing "" element.
    """
    return rec.split(" ")


# Turn the in-memory list into an RDD with one element per sentence.
linesRDD = sc.parallelize(lines)

# flatMap flattens each per-sentence list of words into individual records.
words = linesRDD.flatMap(_split_on_single_space)

# Collect every word and print one per line.
for word in words.collect():
    print(word)