
# pyspark-aggregations.py

# Pick the path that matches your environment
path = "/Users/itversity/Research/data/retail_db"
# path = "/public/retail_db"
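# This script assumes the pyspark shell, where sc is predefined.
# For a standalone run, a minimal setup sketch along these lines
# would be needed first (the app name here is arbitrary):
# from pyspark import SparkConf, SparkContext
# sc = SparkContext(conf=SparkConf().setAppName("pyspark-aggregations"))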

# Build (order_id, subtotal) pairs from order_items
# (field 1 is the order id, field 4 is the item subtotal)
orderItems = sc.textFile(path + "/order_items").\
map(lambda orderItem: (int(orderItem.split(",")[1]), float(orderItem.split(",")[4])))
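
# Optional sanity check: preview a few (order_id, subtotal) pairs
for i in orderItems.take(3):
  print(i)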

# Compute revenue for each order
for i in orderItems.\
reduceByKey(lambda total, orderItemSubtotal: total + orderItemSubtotal).\
take(100):
  print(i)

# Compute revenue and number of items for each order using aggregateByKey
for i in orderItems.\
aggregateByKey((0.0, 0),  # zero value: (running revenue, running item count)
    # seqOp: fold one item subtotal into the per-partition accumulator
    lambda iTotal, oisubtotal: (iTotal[0] + oisubtotal, iTotal[1] + 1),
    # combOp: merge accumulators from different partitions
    lambda fTotal, iTotal: (fTotal[0] + iTotal[0], fTotal[1] + iTotal[1])
  ).\
take(100):
  print(i)
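
# For comparison (not in the original), the same (revenue, count) result
# can be written with combineByKey; this sketch mirrors the
# aggregateByKey logic above, with the first value seeding the accumulator.
for i in orderItems.\
combineByKey(lambda oisubtotal: (oisubtotal, 1),
    lambda iTotal, oisubtotal: (iTotal[0] + oisubtotal, iTotal[1] + 1),
    lambda fTotal, iTotal: (fTotal[0] + iTotal[0], fTotal[1] + iTotal[1])
  ).\
take(100):
  print(i)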

# Compute revenue and number of items for each order using reduceByKey
# (each value is a (subtotal, 1) tuple so both fields can be summed)
for i in sc.textFile(path + "/order_items").\
map(lambda orderItem: (int(orderItem.split(",")[1]), (float(orderItem.split(",")[4]), 1))).\
reduceByKey(lambda total, element: (total[0] + element[0], total[1] + element[1])).\
take(100):
  print(i)
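
# A possible follow-up (an addition, not in the original): average item
# subtotal per order, derived from the (revenue, count) pairs above.
# mapValues transforms each value while keeping the order id as the key.
for i in sc.textFile(path + "/order_items").\
map(lambda orderItem: (int(orderItem.split(",")[1]), (float(orderItem.split(",")[4]), 1))).\
reduceByKey(lambda total, element: (total[0] + element[0], total[1] + element[1])).\
mapValues(lambda revenueCount: revenueCount[0] / revenueCount[1]).\
take(100):
  print(i)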