# dgadiraju
# 11/4/2017 - 6:27 PM
#
# core-spark-broadcast-variables.py

#Check out our lab for practice: https://labs.itversity.com

#Get Daily Revenue per product using Broadcast Variable
# Read the products file with plain Python file I/O (not as an RDD) and build
# a lookup dict from it. Each line is split on "," once; field 0 (converted
# to int) becomes the key and field 2 the value -- presumably
# product id -> product name; confirm against the products file layout.
# The file is opened in a `with` block so the handle is closed as soon as the
# lines have been read instead of being left open.
with open("/data/retail_db/products/part-00000") as productsFile:
    products = productsFile.read().splitlines()
productsMap = {
    int(fields[0]): fields[2]
    for fields in (product.split(",") for product in products)
}

# Wrap the lookup dict in a broadcast variable so the map function below
# reads it through productsBV.value instead of capturing the dict directly.
productsBV = sc.broadcast(productsMap)

# Re-key every joined record as ((rec[1][0], looked-up value), rec[1][1][1]).
# Presumably rec is (order_id, (order_date, (product_id, subtotal))), which
# would make the output ((order_date, product_name), subtotal) -- confirm
# against how ordersJoin is built.
ordersJoinMap = ordersJoin.map(
    lambda rec: (
        (rec[1][0], productsBV.value[rec[1][1][0]]),
        rec[1][1][1],
    )
)

# Preview: print the first 10 records of ordersJoinMap.
for sampleRecord in ordersJoinMap.take(10):
    print(sampleRecord)

# Add up the values of ordersJoinMap per key. Presumably this yields the
# revenue per (date, product name) pair -- confirm against the key layout
# of ordersJoinMap.
dailyRevenuePerProductName = ordersJoinMap.reduceByKey(
    lambda runningTotal, amount: runningTotal + amount
)

#Raise any issues on https://discuss.itversity.com - make sure to categorize properly