# dgadiraju
# 7/30/2017 - 1:26 PM
#
# pyspark-minpricedproductbycategory.py

# Read the products file as an RDD of raw text lines.
# `sc` is not defined in this script; it must already be in scope
# (presumably the SparkContext provided by the pyspark shell).
path = "/public/retail_db"
products = sc.textFile(path + "/products")


def _field(record, index):
    """Return the comma-separated field at position ``index`` of a raw line."""
    return record.split(",")[index]


def _cheaper(agg, product):
    """Pick whichever of two product lines has the smaller field-4 value.

    Field 4 is parsed as a float (treated as the price). The comparison is
    strict, so when both values are equal the second argument is returned.
    """
    if float(_field(agg, 4)) < float(_field(product, 4)):
        return agg
    return product


# Pipeline:
#   1. drop lines whose field 4 is empty, so float() never sees "";
#   2. key every remaining line by int(field 1) (treated as the category id);
#   3. per key, keep only the line with the lowest field-4 value;
#   4. discard the key, leaving just the winning raw lines.
minPricedProductsByCategory = (
    products
    .filter(lambda line: _field(line, 4) != "")
    .map(lambda line: (int(_field(line, 1)), line))
    .reduceByKey(_cheaper)
    .map(lambda pair: pair[1])
)

# Bring the (one-line-per-key) result to the driver and print it.
for product_line in minPricedProductsByCategory.collect():
    print(product_line)