# dgadiraju
# 7/30/2017 - 3:22 PM
#
# pyspark-bykey-sorting-and-ranking.py

# Local dev path; on the cluster use "/public/retail_db" instead.
path = "/Users/itversity/Research/data/retail_db"
# path = "/public/retail_db"

# Each record is a CSV line: order_id,order_date,order_customer_id,order_status
orders = sc.textFile(path + "/orders")

# Orders sorted by status (field index 3 of the CSV record):
# key each full record by its status, sort by that key, then drop the key.
for order in orders.\
    map(lambda o: (o.split(",")[3], o)).\
    sortByKey().\
    map(lambda kv: kv[1]).\
    take(100):
    print(order)

# Orders sorted by (status, order_date) in descending order:
# a composite tuple key gives a secondary sort; sortByKey(False) = descending.
for order in orders.\
    map(lambda o: ((o.split(",")[3], o.split(",")[1]), o)).\
    sortByKey(False).\
    map(lambda kv: kv[1]).\
    take(100):
    print(order)

# Top 5 products (by price, descending) in each category.
products = sc.textFile(path + "/products")

# Key each product by its category id (field 1). Records with an empty
# price field (field 4) are dropped so float() below cannot fail.
productsGroupByCategory = products.\
    filter(lambda product: product.split(",")[4] != "").\
    map(lambda p: (int(p.split(",")[1]), p)).\
    groupByKey()

# For each category (in ascending category-id order) emit its 5 highest-priced
# products; the negated float key sorts prices in descending order.
for product in productsGroupByCategory.\
    sortByKey().\
    flatMap(lambda rec: sorted(rec[1], key=lambda k: -float(k.split(",")[4]))[0:5]).\
    take(100):
    print(product)