# Base directory of the retail_db dataset. Two candidate locations exist:
#   "/Users/itversity/Research/data/retail_db"
#   "/public/retail_db"
# The original line tried to express both with `or` around a second
# assignment, which is not valid Python. The first location is assigned here
# (the value `a or b` yields for non-empty strings); switch the literal to
# the other location if that is where the data lives in your environment.
path = "/Users/itversity/Research/data/retail_db"

# RDD of raw order records, one comma-separated line per element.
# `sc` is the SparkContext and must already be defined by the caller/shell.
orders = sc.textFile(path + "/orders")
# Orders sorted by status (ascending); print the first 100 records.
# Each line is paired with its status (comma-separated field 3) so that
# sortByKey() can order on it; the key is dropped again before printing.
for i in (
    orders
    .map(lambda o: (o.split(",")[3], o))
    .sortByKey()
    .map(lambda o: o[1])
    .take(100)
):
    print(i)
# Orders sorted by status and date, both in descending order; print the
# first 100 records.
def _status_date_key(order):
    """Return the composite sort key (field 3, field 1) of an order line.

    Per the comment above these are the status and the date. The line is
    split only once so both fields come from the same parse.
    """
    fields = order.split(",")
    return fields[3], fields[1]


# Tuples compare element by element, so ascending=False reverses the order
# on status first and on date within equal statuses.
for i in (
    orders
    .map(lambda o: (_status_date_key(o), o))
    .sortByKey(ascending=False)
    .map(lambda o: o[1])
    .take(100)
):
    print(i)
# Top 5 products in each category from products.
# `sc` is the SparkContext and `path` the dataset base directory; both must
# already be defined.
products = sc.textFile(path + "/products")

# Pair every product line with its integer category id (comma-separated
# field 1) and collect the lines of each category together. Lines whose
# field 4 is empty are dropped first because that field is parsed with
# float() when ranking below (presumably the product price - confirm
# against the dataset schema).
productsGroupByCategory = (
    products
    .filter(lambda product: product.split(",")[4] != "")
    .map(lambda p: (int(p.split(",")[1]), p))
    .groupByKey()
)

# Walk the categories in ascending id order and, for each one, emit its five
# lines with the largest field-4 value. sorted() accepts the grouped iterable
# directly, and reverse=True keeps tied records in their original relative
# order, matching a sort on the negated value.
for i in (
    productsGroupByCategory
    .sortByKey()
    .flatMap(
        lambda rec: sorted(
            rec[1],
            key=lambda k: float(k.split(",")[4]),
            reverse=True,
        )[:5]
    )
    .take(100)
):
    print(i)