# pyspark-set-operations.py
# dgadiraju, 7/30/2017
#
# Spark RDD set operations (union, distinct, intersection) on the retail_db dataset.

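# NOTE: `sc` is assumed to be the SparkContext already provided by the pyspark shell.
# If this were run as a standalone script, a context would need to be created first,
# e.g. (hypothetical app name):
#   from pyspark import SparkContext
#   sc = SparkContext(appName="pyspark-set-operations")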

path = "/public/retail_db" or path = "/Users/itversity/Research/data/retail_db"

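# Orders placed in 2013-12, keyed by order_id: (order_id, order_date)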
orders201312 = sc.textFile(path + "/orders").\
    filter(lambda order: "2013-12" in order.split(",")[1]).\
    map(lambda order: (int(order.split(",")[0]), order.split(",")[1]))

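# Order items keyed by order_id: (order_id, product_id)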
orderItems = sc.textFile(path + "/order_items").\
    map(lambda rec: (int(rec.split(",")[1]), int(rec.split(",")[2])))

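# Distinct product ids sold in 2013-12: join on order_id, keep the product_id field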
distinctProducts201312 = orders201312.\
    join(orderItems).\
    map(lambda order: order[1][1]).\
    distinct()

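# Orders placed in 2014-01, keyed by order_id: (order_id, order_date)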
orders201401 = sc.textFile(path + "/orders").\
    filter(lambda order: "2014-01" in order.split(",")[1]).\
    map(lambda order: (int(order.split(",")[0]), order.split(",")[1]))

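# Product ids sold in 2013-12 (duplicates retained)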
products201312 = orders201312.\
    join(orderItems).\
    map(lambda order: order[1][1])

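# Product ids sold in 2014-01 (duplicates retained)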
products201401 = orders201401.\
    join(orderItems).\
    map(lambda order: order[1][1])

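# union() keeps duplicates: total count of product sale records across both months
# union().distinct(): products sold in either month
# intersection() returns only distinct elements: products sold in both months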
products201312.union(products201401).count()
products201312.union(products201401).distinct().count()

products201312.intersection(products201401).count()