# dgadiraju
# 11/8/2017 - 3:05 PM
#
# python-get-count-by-date-mapreduce.py

import itertools as it
# Location of the orders data file; each line is read as one
# comma-separated record.
path = "/Users/itversity/Research/data/retail_db/orders/part-00000"

# Load every line into memory. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(path) as ordersFile:
    orders = ordersFile.read().splitlines()

# "Map" step: lazily emit a (second CSV field, 1) pair per record.
# NOTE(review): the second field is presumably the order date -- confirm
# against the data set's schema.
# A generator expression is used instead of itertools.imap, which only
# exists on Python 2; it is just as lazy and runs on Python 2 and 3.
ordersMap = ((o.split(",")[1], 1) for o in orders)

# "Shuffle" step: groupby only merges *adjacent* items with equal keys, so
# the pairs are sorted first to make records with the same key contiguous.
ordersGroupBy = it.groupby(sorted(ordersMap), lambda k: k[0])

def getCount(l):
    """Return the total of the second element of every item in ``l``.

    Args:
        l: Iterable of indexable items, e.g. the ``(key, 1)`` tuples of an
            ``itertools.groupby`` group. Only index 1 of each item is read.

    Returns:
        The sum of the values at index 1, or ``0`` when ``l`` is empty.
    """
    # sum() replaces the hand-rolled map + reduce: it returns 0 for an empty
    # iterable (reduce without an initial value raises TypeError) and needs
    # no import on Python 3, where reduce is no longer a builtin.
    return sum(pair[1] for pair in l)

# "Reduce" step: lazily collapse each (key, group) produced by groupby into
# a (key, count) tuple. getCount() fully consumes a group before the
# generator asks groupby for the next one, which matters because advancing
# groupby invalidates the previous group's iterator.
# A generator expression is used instead of itertools.imap, which only
# exists on Python 2; it is just as lazy and runs on Python 2 and 3.
orderCountByDate = ((key, getCount(group)) for key, group in ordersGroupBy)

# Print the (key, count) tuples in ascending order.
for i in sorted(orderCountByDate):
    print(i)