# Author: dgadiraju
# Created: 7/27/2017 - 12:23 AM
#
# File: python_dataframes_csv.py

import pandas as pd

# Default input locations (header-less CSV part files of the retail_db dataset).
ORDERS_PATH = '/Users/itversity/Research/data/retail_db/orders/part-00000'
ORDER_ITEMS_PATH = '/Users/itversity/Research/data/retail_db/order_items/part-00000'

# Column names assigned to the header-less files, in file order.
ORDER_COLUMNS = ['order_id', 'order_date', 'order_customer_id', 'order_status']
ORDER_ITEM_COLUMNS = ['order_item_id', 'order_item_order_id', 'order_item_product_id',
                      'order_item_quantity', 'order_item_subtotal', 'order_item_product_price']

# Statuses selected before revenue is aggregated.
FULFILLED_STATUSES = ('COMPLETE', 'CLOSED')


def load_orders(path=ORDERS_PATH) -> pd.DataFrame:
    """Read the orders file into a DataFrame indexed by ``order_id``.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like
            object). The data must have no header row; column names are
            taken from ``ORDER_COLUMNS``.

    Returns:
        DataFrame with columns ``order_date``, ``order_customer_id`` and
        ``order_status``. ``order_date`` is not parsed and keeps the dtype
        ``read_csv`` infers for it.
    """
    return pd.read_csv(path, names=ORDER_COLUMNS, index_col='order_id')


def load_order_items(path=ORDER_ITEMS_PATH) -> pd.DataFrame:
    """Read the order-items file into a DataFrame indexed by ``order_item_order_id``.

    Indexing by the owning order's id lets the result be joined directly
    against an orders frame indexed by ``order_id``.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like
            object). The data must have no header row; column names are
            taken from ``ORDER_ITEM_COLUMNS``.

    Returns:
        DataFrame holding the remaining order-item columns.
    """
    return pd.read_csv(path, names=ORDER_ITEM_COLUMNS,
                       index_col='order_item_order_id')


def count_orders_by_status(orders: pd.DataFrame) -> pd.Series:
    """Return the number of orders per ``order_status`` value."""
    return orders.groupby('order_status')['order_status'].count()


def filter_orders_by_status(orders: pd.DataFrame,
                            statuses=FULFILLED_STATUSES) -> pd.DataFrame:
    """Return the rows of *orders* whose ``order_status`` is one of *statuses*.

    Args:
        orders: Frame with an ``order_status`` column.
        statuses: Iterable of status strings to keep; defaults to
            ``COMPLETE`` and ``CLOSED``.
    """
    return orders.loc[orders['order_status'].isin(list(statuses))]


def revenue_by_date(orders: pd.DataFrame, order_items: pd.DataFrame) -> pd.Series:
    """Sum ``order_item_subtotal`` per ``order_date``.

    The two frames are joined index-to-index (order id on both sides).
    ``DataFrame.join`` is a left join, so an order without any items still
    contributes its date; its missing subtotal is skipped by ``sum``.

    Args:
        orders: Frame indexed by order id with an ``order_date`` column.
        order_items: Frame indexed by the owning order id with an
            ``order_item_subtotal`` column.

    Returns:
        Series indexed by ``order_date`` holding the summed subtotals.
    """
    joined = orders.join(order_items)
    return joined.groupby('order_date')['order_item_subtotal'].sum()


if __name__ == '__main__':
    # Names are assigned at module scope (not inside a main() function) so an
    # interactive run (`python -i`) still exposes orders / order_items /
    # ordersCompleted exactly as the original script did.
    orders = load_orders()
    # Print explicitly: a bare expression's value is discarded outside a REPL.
    print(count_orders_by_status(orders))

    order_items = load_order_items()

    ordersCompleted = filter_orders_by_status(orders)
    print(revenue_by_date(ordersCompleted, order_items))