dgadiraju
7/27/2017 - 12:35 AM

python_dataframes_sql.py

import pandas as pd
import pandasql as pdsql

pysql = lambda q: pdsql.sqldf(q, globals())

orders = pd.read_csv('/Users/itversity/Research/data/retail_db/orders/part-00000', 
            names=['order_id', 'order_date', 'order_customer_id', 'order_status'],
            index_col='order_id')

order_items = pd.read_csv('/Users/itversity/Research/data/retail_db/order_items/part-00000', 
            names=['order_item_id', 'order_item_order_id', 'order_item_product_id',
                   'order_item_quantity', 'order_item_subtotal', 'order_item_product_price'],
index_col='order_item_order_id')

revenue_per_day_sql = """select o.order_date, sum(oi.order_item_subtotal) daily_revenue 
from orders o join order_items oi on o.order_id = oi.order_item_order_id
where o.order_status in ('COMPLETE', 'CLOSED')
group by o.order_date
order by o.order_date"""

revenue_per_day = pysql(revenue_per_day_sql)