JosephZYU
11/9/2019 - 8:11 AM

percent / normalize

import pandas as pd
 
df = pd.read_excel("../data/sales_transactions.xlsx")
df
 
df.groupby('order')["ext price"].transform('sum')
 
df["Order_Total"] = df.groupby('order')["ext price"].transform('sum')
df["Percent_of_Order"] = df["ext price"] / df["Order_Total"]
df
"""
B-004 - Using Sets for Data Analysis
https://pbpython.com/data-analysis-with-sets.html
"""


A More Complex Problem: for loop to generate automatic report
import pandas as pd
df = pd.read_csv("../data/sample-sales.csv")
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
Account Number    1000 non-null int64
Account Name      1000 non-null object
sku               1000 non-null object
category          1000 non-null object
quantity          1000 non-null int64
unit price        1000 non-null float64
ext price         1000 non-null float64
date              1000 non-null object
dtypes: float64(2), int64(2), object(4)
memory usage: 62.6+ KB

shoes = set()
belts = set()
shirts = set()

Ref - How to iterate over rows in a DataFrame in Pandas? - https://stackoverflow.com/a/16476974

for index, row in df.iterrows():
        customer = (row['Account Number'],row['Account Name']) # by column name
        category = row['category']
        if category == "Shoes":
                shoes.add(customer)
        if category == "Belt":
                belts.add(customer)
        if category == "Shirt":
                shirts.add(customer)

for index, row in df.iterrows():
        customer = (row[0],row[1]) # by column number starting with 0
        category = row[3]
        if category == "Shoes":
                shoes.add(customer)
        if category == "Belt":
                belts.add(customer)
        if category == "Shirt":
                shirts.add(customer)

num_cus = df["Account Name"].nunique()
num_cus
718

Ref - Rounding a percentage in Python - https://stackoverflow.com/a/22250737
{0:.1f} means, the first parameter has to be formatted to be with 1 decimal digit

print("{0:.2f}% customers have purchased shoes".format(len(shoes)/num_cus * 100))
print("{0:.2f}% customers have purchased belts".format(len(belts)/num_cus * 100))
print("{0:.2f}% customers have purchased shoes but not belts".format(len(shoes - belts)/num_cus * 100))
print("{0:.2f}% customers have purchased shoes and belts".format(len(shoes & belts)/num_cus * 100))
print("{0:.2f}% customers have purchases shoes and shirts".format(len(shoes & shirts)/num_cus * 100))
print("{0:.2f}% customers have purchased shoes, belts and shirts".format(len(shoes & belts & shirts)/num_cus * 100))

37.74% customers have purchased shoes
23.68% customers have purchased belts
33.43% customers have purchased shoes but not belts
4.32% customers have purchased shoes and belts
11.00% customers have purchases shoes and shirts
1.25% customers have purchased shoes, belts and shirts
import pandas as pd

df = pd.read_excel("../data/sales_transactions.xlsx")
df

df.columns
df.info()

df.groupby('account')["ext price"].sum()
df.groupby('name')["ext price"].sum()
df.groupby('order')["ext price"].sum()

order_total = df.groupby('order')["ext price"].sum().rename("Order_Total").reset_index()
order_total

df_1 = df.merge(order_total)
df_1["Percent_of_Order"] = df_1["ext price"] / df_1["Order_Total"]
df_1
import pandas as pd
import seaborn as sns
 
# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]
 
# Read in the CSV file and convert "?" to NaN
df_raw = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                     header=None, names=headers, na_values="?" )
 
# Define a list of models that we want to review
models = ["toyota","nissan","mazda", "honda", "mitsubishi", "subaru", "volkswagen", "volvo"]
 
# Create a copy of the data with only the top 8 manufacturers
df = df_raw[df_raw.make.isin(models)].copy()
 
pd.crosstab(df.make, df.body_style)
# df.groupby(['make', 'body_style'])['body_style'].count().unstack().fillna(0)
# df.pivot_table(index='make', columns='body_style', aggfunc={'body_style':len}, fill_value=0)
 
pd.crosstab(df.make, df.num_doors, margins=True, margins_name="Total")
 
pd.crosstab(df.make, df.body_style, values=df.curb_weight, aggfunc='mean').round(0)
 
pd.crosstab(df.make, df.body_style, normalize=True)
 
pd.crosstab(df.make, df.body_style, normalize='columns')
 
pd.crosstab(df.make, df.body_style, normalize='index')