pivot / crosstab / groupby

11/8/2019 - 3:10 PM

pivot / crosstab / groupby

import pandas as pd
import numpy as np

# Load the sales-funnel workbook (path is relative to the working directory).
df = pd.read_excel("../data/sales-funnel.xlsx")

# Sum and mean of Price and Quantity for every Manager / Rep / Product
# combination; empty cells in the result are filled with 0.
# The aggregations are named by string instead of passing np.sum / np.mean:
# pandas >= 2.1 emits a FutureWarning for the NumPy callables, while the
# string aliases produce the same "sum" / "mean" column labels.
table = pd.pivot_table(
    df,
    index=["Manager", "Rep", "Product"],
    values=["Price", "Quantity"],
    aggfunc=["sum", "mean"],
    fill_value=0,
)

GIST-pivot_crosstab.py

import pandas as pd
import seaborn as sns

# The raw file ships without a header row, so the column names are supplied here.
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# Download the CSV; every "?" entry is read as NaN.
df_raw = pd.read_csv(
    "http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None,
    names=headers,
    na_values="?",
)

# The eight makes (manufacturers) this analysis is restricted to.
models = [
    "toyota", "nissan", "mazda", "honda",
    "mitsubishi", "subaru", "volkswagen", "volvo",
]

# Work on an independent copy holding only the rows for those makes.
df = df_raw.loc[df_raw["make"].isin(models)].copy()

# Count of cars for each make / body style pair.
pd.crosstab(index=df["make"], columns=df["body_style"])
# Alternative formulations kept from the original notes:
# df.groupby(['make', 'body_style'])['body_style'].count().unstack().fillna(0)
# df.pivot_table(index='make', columns='body_style', aggfunc={'body_style':len}, fill_value=0)

# Count by number of doors, with row/column totals labelled "Total".
pd.crosstab(
    index=df["make"],
    columns=df["num_doors"],
    margins=True,
    margins_name="Total",
)

# Mean curb weight per make / body style, rounded to zero decimals.
pd.crosstab(
    index=df["make"],
    columns=df["body_style"],
    values=df["curb_weight"],
    aggfunc="mean",
).round(0)

# Counts normalized over the whole table.
pd.crosstab(index=df["make"], columns=df["body_style"], normalize=True)

# Counts normalized within each column (body style).
pd.crosstab(index=df["make"], columns=df["body_style"], normalize="columns")

# Counts normalized within each row (make).
pd.crosstab(index=df["make"], columns=df["body_style"], normalize="index")

pivot_crosstab_group.py

import pandas as pd
import seaborn as sns

# Column names for the data set; the file itself has no header row.
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# Fetch the CSV, treating "?" as a missing value (NaN).
df_raw = pd.read_csv(
    "http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None,
    names=headers,
    na_values="?",
)

# Makes (manufacturers) to keep for the review.
models = [
    "toyota", "nissan", "mazda", "honda",
    "mitsubishi", "subaru", "volkswagen", "volvo",
]

# Keep only the rows for the eight makes above, as a separate copy.
df = df_raw.loc[df_raw["make"].isin(models)].copy()

# Baseline: count of cars for each make / body style pair.
pd.crosstab(index=df["make"], columns=df["body_style"])
# Alternative formulations kept from the original notes:
# df.groupby(['make', 'body_style'])['body_style'].count().unstack().fillna(0)
# df.pivot_table(index='make', columns='body_style', aggfunc={'body_style':len}, fill_value=0)

# Two-level columns: body style, then drive wheels.
pd.crosstab(index=df["make"], columns=[df["body_style"], df["drive_wheels"]])

# Two-level rows and columns with custom axis labels; dropna=False is passed
# through to pandas unchanged.
pd.crosstab(
    index=[df["make"], df["num_doors"]],
    columns=[df["body_style"], df["drive_wheels"]],
    rownames=["Auto Manufacturer", "Doors"],
    colnames=["Body Style", "Drive Type"],
    dropna=False,
)

pivot_crosstab_group_visualizing.py

import pandas as pd
import seaborn as sns

# Names to assign to the columns, since the source file has no header row.
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# Read the remote CSV; "?" placeholders become NaN.
df_raw = pd.read_csv(
    "http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None,
    names=headers,
    na_values="?",
)

# The makes (manufacturers) included in the plot.
models = [
    "toyota", "nissan", "mazda", "honda",
    "mitsubishi", "subaru", "volkswagen", "volvo",
]

# Independent copy restricted to the eight makes listed above.
df = df_raw.loc[df_raw["make"].isin(models)].copy()

# Visualizing: render the (make, doors) x (body style, drive wheels) counts
# as an annotated heatmap without a colour bar.
sns.heatmap(
    pd.crosstab(
        index=[df["make"], df["num_doors"]],
        columns=[df["body_style"], df["drive_wheels"]],
    ),
    cmap="YlGnBu",
    annot=True,
    cbar=False,
)

pivot_table.py

import pandas as pd
import numpy as np

# Load the sample sales data (path is relative to the working directory).
SALES = pd.read_csv("../data/sample-sales.csv")

# The three statements below build the same report in increasing detail; each
# one overwrites `report`, so only the last version is kept.
# "sum" is passed as a string rather than np.sum: pandas >= 2.1 emits a
# FutureWarning for the NumPy callable, and the string alias gives the same
# result.

# 1) Total quantity per account, one column per category.
report = SALES.pivot_table(
    values=["quantity"],
    index=["Account Name"],
    columns=["category"],
    aggfunc="sum",
)

# 2) Same table, with missing account/category cells filled with 0.
report = SALES.pivot_table(
    values=["quantity"],
    index=["Account Name"],
    columns=["category"],
    fill_value=0,
    aggfunc="sum",
)

# 3) Add the extended price alongside the quantity.
report = SALES.pivot_table(
    values=["ext price", "quantity"],
    index=["Account Name"],
    columns=["category"],
    fill_value=0,
    aggfunc="sum",
)

pivot_the_data_basic.py

import pandas as pd
import numpy as np
# Load the sales-funnel workbook (path is relative to the working directory).
df = pd.read_excel("../data/sales-funnel.xlsx")

# Make Status a categorical with an explicit category order.
df["Status"] = df["Status"].astype("category")
# The `inplace=True` form of set_categories was removed in pandas 2.0 and
# raises a TypeError there, so the re-categorised column is assigned back.
df["Status"] = df["Status"].cat.set_categories(
    ["won", "pending", "presented", "declined"]
)
df.head()

# Basic pivot tables, built up step by step.  With no aggfunc given,
# pivot_table uses its default aggregation.
# Aggregations are named by string ("sum", "mean") rather than np.sum /
# np.mean: pandas >= 2.1 emits a FutureWarning for the NumPy callables, and
# the string aliases produce the same column labels.

# Group by a single column.
pd.pivot_table(df, index=["Name"])

# Group by several columns (multi-level index).
pd.pivot_table(df, index=["Name", "Rep", "Manager"])
pd.pivot_table(df, index=["Manager", "Rep"])

# Restrict the aggregated values to Price.
pd.pivot_table(df, index=["Manager", "Rep"], values=["Price"])

# Sum of Price instead of the default aggregation.
pd.pivot_table(df, index=["Manager", "Rep"], values=["Price"], aggfunc="sum")

# Several aggregations at once: mean Price and number of rows (len) per group.
pd.pivot_table(
    df,
    index=["Manager", "Rep"],
    values=["Price"],
    aggfunc=["mean", len],
)

pivot_the_data_intermediate.py

import pandas as pd
import numpy as np
# Load the sales-funnel workbook (path is relative to the working directory).
df = pd.read_excel("../data/sales-funnel.xlsx")

# Convert Status to a categorical and fix the order of its categories.
df["Status"] = df["Status"].astype("category")
# set_categories(..., inplace=True) is not accepted by pandas >= 2.0
# (TypeError), so the returned Series is assigned back to the column.
df["Status"] = df["Status"].cat.set_categories(
    ["won", "pending", "presented", "declined"]
)
df.head()

# Intermediate pivot tables: columns, fill values and margins.
# Aggregations are named by string ("sum", "mean") rather than np.sum /
# np.mean: pandas >= 2.1 emits a FutureWarning for the NumPy callables, and
# the string aliases produce the same column labels.

# Sum of Price per Manager/Rep, spread across one column per Product.
pd.pivot_table(
    df,
    index=["Manager", "Rep"],
    values=["Price"],
    columns=["Product"],
    aggfunc=["sum"],
)

# Add Quantity and fill missing cells with 0.
pd.pivot_table(
    df,
    index=["Manager", "Rep"],
    values=["Price", "Quantity"],
    columns=["Product"],
    aggfunc=["sum"],
    fill_value=0,
)

# Move Product from the columns into the row index.
pd.pivot_table(
    df,
    index=["Manager", "Rep", "Product"],
    values=["Price", "Quantity"],
    aggfunc=["sum"],
    fill_value=0,
)

# Sum and mean side by side, with margins (totals) enabled.
pd.pivot_table(
    df,
    index=["Manager", "Rep", "Product"],
    values=["Price", "Quantity"],
    aggfunc=["sum", "mean"],
    fill_value=0,
    margins=True,
)

# Sum of Price per Manager and Status, with margins (totals) enabled.
pd.pivot_table(
    df,
    index=["Manager", "Status"],
    values=["Price"],
    aggfunc=["sum"],
    fill_value=0,
    margins=True,
)

pivot_the_data_advanced.py

import pandas as pd
import numpy as np
# Load the sales-funnel workbook (path is relative to the working directory).
df = pd.read_excel("../data/sales-funnel.xlsx")

# Per-column aggregations: the number of rows (len) for Quantity, and the sum
# and mean for Price, by Manager/Status with one column group per Product.
# "sum" / "mean" are passed as strings rather than np.sum / np.mean: pandas
# >= 2.1 emits a FutureWarning for the NumPy callables, and the string
# aliases produce the same column labels.
table = pd.pivot_table(
    df,
    index=["Manager", "Status"],
    columns=["Product"],
    values=["Quantity", "Price"],
    aggfunc={"Quantity": len, "Price": ["sum", "mean"]},
    fill_value=0,
)
table

# Filter the pivot table by index level: a single manager ...
table.query('Manager == ["Debra Henley"]')

# ... or a subset of statuses.
table.query('Status == ["pending","won"]')

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

pivot / crosstab / groupby