JosephZYU
11/8/2019 - 3:56 PM

category

import pandas as pd
import numpy as np
# Load the sales funnel workbook (path is relative to the working directory).
df = pd.read_excel("../data/sales-funnel.xlsx")

# Convert Status to a categorical and pin its category order explicitly.
# set_categories() returns a new Series; its ``inplace`` argument was
# deprecated in pandas 1.3 and removed in 2.0, so the result is assigned back
# instead of being mutated in place.
df["Status"] = df["Status"].astype("category")
df["Status"] = df["Status"].cat.set_categories(
    ["won", "pending", "presented", "declined"]
)
df.head()
"""
B-039 - Guide to Encoding Categorical Values in Python
https://pbpython.com/categorical-encoding.html
"""
import pandas as pd
import numpy as np

# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                 header=None, names=headers, na_values="?")

# Keep only the object-dtype columns; copy so later edits do not touch df
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

# Show the rows with at least one missing value, then fill the missing
# num_doors entries with "four"
obj_df[obj_df.isnull().any(axis=1)]
obj_df["num_doors"].value_counts()
obj_df = obj_df.fillna({"num_doors": "four"})

# Approach #1 - Find and Replace

obj_df["num_cylinders"].value_counts()

# Per-column mapping from the spelled-out number to its integer value
cleanup_nums = {"num_doors":     {"four": 4, "two": 2},
                "num_cylinders": {"four": 4, "six": 6, "five": 5, "eight": 8,
                                  "two": 2, "twelve": 12, "three": 3}}

# Use the returned frame rather than inplace=True, and convert the replaced
# columns to a numeric dtype explicitly: pandas 2.2 deprecates the implicit
# object-to-int downcast inside replace(), so without infer_objects() the
# dtypes check below would keep reporting "object" on future versions.
obj_df = obj_df.replace(cleanup_nums).infer_objects()
obj_df.head()

obj_df.dtypes
import pandas as pd
import numpy as np

# Column names for the source file, which ships without a header row
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# Load the data, treating "?" as a missing value
df = pd.read_csv(
    "http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
    names=headers,
    header=None,
    na_values="?",
)

# Work on an independent copy holding only the object-dtype columns
obj_df = df.select_dtypes(include="object").copy()
obj_df.head()

# Inspect rows that contain any missing value, then fill num_doors with "four"
obj_df.loc[obj_df.isnull().any(axis="columns")]
obj_df["num_doors"].value_counts()
obj_df = obj_df.fillna(value={"num_doors": "four"})

# Approach #2 - Label Encoding

# Switch body_style to the pandas category dtype
obj_df["body_style"] = obj_df["body_style"].astype("category")
obj_df.dtypes

# Store the integer code of each category in a new column
obj_df["body_style_cat"] = obj_df["body_style"].cat.codes
obj_df.head()
import pandas as pd
import numpy as np
 
# The raw file has no header row, so the column names are supplied here
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# Parse the CSV; every "?" entry is read as NaN
df = pd.read_csv(
    "http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None,
    na_values="?",
    names=headers,
)

# Copy out the object-dtype columns only
obj_df = df.select_dtypes(include=["object"]).copy()
obj_df.head()

# Look at rows with missing data, then replace missing num_doors with "four"
obj_df[obj_df.isna().any(axis=1)]
obj_df["num_doors"].value_counts()
obj_df = obj_df.fillna({"num_doors": "four"})

# Approach #3 - One Hot Encoding

# Expand drive_wheels into one indicator column per distinct value
pd.get_dummies(data=obj_df, columns=["drive_wheels"]).head()

# Expand two columns at once, giving each its own column-name prefix
pd.get_dummies(
    data=obj_df,
    prefix=["body", "drive"],
    columns=["body_style", "drive_wheels"],
).head()
import pandas as pd
import numpy as np
 
# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                 header=None, names=headers, na_values="?")

# Keep only the object-dtype columns; copy so later edits do not touch df
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

# Show the rows with at least one missing value, then fill the missing
# num_doors entries with "four"
obj_df[obj_df.isnull().any(axis=1)]
obj_df["num_doors"].value_counts()
obj_df = obj_df.fillna({"num_doors": "four"})

# Approach #4 - Custom Binary Encoding

obj_df["engine_type"].value_counts()

# Flag every engine type whose name contains "ohc" with 1, everything else 0.
# na=False makes a missing engine_type count as "no match"; without it
# str.contains() yields NaN for such rows, which np.where() treats as truthy
# and would therefore encode as 1.
obj_df["OHC_Code"] = np.where(
    obj_df["engine_type"].str.contains("ohc", na=False), 1, 0
)

obj_df[["make", "engine_type", "OHC_Code"]].head()
"""
B-039 - Guide to Encoding Categorical Values in Python
https://pbpython.com/categorical-encoding.html
"""
import pandas as pd
import numpy as np
 
# Column names for the source file, which ships without a header row
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# Load the data, treating "?" as a missing value
df = pd.read_csv(
    "http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
    na_values="?",
    names=headers,
    header=None,
)

# Work on an independent copy holding only the object-dtype columns
obj_df = df.select_dtypes(include="object").copy()
obj_df.head()

# Inspect rows that contain any missing value, then fill num_doors with "four"
obj_df.loc[obj_df.isnull().any(axis=1)]
obj_df["num_doors"].value_counts()
obj_df = obj_df.fillna(value={"num_doors": "four"})

# Scikit-Learn

from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Encode each make as an integer label
lb_make = LabelEncoder()
lb_make.fit(obj_df["make"])
obj_df["make_code"] = lb_make.transform(obj_df["make"])
obj_df[["make", "make_code"]].head(11)

# Binarize body_style into one indicator column per class
lb_style = LabelBinarizer()
lb_style.fit(obj_df["body_style"])
lb_results = lb_style.transform(obj_df["body_style"])
pd.DataFrame(data=lb_results, columns=lb_style.classes_).head()

# Advanced Approaches

!pip install category-encoders

import category_encoders as ce

# Start again from the object-dtype columns of df
# (the num_doors fill applied to the previous obj_df is not repeated here)
obj_df = df.select_dtypes(include=['object']).copy()

# Specify the columns to encode then fit and transform
encoder = ce.backward_difference.BackwardDifferenceEncoder(cols=["engine_type"])
encoder.fit(obj_df, verbose=1)

# Only display the first 7 columns (positions 0-6) for brevity
encoder.transform(obj_df).iloc[:,0:7].head()

# Same steps with a polynomial encoder on the same column
encoder = ce.polynomial.PolynomialEncoder(cols=["engine_type"])
encoder.fit(obj_df, verbose=1)
encoder.transform(obj_df).iloc[:,0:7].head()