filter / select / contain

11/8/2019 - 3:31 PM

# B-073 Tips for Selecting Columns in a DataFrame

import pandas as pd
import numpy as np
df = pd.read_csv(
    'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true')
    
np.r_[0:3,15:19,24,25]
array([ 0,  1,  2, 15, 16, 17, 18, 24, 25])

# We can pass the output of np.r_ to .iloc to use multiple selection approaches

df.iloc[:, np.r_[0:3,15:19,24,25]]

# https://nbviewer.jupyter.org/github/chris1610/pbpython/blob/master/notebooks/Selecting_Columns_in_DataFrame.ipynb

filter_by_column.py

import pandas as pd
import numpy as np
 
df = pd.read_excel("../data/sample-salesv3.xlsx")
 
df['date'] = pd.to_datetime(df['date'])
df.head()

df[df["account number"]==307599].head()
df[df["quantity"] > 22].head()
df[df["sku"].map(lambda x: x.startswith('B1'))].head()
df[df["sku"].map(lambda x: x.startswith('B1')) & (df["quantity"] > 22)].head()
df[df["account number"].isin([714466,218895])].head()

filter_by_column_select_dtypes.py

import pandas as pd
import numpy as np

# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )

int_df = df.select_dtypes(include=['int']).copy()
int_df.head()

float_df = df.select_dtypes(include=['float']).copy()
float_df.head()

obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

filter_str.contains.py

import pandas as pd
import numpy as np
df = pd.read_excel("../data/sample-salesv3.xlsx")

df[df['sku'].str.contains('B1')].head()
df[(df['sku'].str.contains('B1-531')) & (df['quantity']>40)].sort_values(by=['quantity','name'],ascending=[0,1])

filter_str.contains_boolean_array.py

# B-073 Tips for Selecting Columns in a DataFrame
 
import pandas as pd
import numpy as np
df = pd.read_csv(
    'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true')

We can also select columns using a boolean array

run_cols = df.columns.str.contains('run', case=False)
run_cols
array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False])

df.iloc[:, run_cols].head()
Running	Runs from
0	False	False
1	True	True
2	False	False
3	False	True
4	False	False

filter_query.py

import pandas as pd
import numpy as np
df = pd.read_excel("../data/sales-funnel.xlsx")
df.query('name == ["Kulas Inc","Barton LLC"]').head()

table = pd.pivot_table(df,index=["Manager","Status"],columns=["Product"],values=["Quantity","Price"],
               aggfunc={"Quantity":len,"Price":[np.sum,np.mean]},fill_value=0)
table

table.query('Manager == ["Debra Henley"]')

table.query('Status == ["pending","won"]')

filter_loc.py

import pandas as pd
import numpy as np
data = {'account': ['Jones LLC', 'Alpha Co', 'Blue Inc', 'Mega Corp'], 
        'Total Sales': [150, 200, 75, 300], 
        'Country': ['US', 'UK', 'US', 'US']}
df = pd.DataFrame.from_dict(data)

df.loc[[True, True, False, True], "account"]
df.loc[[True, True, False, True], ["account", "Country"]]
df.loc[df["Total Sales"] > 200, ["account", "Country"]]

# Edit Columns
df["rate"] = 0.02
df
df.loc[df["Total Sales"] > 100, ["rate"]] = .05
df

filter_iloc_lambda.py

# B-073 Tips for Selecting Columns in a DataFrame

import pandas as pd
import numpy as np
df = pd.read_csv(
    'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true')
    
A lambda function can be useful for combining into 1 line.

df.iloc[:, lambda df:df.columns.str.contains('run', case=False)].head()
Running	Runs from
0	False	False
1	True	True
2	False	False
3	False	True
4	False	False
A more complex example

df.iloc[:, lambda df: df.columns.str.contains('district|precinct|boundaries',
                                              case=False)].head()
Community Districts	Borough Boundaries	City Council Districts	Police Precincts
0	19	4	19	13
1	19	4	19	13
2	19	4	19	13
3	19	4	19	13
4	19	4	19	13

# https://nbviewer.jupyter.org/github/chris1610/pbpython/blob/master/notebooks/Selecting_Columns_in_DataFrame.ipynb

filter_iloc_lambda_str.contains.py

# B-073 Tips for Selecting Columns in a DataFrame
 
import pandas as pd
import numpy as np
df = pd.read_csv(
    'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true')

A lambda function can be useful for combining into 1 line.

df.iloc[:, lambda df:df.columns.str.contains('run', case=False)].head()
Running	Runs from
0	False	False
1	True	True
2	False	False
3	False	True
4	False	False

# A more complex example

df.iloc[:, lambda df: df.columns.str.contains('district|precinct|boundaries',
                                              case=False)].head()
Community Districts	Borough Boundaries	City Council Districts	Police Precincts
0	19	4	19	13
1	19	4	19	13
2	19	4	19	13
3	19	4	19	13
4	19	4	19	13

# Combining index and boolean arrays

location_cols = df.columns.str.contains('district|precinct|boundaries',
                                        case=False)
location_cols
array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True,  True,  True])

location_indices = [i for i, col in enumerate(location_cols) if col]
location_indices
[32, 33, 34, 35]

df.iloc[:, np.r_[0:3,location_indices]].head()

X	Y	Unique Squirrel ID	Community Districts	Borough Boundaries	City Council Districts	Police Precincts
0	-73.956134	40.794082	37F-PM-1014-03	19	4	19	13
1	-73.957044	40.794851	37E-PM-1006-03	19	4	19	13
2	-73.976831	40.766718	2E-AM-1010-03	19	4	19	13
3	-73.975725	40.769703	5D-PM-1018-05	19	4	19	13
4	-73.959313	40.797533	39B-AM-1018-01	19	4	19	13

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

filter / select / contain