# B-073 Tips for Selecting Columns in a DataFrame
import pandas as pd
import numpy as np
df = pd.read_csv(
'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true')
np.r_[0:3,15:19,24,25]
array([ 0, 1, 2, 15, 16, 17, 18, 24, 25])
# We can pass the output of np.r_ to .iloc to use multiple selection approaches
df.iloc[:, np.r_[0:3,15:19,24,25]]
# https://nbviewer.jupyter.org/github/chris1610/pbpython/blob/master/notebooks/Selecting_Columns_in_DataFrame.ipynb
import pandas as pd
import numpy as np
df = pd.read_excel("../data/sample-salesv3.xlsx")
df['date'] = pd.to_datetime(df['date'])
df.head()
df[df["account number"]==307599].head()
df[df["quantity"] > 22].head()
df[df["sku"].map(lambda x: x.startswith('B1'))].head()
df[df["sku"].map(lambda x: x.startswith('B1')) & (df["quantity"] > 22)].head()
df[df["account number"].isin([714466,218895])].head()
import pandas as pd
import numpy as np
# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
"num_doors", "body_style", "drive_wheels", "engine_location",
"wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system",
"bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
"city_mpg", "highway_mpg", "price"]
# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
header=None, names=headers, na_values="?" )
int_df = df.select_dtypes(include=['int']).copy()
int_df.head()
float_df = df.select_dtypes(include=['float']).copy()
float_df.head()
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()
import pandas as pd
import numpy as np
df = pd.read_excel("../data/sample-salesv3.xlsx")
df[df['sku'].str.contains('B1')].head()
df[(df['sku'].str.contains('B1-531')) & (df['quantity']>40)].sort_values(by=['quantity','name'],ascending=[0,1])
# B-073 Tips for Selecting Columns in a DataFrame
import pandas as pd
import numpy as np
df = pd.read_csv(
'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true')
We can also select columns using a boolean array
run_cols = df.columns.str.contains('run', case=False)
run_cols
array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, True, False, False,
False, False, False, False, False, False, False, False, False,
False, True, False, False, False, False, False, False, False])
df.iloc[:, run_cols].head()
Running Runs from
0 False False
1 True True
2 False False
3 False True
4 False False
import pandas as pd
import numpy as np
df = pd.read_excel("../data/sales-funnel.xlsx")
df.query('name == ["Kulas Inc","Barton LLC"]').head()
table = pd.pivot_table(df,index=["Manager","Status"],columns=["Product"],values=["Quantity","Price"],
aggfunc={"Quantity":len,"Price":[np.sum,np.mean]},fill_value=0)
table
table.query('Manager == ["Debra Henley"]')
table.query('Status == ["pending","won"]')
import pandas as pd
import numpy as np
data = {'account': ['Jones LLC', 'Alpha Co', 'Blue Inc', 'Mega Corp'],
'Total Sales': [150, 200, 75, 300],
'Country': ['US', 'UK', 'US', 'US']}
df = pd.DataFrame.from_dict(data)
df.loc[[True, True, False, True], "account"]
df.loc[[True, True, False, True], ["account", "Country"]]
df.loc[df["Total Sales"] > 200, ["account", "Country"]]
# Edit Columns
df["rate"] = 0.02
df
df.loc[df["Total Sales"] > 100, ["rate"]] = .05
df
# B-073 Tips for Selecting Columns in a DataFrame
import pandas as pd
import numpy as np
df = pd.read_csv(
'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true')
A lambda function can be useful for combining into 1 line.
df.iloc[:, lambda df:df.columns.str.contains('run', case=False)].head()
Running Runs from
0 False False
1 True True
2 False False
3 False True
4 False False
A more complex example
df.iloc[:, lambda df: df.columns.str.contains('district|precinct|boundaries',
case=False)].head()
Community Districts Borough Boundaries City Council Districts Police Precincts
0 19 4 19 13
1 19 4 19 13
2 19 4 19 13
3 19 4 19 13
4 19 4 19 13
# https://nbviewer.jupyter.org/github/chris1610/pbpython/blob/master/notebooks/Selecting_Columns_in_DataFrame.ipynb
# B-073 Tips for Selecting Columns in a DataFrame
import pandas as pd
import numpy as np
df = pd.read_csv(
'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true')
A lambda function can be useful for combining into 1 line.
df.iloc[:, lambda df:df.columns.str.contains('run', case=False)].head()
Running Runs from
0 False False
1 True True
2 False False
3 False True
4 False False
# A more complex example
df.iloc[:, lambda df: df.columns.str.contains('district|precinct|boundaries',
case=False)].head()
Community Districts Borough Boundaries City Council Districts Police Precincts
0 19 4 19 13
1 19 4 19 13
2 19 4 19 13
3 19 4 19 13
4 19 4 19 13
# Combining index and boolean arrays
location_cols = df.columns.str.contains('district|precinct|boundaries',
case=False)
location_cols
array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False, True, True, True, True])
location_indices = [i for i, col in enumerate(location_cols) if col]
location_indices
[32, 33, 34, 35]
df.iloc[:, np.r_[0:3,location_indices]].head()
X Y Unique Squirrel ID Community Districts Borough Boundaries City Council Districts Police Precincts
0 -73.956134 40.794082 37F-PM-1014-03 19 4 19 13
1 -73.957044 40.794851 37E-PM-1006-03 19 4 19 13
2 -73.976831 40.766718 2E-AM-1010-03 19 4 19 13
3 -73.975725 40.769703 5D-PM-1018-05 19 4 19 13
4 -73.959313 40.797533 39B-AM-1018-01 19 4 19 13