# pandas: selecting rows vs selecting columns
# in pandas you select rows by label with .loc (or by position with .iloc)
import pandas as pd

purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3],
                  index=['Store 1', 'Store 1', 'Store 2'])
df.head()
# .loc selects rows by index label -- 'Item Purchased' is a column, not a row
# label, so the original df.loc['Item Purchased'] raised KeyError; use a row
# label ('Store 2') here, or df.T.loc['Item Purchased'] for the transposed view
df.loc['Store 2']
# for selecting columns you can use the column label directly
df['Item Purchased']
# to select a particular row and column, pass both labels to .loc
df.loc['Store 1', 'Cost']
# .loc supports slicing: here all rows, but only the Name and Cost columns
df.loc[:, ['Name', 'Cost']]
# chained selection returns a copy, not a view, which makes assignment through
# it unreliable and confusing -- avoid it if you can
df.loc['Store 1']['Cost']
# df.drop only returns a copy (unless called with inplace=True); work on an
# explicit copy here (copy_df was never defined in the original notes)
copy_df = df.copy()
# deleting a column with del modifies the frame in place
del copy_df['Name']
copy_df
# adding a new column is as easy as broadcasting a value to a new column name
df['Location'] = None
# AGAIN: pay attention to which operations modify the dataframe in place --
# the += below mutates the Cost data that df shares with costs
costs = df['Cost']
costs += 2
df
# when reading a csv you can indicate which row/column to use as the index or
# skip: index_col=0 uses the first column as the index, and skiprows=1 skips
# the first row so the second row supplies the column names
df = pd.read_csv('olympics.csv', index_col=0, skiprows=1)
# renaming columns: the olympics header encodes medals as '01 !', '02 !', ...
for col in df.columns:
    # the prefixes are mutually exclusive, so elif avoids re-testing col
    if col[:2] == '01':
        df.rename(columns={col: 'Gold' + col[4:]}, inplace=True)
    elif col[:2] == '02':
        df.rename(columns={col: 'Silver' + col[4:]}, inplace=True)
    elif col[:2] == '03':
        df.rename(columns={col: 'Bronze' + col[4:]}, inplace=True)
    elif col[:1] == '№':
        df.rename(columns={col: '#' + col[1:]}, inplace=True)
df.head()
# querying data frames using boolean masking and the "where" method
only_gold = df.where(df['Gold'] > 0)
only_gold.head()
# note: where() keeps the original index and replaces every value in the rows
# that do not qualify with NaN
# you might want to drop those rows, in which case call dropna
only_gold = only_gold.dropna()
only_gold.head()
# a shortcut for the above without needing dropna
only_gold = df[df['Gold'] > 0]
only_gold.head()
# boolean masks can be combined with & and | to express many conditions
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]
# using .values returns only the column items without the index
# NOTE(review): these two Cost/Name examples refer to the earlier store df,
# not the olympics df loaded above
df[df['Cost'] > 3.00]['Name'].values
# alternatively you can write the same selection this way
df['Name'][df['Cost'] > 3]
# setting a new index is destructive, so copy the current index into another
# column first if you want to preserve it
df['country'] = df.index
df = df.set_index('Gold')
df.head()
# reset_index discards the current index and creates a default integer index
df = df.reset_index()
df.head()
# pandas can create multi-level (hierarchical) indexes -- useful for
# geographic datasets (e.g. divide by state, then by county)
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()
# .unique() lists the distinct values of a column
df['SUMLEV'].unique()
# to select only a subset of values (in this case only the county-level data)
df = df[df['SUMLEV'] == 50]
df.head()
# you can restrict the frame to a list of columns to keep
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()
# filling missing values: if the data is sorted you can fill forward or
# backward from the neighbouring value -- make sure the data is sorted first.
# useful for filling gaps in time series data (e.g. timestamped readings)
# NOTE(review): this snippet assumes a frame with a 'time' column -- it is a
# separate example, not the census df above
df = df.set_index('time')
df = df.sort_index()
df = df.ffill()  # fillna(method='ffill') is deprecated in modern pandas
# groupby is another way to aggregate over (multi-level) keys. be warned: it
# returns a GroupBy object, not a DataFrame, but it is handy for quick stats.
# the line below returns the state with the most counties
census_df2.groupby('STNAME')['CTYNAME'].nunique().idxmax()
# split-apply-combine: to select a subset of rows within each group, first
# group the frame, then apply a selection function to each group's sub-frame.
# this helper sorts a frame by a column (descending) and keeps the first n rows
def top(df, n=3, column='CENSUS2010POP'):
    """Return the n rows of df with the largest values in `column`.

    Fixed: the original sliced [:3] regardless of n, silently ignoring the
    parameter; [:n] honours it (default n=3 preserves the old behaviour).
    """
    return df.sort_values(by=column, ascending=False)[:n]
def answer_six():
    """Return the three states whose three most populous counties sum largest.

    Groups census_df by state, keeps each state's top counties via top(),
    then ranks the states by the summed CENSUS2010POP of those counties.
    """
    top_counties = census_df.groupby('STNAME').apply(top)
    state_totals = (top_counties
                    .groupby('STNAME')
                    .sum()
                    .sort_values(by='CENSUS2010POP', ascending=False))
    return list(state_totals.index[:3])
answer_six()
# this function computes the largest spread across the population-estimate
# columns and returns the county name (index) where that spread occurs
def answer_seven():
    """Return the CTYNAME with the largest max-min difference across the
    POPESTIMATE2010..2015 columns of census_df."""
    pop_cols = ['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
                'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015']
    # .copy() so the column assignments below operate on an independent frame
    # instead of a slice view (avoids SettingWithCopyWarning)
    pop_slice = census_df[['CTYNAME'] + pop_cols].copy()
    # restrict max/min to the numeric estimate columns -- the original took
    # max/min over the whole row, which included the CTYNAME string column
    pop_slice['MAX'] = pop_slice[pop_cols].max(axis=1)
    pop_slice['MIN'] = pop_slice[pop_cols].min(axis=1)
    pop_slice['DIFF'] = pop_slice['MAX'] - pop_slice['MIN']
    pop_slice.set_index('CTYNAME', inplace=True)
    return pop_slice['DIFF'].idxmax()
answer_seven()
# boolean filters to slice dataframes (this could be shortened to a single
# expression: census_df[mask1 & mask2 & mask3])
def answer_eight():
    """Return STNAME/CTYNAME for counties containing 'Washington' in regions
    1-2 whose 2015 population estimate exceeds the 2014 estimate."""
    # region 1 or 2 only
    in_region = census_df[census_df["REGION"] < 3]
    # population grew from 2014 to 2015
    grew = in_region[in_region["POPESTIMATE2015"] > in_region["POPESTIMATE2014"]]
    # county name contains 'Washington' (substring match)
    washington = grew[grew["CTYNAME"].str.contains("Washington")]
    return washington[["STNAME", 'CTYNAME']]
answer_eight()
# method chaining: improves how "pandorable" the code is by avoiding the
# intermediate dataframes created for each in-between step (as seen above)
print(df.drop(df[df['Quantity'] == 0].index)
      .rename(columns={'Weight': 'Weight (oz.)'}))
# more examples of apply: run a function on every row, in this case computing
# the min and max over the population-estimate columns of each row
def min_max(row):
    """Return a Series with the min and max of the row's POPESTIMATE columns."""
    data = row[['POPESTIMATE2010',
                'POPESTIMATE2011',
                'POPESTIMATE2012',
                'POPESTIMATE2013',
                'POPESTIMATE2014',
                'POPESTIMATE2015']]
    # use the Series' own min/max: the original called np.min/np.max, but
    # numpy (np) is never imported anywhere in this file
    return pd.Series({'min': data.min(), 'max': data.max()})
df.apply(min_max, axis=1)
# .agg aggregates the data in each group and allows summary statistics;
# 'mean' instead of np.average because numpy is never imported in this file
df.groupby('STNAME').agg({'CENSUS2010POP': 'mean'})
# another groupby + apply example: total of weight x quantity per group
def totalweight(df, w, q):
    """Return the sum over rows of df[w] * df[q] (e.g. weight times quantity).

    Note: the builtin sum is kept deliberately -- unlike Series.sum() it
    propagates NaN rather than skipping it.
    """
    return sum(df[w] * df[q])
print(df.groupby('Category').apply(totalweight, 'Weight (oz.)', 'Quantity'))
# ordered categorical data
s = pd.Series(['Low', 'Low', 'High', 'Medium', 'Low', 'High', 'Low'])
# the astype('category', categories=..., ordered=True) signature was removed
# in pandas 0.25 -- pass an ordered CategoricalDtype instead
s.astype(pd.CategoricalDtype(categories=['Low', 'Medium', 'High'], ordered=True))
s = pd.Series([168, 180, 174, 190, 170, 185, 179, 181, 175, 169, 182, 177, 180, 171])
# pd.cut bins the values into 3 equal-width intervals
pd.cut(s, 3)
# You can also add labels for the sizes [Small < Medium < Large].
pd.cut(s, 3, labels=['Small', 'Medium', 'Large'])
# Pivot tables (the original line had an unmatched extra closing parenthesis)
pd.pivot_table(Bikes, index=['Manufacturer', 'Bike Type'])
# simple plotting
Top15.plot(x='Citable docs per Capita', y='Energy Supply per Capita', kind='scatter', xlim=[0, 0.0006])
# use dir() to check the attributes of an object
# use apply or map to run a function on each value of a series
PopEst = PopEst.apply(lambda x: "{:,}".format(x))