banchan86
5/20/2019 - 12:26 AM

Panda operations

//panda selecting rows vs selecting columns
// in pandas you can select rows by using the loc or iloc function

import pandas as pd

# One pandas Series per purchase record; dict keys become the index labels.
purchase_1 = pd.Series({'Name': 'Chris', 'Item Purchased': 'Dog Food', 'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod', 'Item Purchased': 'Bird Seed', 'Cost': 5.00})

# Stack the records row-wise; note the deliberately duplicated 'Store 1' label.
df = pd.DataFrame(
    [purchase_1, purchase_2, purchase_3],
    index=['Store 1', 'Store 1', 'Store 2'],
)
df.head()

df.loc['Item Purchased']

//for selecting columns you can use the label directly

df['Item Purchased']

//to select a particular row and column, chain them together

df.loc['Store 1', 'Cost']

//.loc supports slicing, in this case it selects all the rows and returns the name and cost column

df.loc[:,['Name', 'Cost']]

//chaining in pandas can lead to confusing results as it returns a dataframe instead of a view
//for instance below. avoid it if you can
df.loc['Store 1']['Cost']

//df.drop lets you drop data (but also be aware that it only returns a copy, even though it can be used with a parameter in place)
//deleting data using the del statement removes the column in place
# copy_df was never created in these notes; make an explicit copy first so
# `del` removes the column from the copy, not from the original df.
copy_df = df.copy()
del copy_df['Name']
copy_df

//finally adding a new column is as easy as broadcasting a new column name
df['Location'] = None

//AGAIN its very important that you pay attention to which functions will modify the dataframe in place for instance the below commands will modify the dataframe in place
costs = df['Cost']
costs+=2
df

// when reading csv you can indicate which row or column you want to use as index or skip. for instance the command below chooses the first column as index and
//skips the first row to use the second row as the column name
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)

//renaming columns
# Olympic medal columns come in as '01 !', '02 !', '03 !'; map the numeric
# prefix to the medal name while keeping the rest of the label intact.
medal_by_prefix = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
for col in df.columns:
    medal = medal_by_prefix.get(col[:2])
    if medal is not None:
        df.rename(columns={col: medal + col[4:]}, inplace=True)
    # '№ Summer' etc. become '# Summer'.
    if col[:1] == '№':
        df.rename(columns={col: '#' + col[1:]}, inplace=True)

df.head()
//querying data frames using boolean masking and the "where" method

only_gold = df.where(df['Gold'] > 0)
only_gold.head()

//note this returns a dataframe with the original index, where all the values for the rows that do not qualify are NaN
//you might wanna trim those columns in which case you call dropna
only_gold = only_gold.dropna()
only_gold.head()

// a shortcut to this without using dropna
only_gold = df[df['Gold'] > 0]
only_gold.head()

// boolean masks can be combined to have many conditions
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]

// using .values returns only the column items without the relevant index
df[df['Cost']>3.00]['Name'].values

//alternatively you can use this
df['Name'][df['Cost']>3]

//setting a new index is destructive so you have to copy the current index to another column if you want to preserve it
df['country'] = df.index
df = df.set_index('Gold')
df.head()

//you can reset the index and create a new default index using reset index
df = df.reset_index()
df.head()

//pandas can create multiple indexes. useful for geographic datasets (where one wants to divide by country then state)
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()

//also you can filter by unique values
df['SUMLEV'].unique()
//to look at the values and to select only a subset of values (in this case only the county level data)
df=df[df['SUMLEV'] == 50]
df.head()

//you can set a list of columns to keep
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()

//filling values using fillna. if the data is sorted you can fill the values in forward or backward with the prev value. however make sure the data is sorted. 
//this is useful for filling in missing values from time series data (for instance with timestamps)
df = df.set_index('time')
df = df.sort_index()
df = df.fillna(method='ffill')
df.fillna

//groupby - is another way to set multilevel indexes. be warned though it returns a groupby object not a dataframe but its useful for computing quick stats.
//the code below returns the state with the most counties
census_df2.groupby('STNAME')['CTYNAME'].nunique().idxmax()

//split-apply-combine. if you want to select only a subset of examples within a group, first you apply a groupby.object. then you apply a function to select the data
//this function groups by states, that sorts counties by county population then selects only the first 3 values
def top(df, n=3, column='CENSUS2010POP'):
    """Return the n rows of df with the largest values in `column`,
    sorted by that column in descending order.

    Bug fix: the slice was hard-coded to `[:3]`, silently ignoring the
    `n` parameter; it now honors n (default 3, so existing calls are
    unchanged).
    """
    return df.sort_values(by=column, ascending=False)[:n]
def answer_six():
    """Return the three state names whose top-3 most-populous counties
    have the largest combined CENSUS2010POP (assumes module-level
    census_df and the top() helper)."""
    top3_per_state = census_df.groupby('STNAME').apply(top)
    state_totals = top3_per_state.groupby('STNAME').sum()
    ranked = state_totals.sort_values(by='CENSUS2010POP', ascending=False)
    return list(ranked.index[:3])
answer_six()

//this function computes the max diff among a column and returns the index of that diff
def answer_seven():
    """Return the CTYNAME with the largest spread (max - min) across the
    POPESTIMATE2010..2015 columns of the module-level census_df.

    Fixes two defects in the original:
    - it assigned new columns into a slice of census_df, triggering
      SettingWithCopyWarning and risking a silent no-op; we now work on
      an explicit .copy().
    - row-wise max/min ran over a frame that included the string column
      CTYNAME (and, for MIN, the freshly added MAX column); they are now
      restricted to the numeric estimate columns only.
    """
    pop_cols = ['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
                'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015']
    census_df_pop_slice = census_df[['CTYNAME'] + pop_cols].copy()
    census_df_pop_slice['MAX'] = census_df_pop_slice[pop_cols].max(axis=1)
    census_df_pop_slice['MIN'] = census_df_pop_slice[pop_cols].min(axis=1)
    census_df_pop_slice['DIFF'] = census_df_pop_slice['MAX'] - census_df_pop_slice['MIN']
    census_df_pop_slice.set_index('CTYNAME', inplace=True)
    return census_df_pop_slice['DIFF'].idxmax()
answer_seven()

//boolean filters to slice dataframes (this could be theoretically shortened)
//by using data[data['three'] > 5]
def answer_eight():
    """From the module-level census_df: counties whose CTYNAME contains
    'Washington', in REGION 1 or 2, whose 2015 population estimate grew
    over 2014. Returns just their STNAME and CTYNAME columns."""
    in_region = census_df[census_df["REGION"] < 3]
    grew = in_region[in_region["POPESTIMATE2015"] > in_region["POPESTIMATE2014"]]
    named_washington = grew[grew["CTYNAME"].str.contains("Washington")]
    return named_washington[["STNAME", 'CTYNAME']]
answer_eight()

//method_chaining improves how pandorable the code is by reducing the need to create a dataframe for the in-between steps (as seen above)
print(df.drop(df[df['Quantity'] == 0].index)
  .rename(columns={'Weight': 'Weight (oz.)'}))

//more examples of using apply (to apply a function to every row, in this case to calculate the min and max of every value)
def min_max(row):
    """Given one census row (a Series), return a two-entry Series holding
    the smallest ('min') and largest ('max') of its POPESTIMATE2010..2015
    values."""
    estimate_cols = ['POPESTIMATE2010', 'POPESTIMATE2011',
                     'POPESTIMATE2012', 'POPESTIMATE2013',
                     'POPESTIMATE2014', 'POPESTIMATE2015']
    data = row[estimate_cols]
    return pd.Series({'min': data.min(), 'max': data.max()})
df.apply(min_max, axis=1)

//.agg aggregates the data in each group and allows you to do summary statistics
df.groupby('STNAME').agg({'CENSUS2010POP': np.average})

//another apply function
def totalweight(df, w, q):
    """Total shipped weight for a group: sum over rows of weight * quantity.

    w and q are the names of the weight and quantity columns of df.
    """
    per_row = df[w] * df[q]
    return per_row.sum()
        
print(df.groupby('Category').apply(totalweight, 'Weight (oz.)', 'Quantity'))

//order by category
s = pd.Series(['Low', 'Low', 'High', 'Medium', 'Low', 'High', 'Low'])

# `Series.astype('category', categories=..., ordered=True)` was removed in
# pandas 0.25; build an ordered CategoricalDtype explicitly instead.
s.astype(pd.CategoricalDtype(categories=['Low', 'Medium', 'High'], ordered=True))

s = pd.Series([168, 180, 174, 190, 170, 185, 179, 181, 175, 169, 182, 177, 180, 171])


pd.cut(s, 3)

# You can also add labels for the sizes [Small < Medium < Large].
pd.cut(s, 3, labels=['Small', 'Medium', 'Large'])

#Pivottables
pd.pivot_table(Bikes, index=['Manufacturer','Bike Type']))

#simple plotting
Top15.plot(x='Citable docs per Capita', y='Energy Supply per Capita', kind='scatter', xlim=[0, 0.0006])

#use dir(to check attributes of object)

#use apply or map to use a function on each value of a series 
PopEst = PopEst.apply(lambda x: "{:,}".format(x))