# [pandas_groupby] #mean #std #first #mad #sum #groupby #aggregate #filter #mapping #apply #transform #reset_index
import numpy as np
import pandas as pd
import seaborn as sns
## df to be used in a number of the examples below
# load_dataset fetches seaborn's 'planets' example dataset (downloads on first call)
planets = sns.load_dataset('planets')
planets.shape  # (n_rows, n_columns)
planets.head()  # peek at the first rows
#############################
#    simple aggregation     #
#############################
## aggregation on a pandas Series
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser
# let's sort the series first (in place; does not affect the aggregates)
ser.sort_values(inplace=True)
ser.sum()   # sum of all elements
ser.mean()  # arithmetic mean
ser.std()   # sample standard deviation
## on DataFrames
rng = np.random.RandomState(42)
df = pd.DataFrame({'A': rng.rand(5),
                   'B': rng.rand(5)})
df.mean()                # per-column mean (aggregates down each column)
df.mean(axis='columns')  # per-row mean (was axis='columsn' — a typo that raises ValueError)
## aggregation functions
# count(), first(), last(), mean(), median(), std(), var(), mad(): mean abs deviation
# prod(), sum()
####################
#     groupby      #
####################
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])
# it is useful to sort values by 'key' and look at the input data
df.sort_values(by=['key'])  # sorting can be done on a list of columns
dfg = df.groupby('key')  # DataFrameGroupBy object (lazy; nothing computed yet)
dfg.sum()  # sum is applied per group
# shows the median of all numeric columns for each distinct 'method'
planets.groupby('method').median()
# for a particular column only
planets.groupby('method')['orbital_period'].median()
## see how many elements are in each grouped bucket
for (method, group) in planets.groupby('method'):
    # {0:30s} reserves 30 chars for the field (fixed: body was unindented -> IndentationError)
    print("{0:30s} shape={1}".format(method, group.shape))
gdf = planets.groupby('method')
# mapping of group label -> row labels
gdf.groups
gdf.size()  # counts per group
# describe() applied to each bucket gives a better understanding of the data
planets.groupby('method')['year'].describe()
# group by equal-frequency bins of 'year' (terciles)
year_terciles = pd.qcut(x=planets['year'],
                        q=3,
                        labels=['early', 'mid','recent'])
gdf = planets.groupby(year_terciles)
gdf.size()
# group by explicitly specified 'year' bin edges
year_bins = pd.cut(planets['year'],
                   [1970, 2000, 2005, 2011, 2018],
                   labels=['to2k', '0-5','5-11','to18'])
gdf = planets.groupby(year_bins)
gdf.size()
# --------------------------------
# grouping by multiple columns
# --------------------------------
# (divider lines were bare '----' runs, which are a SyntaxError in Python)
# apply mean() to all numeric columns outside the groupby criteria
planets.groupby(['method','year']).count()  # count elements in each bucket
planets.groupby(['method','year']).mean()
# --------------------------------
# groupby reset_index()
# --------------------------------
## put the bucket identifiers from the multi-index back into columns
planets.groupby(['method','year']).count().reset_index()
###########################
#    groupby NOTA BENE    #
###########################
# when doing groupby by multiple keys on multiple columns
# ALWAYS perform a .count() first
list_of_keys = ['method', 'year']  # example keys; was previously undefined (NameError)
planets.groupby(list_of_keys).count()
## to make sure your grouping buckets are what you think
########################
#  aggregate, filter,  #
#  transform, apply    #
########################
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                  columns=['key', 'data1', 'data2'])
### aggregate() allows applying multiple aggregation funcs to each bucket
# string names are preferred over builtins/numpy callables (the latter emit
# FutureWarning / are deprecated in modern pandas)
df.groupby('key').aggregate(['min', 'mean', 'median', 'std', 'max', 'sum', 'prod'])
# one can pass a dictionary mapping column names to the operation applied to that column
df.groupby('key').aggregate({'data1': 'min', 'data2': 'max'})
# filtering using custom logic in a lambda func
df.groupby('key').aggregate(['std'])
# this shows the original rows, but filtered by the grouped buckets, i.e. not key A
df.groupby('key').filter(lambda x: x['data2'].std() > 4)
### transformation, interesting for skew
df.sort_values(by='key')
df.groupby('key').mean()
# common example: center the data by subtracting the group-wise mean
df.groupby('key').transform(lambda x: x - x.mean())
# the apply method: applies an arbitrary function to the group results
# (was pp(...) — undefined name; print() is used instead)
print(df)
print(df.groupby('key').sum())
def norm_by_data2(x):
    """Divide each group's 'data1' by the group-wise sum of 'data2'."""
    # x is a DataFrame of the group's values
    # (fixed: the body was unindented -> IndentationError)
    x['data1'] /= x['data2'].sum()
    return x
df.groupby('key').apply(norm_by_data2)
## 'data2' unchanged
# 'data1 0.00 = 0/8 , 0.1428 = 1/7, 0.166=2/12 , 0.375 = 3/8 0.57 = 4/7
# specifying a split key: any list/array of the same length as the axis
L = [0, 1, 0, 1, 2, 0]
df
df.groupby(L).sum()
## dictionary or series mapping index values to group labels
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
df2.groupby(mapping).sum()
# any python func applied to the index values can serve as the grouping key
df2.groupby(str.lower).mean()
# this creates a grouped df with a multi-level row index (one level per key)
df2.groupby([str.lower, mapping]).mean()
###############################
# grouping example on planets #
###############################
# build a Series of decade labels aligned with planets' index
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'  # decade is a pandas Series
# discoveries per method and decade (compute once, reuse below)
per_decade = planets.groupby(['method', decade])['number'].sum()
per_decade
# pivot the decade level of the index into columns
per_decade.unstack()
# pivot, then fill the missing method/decade combinations with 0
per_decade.unstack().fillna(0)