slzdevsnp
8/14/2019 - 3:25 PM

[pandas_groupby] #mean #std #first #mad #sum #groupby #aggregate #filter #mapping #apply #transform #reset_index

import numpy as np
import pandas as pd
import seaborn as sns

## df to be used in a number of examples below
planets = sns.load_dataset('planets')
planets.shape
planets.head()



#############################
#    simple aggregation     #
#############################

## aggregation on a pandas Series
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser

# let's sort the series first
ser.sort_values(inplace=True)

ser.sum()
ser.mean()
ser.std()

## on DataFrames
rng = np.random.RandomState(42)
df = pd.DataFrame({'A': rng.rand(5),
                   'B': rng.rand(5)})


df.mean()                # mean of each column (the default, axis='index')
df.mean(axis='columns')  # mean of each row

## aggregation functions
# count(), first(), last(), mean(), median(), std(), var(), mad() (mean abs deviation),
# prod(), sum()
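# a quick sketch (not in the original notes) exercising a few of these on the
# df defined above; note mad() was deprecated in pandas 1.5 and removed in 2.0,
# where (df - df.mean()).abs().mean() does the same job
df.count()  # non-null values per column
df.var()    # variance per column
df.prod()   # product per column
df.mad()    # mean absolute deviation (older pandas only)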


####################
#    groupby       #
####################

df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])

# it is useful to sort values by 'key' and have a look at the input data
df.sort_values(by=['key']) #sorting can be done on a list of columns

dfg = df.groupby('key')  # a DataFrameGroupBy object
dfg.sum()  # sum is applied per group

# shows the median of all numeric columns for each distinct 'method'
planets.groupby('method').median()
#for a particular column
planets.groupby('method')['orbital_period'].median()

## see how many elements are in each grouped bucket
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))  # {0:30s} reserves 30 chars for the field


gdf = planets.groupby('method')
# get the groups: a dict-like mapping of group label -> row labels
gdf.groups
gdf.size()  # counts per group
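# not in the original notes: get_group() pulls a single bucket out as a plain
# DataFrame; 'Transit' is one of the 'method' values in this dataset
gdf.get_group('Transit').head()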

# describe() applied to each bucket gives
# a better understanding of the data
planets.groupby('method')['year'].describe()

# group by quantile bins (pd.qcut: three equal-sized buckets)
gdf = planets.groupby(
    pd.qcut(x=planets['year'],
            q=3,
            labels=['early', 'mid', 'recent'])
)
gdf.size()

# group by explicitly specified bin edges (pd.cut)
gdf = planets.groupby(
    pd.cut(planets['year'],
           [1970, 2000, 2005, 2011, 2018],
           labels=['to2k', '0-5', '5-11', 'to18'])
)
gdf.size()

# --------------------------------
# grouping by multiple columns
# --------------------------------
# apply mean() to all numeric columns outside the groupby criteria


planets.groupby(['method','year']).count() #count elements in each bucket
planets.groupby(['method','year']).mean()

# --------------------------------
# groupby reset_index()
# --------------------------------
## put the bucket identifiers from the multi-index back into columns
planets.groupby(['method','year']).count().reset_index()
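# an alternative sketch (assuming pandas' as_index flag): keep the keys as
# columns from the start and skip the reset_index round-trip
planets.groupby(['method', 'year'], as_index=False).count()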

###########################
#   groupby NOTA BENE     #
###########################
# when grouping by multiple keys over multiple columns,
# ALWAYS perform
planets.groupby(list_of_keys).count()
## to make sure your grouping buckets are what you think they are
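# a concrete sketch of the check above (keys chosen for illustration);
# note size() and count() differ when NaNs are present: size() counts rows
# per bucket, count() counts non-null values per bucket and column
keys = ['method', 'year']
planets.groupby(keys).size()
planets.groupby(keys).count()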

########################
#   aggregate, filter, #
#   transform, apply   #
########################

rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])


### aggregate() allows applying multiple aggregation functions to each bucket
df.groupby('key').aggregate([min, np.mean, np.median, np.std, max, sum, np.prod])

# one can pass a dictionary mapping column names to the operation to apply to that column
df.groupby('key').aggregate({'data1': 'min', 'data2': 'max'})
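# a sketch of named aggregation (available since pandas 0.25): keyword
# arguments name the output columns, each mapped to a (column, func) pair
df.groupby('key').agg(data1_min=('data1', 'min'),
                      data2_max=('data2', 'max'))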

# filtering groups with custom logic in a lambda
df.groupby('key').aggregate([np.std])  # inspect the per-group std first
# filter() returns the original rows, but only for groups that pass the
# predicate; group A is dropped here since the std of its 'data2' is <= 4
df.groupby('key').filter(lambda x: x['data2'].std() > 4)

### transformation (useful e.g. for centering or de-skewing data)
df.sort_values(by='key')
df.groupby('key').mean()
# common example: center data by subtracting the group-wise mean
df.groupby('key').transform(lambda x: x - x.mean())
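# another common transform, sketched here: per-group standardization
# (z-score); transform() keeps the original shape, applying the function
# column-wise within each group
df.groupby('key').transform(lambda x: (x - x.mean()) / x.std())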

# the apply() method:
# applies an arbitrary function to the group results

from pprint import pprint as pp  # assuming pp was meant as a pprint alias
pp(df)
pp(df.groupby('key').sum())

def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x
df.groupby('key').apply(norm_by_data2)  
## 'data2' unchanged
# 'data1': 0/8=0.000, 1/7=0.143, 2/12=0.167, 3/8=0.375, 4/7=0.571, 5/12=0.417
# (each 'data1' value divided by the sum of 'data2' within its group)

# specifying a split key: a list/array with one group label per row
L = [0, 1, 0, 1, 2, 0]
df
df.groupby(L).sum()

## a dictionary or Series mapping index values to groups
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
df2.groupby(mapping).sum()

# any Python function can be applied to the index values
df2.groupby(str.lower).mean()

# this creates a grouped df with a multi-level row index
df2.groupby([str.lower, mapping]).mean()

##############################
#grouping example on planets #
##############################
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'  # decade is a pandas Series

planets.groupby(['method',decade])['number'].sum()
# unstack() pivots the inner index level ('decade') into columns
planets.groupby(['method', decade])['number'].sum().unstack()
# same, filling the resulting NaNs with 0
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)
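# a sketch of the same table via pivot_table, assuming summing 'number' is
# the aggregation we want (a Series like decade works as the columns key)
planets.pivot_table('number', index='method', columns=decade,
                    aggfunc='sum', fill_value=0)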