jmquintana79
3/25/2016 - 5:28 AM

data pandas

Data management.

"""
In [1]: df = DataFrame(randn(5,2),index=range(0,10,2),columns=list('AB'))
In [2]: df
Out[2]: 
          A         B
0  1.068932 -0.794307
2 -0.470056  1.192211
4 -0.284561  0.756029
6  1.037563 -0.267820
8 -0.538478 -0.800654
"""
# Return a column and drop it from the frame
df.pop("column_name")

# select row of data frame by integer index
In [3]: df.iloc[[2]]
Out[3]: 
          A         B
4 -0.284561  0.756029

# select a slice of rows of a single column (here the 6th column) as a numpy array
df.iloc[1:1000, 5].values

# df data selection by rows and columns (positional; .ix is deprecated, use .iloc)
df = df.iloc[0:nrows, 0:ncols]


# GET DATA by index value (.get_value is deprecated, use .at)
df.at[index_value_cell, 'column_cell_name']

# REPLACE VALUE IN CELLS
df.iat[position_index_cell, position_index_column] = new_value

# REPLACE VALUES IN ENTIRE DF
df.replace(to_replace = 0.0, value = -1, inplace = True)

# get index
In [4]: df.index.tolist()
Out[4]:
[0, 2, 4, 6, 8]

# get list of pandas DataFrame column headers
In [5]: df.columns.tolist()
Out[5]:
['A', 'B']


# Collect the unique values of a column (no repeated values)
df[x].unique()


# APPLY FUNCTION ON COLUMN OF DATAFRAME
def f(x):
    return x[0]
df.apply(f, axis=1)  # row-wise: f receives each row as a Series
df['e'].map(f)       # element-wise: f receives each value of column 'e'
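
# Minimal runnable sketch of row-wise vs element-wise application, reusing the
# sample df from the top (columns 'A'/'B'); the new column names are illustrative:
df['sum_AB'] = df.apply(lambda row: row['A'] + row['B'], axis=1)  # one value per row
df['A_x2'] = df['A'].map(lambda v: v * 2)                         # one value per cell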


# SORT BY COLUMNS (ex.'A','B')
df.sort(['A', 'B'], ascending=[1, 0], inplace=True) ## deprecated
df.sort_values(['A', 'B'], ascending=[True, False], inplace=True)
# SORT BY INDEX
df.sort_index(inplace=True)


# GROUP BY
data.groupby(func, axis=0).mean()
data.groupby(['col1', 'col2'], as_index=False)['col3'].mean()
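
# A concrete sketch of the groupby pattern above; the df and the column names
# 'col1'/'col2'/'col3' are hypothetical sample data:
data = pd.DataFrame({'col1': ['a', 'a', 'b'],
                     'col2': ['x', 'y', 'x'],
                     'col3': [1.0, 2.0, 3.0]})
data.groupby(['col1', 'col2'], as_index=False)['col3'].mean()
# -> one row per (col1, col2) pair with the mean of col3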

# GROUP BY 10 Minutes (pd.TimeGrouper is deprecated, use pd.Grouper; requires a DatetimeIndex)
DF10min = DF.groupby(pd.Grouper(freq='10Min')).aggregate(np.sum)
    

# Apply FUNCTION / ACTION OVER DATA COLUMN
df['b'] = df['a'].apply(lambda x: x * 2)                           # over 1 column of data
df['c'] = df[['a','b']].apply(lambda x: x['a'] + x['b'], axis=1)   # over 2 columns of data


# Finding out basic STATISTICAL INFORMATION on your dataset.
pd.options.display.float_format = '{:,.3f}'.format # Limit output to 3 decimal places.
df.describe()          # main statistics of numeric fields
df['x'].value_counts() # counts per value of a categorical variable
df.corr()              # pairwise correlation between numeric columns

# Information about dataframe
df.info(max_cols=0) # max_cols => whether to display the full per-column summary

# Looking for Nan values
pd.isnull(df) # returns a boolean df (True where NaN)

# SUMMARY OF NAN values (count per column)
DF.isnull().sum()


# Shift df n steps (rows)
nsteps = 1
# nsteps > 0: shift down 1 row
# nsteps < 0: shift up 1 row
DF.shift(periods=nsteps)

# calculate differences between each row/column and its own n-shifted value
n = 1
DF.diff(periods=n, axis=0)
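
# Sanity-check sketch: DF.diff(n) is equivalent to DF - DF.shift(n); comparing
# them on the sample df from the top (the first row is NaN on both sides, and
# NaN == NaN is False, so only that row compares unequal):
df.diff(periods=1) == df - df.shift(periods=1)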

## split df into same-size dfs
def splitdf(DF, nsplit):
    import numpy as np
    aux = np.array([int(i) for i in np.linspace(0, len(DF), nsplit+1)])
    lranges = list(zip(list(aux[:-1]), aux[1:]))
    # split
    for i1, i2 in lranges:
        yield DF[i1:i2]
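
# Usage sketch for splitdf on the 5-row sample df from the top:
for chunk in splitdf(df, nsplit=2):
    print(len(chunk))  # 2 then 3: near-equal pieces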
  
  
## COLLECT COLUMNS PER TYPES
# other types: "category","number","bool_","object_"
# doc types: https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
df.select_dtypes(exclude=["number"])  # pass include=[...] and/or exclude=[...]

# Check what columns are NUMERICAL or CATEGORICAL
cols_num = df.select_dtypes(include=['float64']).columns.tolist()                                       
cols_cat = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()         
cols_int = df.select_dtypes(include=['int64']).columns.tolist()           
cols_dt = df.select_dtypes(include=['datetime64[ns]']).columns.tolist()

# Calculate frequencies of categorical / numerical values
df['cat_column'].value_counts(normalize=False,sort=True,ascending=False,dropna=True) # categorical
df['num_column'].value_counts(normalize=False,sort=True,ascending=False,dropna=True,bins=10) # numerical

## GET DUMMIES from categorical variable
"""
prefix_sep: symbol to use to include value prefix just after the original name. Ex. 'colname_value1','colname_value1',..
dummy_na: if there are nan values, this one are codified as a new categorical value 
columns: list of categorical columns to be processed
drop_first: if is True, drop the first categorical value and keep the another one. it is useful when only there are two possible values.

"""
pd.get_dummies(df,prefix_sep='_',dummy_na=False, columns=lcol,drop_first=False)
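
# Minimal sketch of get_dummies; the df, the 'color' column and lcol here are
# hypothetical sample data:
lcol = ['color']
df_cat = pd.DataFrame({'color': ['red', 'blue', 'red']})
pd.get_dummies(df_cat, prefix_sep='_', dummy_na=False, columns=lcol, drop_first=False)
# -> indicator columns 'color_blue' and 'color_red'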


## Calculate MATRIX FREQUENCY according to df columns
dfmatrixf = pd.crosstab(index=data["column1"],columns=data["column2"],normalize=True)*100 # freq into %
dfmatrixf = pd.crosstab(index=data["column1"],columns=data["column2"],normalize=True) # freq into [0,1]
dfmatrixf = pd.crosstab(index=data["column1"],columns=data["column2"],normalize=False) # count


## RESHAPE DF according to columns for one value
dfpivot = df.pivot(index='column1', columns='column2', values='column3')

## PIVOT reshaping but with an aggregating operation (needed when the (column1, column2) pairs are not unique)
pd.pivot_table(df, values='column3', index='column1', columns='column2', aggfunc=np.mean) # or another function/operator
# example of operations: np.mean,'mean',lambda x: np.mean(x)
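
# Worked sketch of why pivot_table is needed when (column1, column2) pairs are
# not unique; the sample data is hypothetical:
dfp = pd.DataFrame({'column1': ['a', 'a', 'b', 'a'],
                    'column2': ['x', 'y', 'x', 'x'],
                    'column3': [1.0, 2.0, 3.0, 5.0]})
pd.pivot_table(dfp, values='column3', index='column1', columns='column2', aggfunc=np.mean)
# ('a','x') appears twice, so its cell is the mean (1.0 + 5.0) / 2 = 3.0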

## RESAMPLING DF TIME SERIES (e.g. weekly average)
df.resample('W').mean()

## FILL TEMPORAL HOLES
dfhourly.resample("30min").asfreq() # hourly data is filled with NaN values at xx:30
# reindex by datetime a df (fill with NaN if a time step not exists)
df.reindex(pd.date_range(start=df.index.min(),end=df.index.max(),freq='1H')) 
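
# Sketch of hole-filling on a toy hourly series with one missing timestamp;
# the index values are illustrative:
idx = pd.to_datetime(['2016-01-01 00:00', '2016-01-01 02:00'])  # 01:00 is missing
s = pd.Series([1.0, 2.0], index=idx)
s.reindex(pd.date_range(start=s.index.min(), end=s.index.max(), freq='1H'))
# -> 00:00 = 1.0, 01:00 = NaN, 02:00 = 2.0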

## BINNING A COLUMN (numeric columns)
nbins = 9
bins = np.linspace(np.min(df['var'].values), np.max(df['var'].values), nbins+1, endpoint=True)
labels = np.arange(1,nbins+1,1)
df['var_bin'] = pd.cut(df['var'], bins = bins, labels = labels, include_lowest = True) # include_lowest so the minimum value lands in the first bin
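
# Quick check of the binning above (assumes df['var_bin'] from the snippet):
df['var_bin'].value_counts().sort_index()  # rows per bin, in bin order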

## GROUPBY AGGREGATION per one var
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_
# applying single aggregation
df.groupby('A').agg({'B': 'sum', 'C': 'min'})
# applying multiple aggregation a column
df = pd.DataFrame({'A': [1, 1, 1, 2, 2], 'B': range(5), 'C': range(5)})
df.groupby('A').agg({'B': ['sum','min'], 'C': ['sum','min']})
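
# Usage sketch for the percentile() helper defined above, combined with agg on
# the toy df just created:
df.groupby('A').agg({'B': ['min', percentile(50), 'max']})
# -> MultiIndex columns ('B','min'), ('B','percentile_50'), ('B','max') per group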
 
## stacked df: here the 'url' column holds lists of urls. Stack the df so that each
## url of each list becomes a record of the new df, keeping the rest of the fields unchanged
df_stacked = (pd.DataFrame(df.url.tolist(), index=[df.index, df.gender, df.age, df.location])
              .stack()
              .reset_index([0, 'id', 'gender', 'age', 'location'])
              .rename(columns={0: 'url'}))
# using the explode function (https://www.kdnuggets.com/2021/05/applying-pythons-explode-function-pandas-dataframes.html)
# in this example the 'year' column holds lists of years as records. With this command
# the list contents are split and stored in new records, duplicating the rest of the columns
df.explode('year')
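
# Tiny sketch of explode; the 'year' lists are sample data:
dfe = pd.DataFrame({'name': ['a', 'b'], 'year': [[2019, 2020], [2021]]})
dfe.explode('year')
# -> three rows: (a, 2019), (a, 2020), (b, 2021)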

## COLLECT datetime components
# ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.date.html
df['date'] = df.col_datetime.dt.date
df['hour'] = df.col_datetime.dt.hour
df['dow'] = df.col_datetime.dt.dayofweek
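
# Sketch assuming col_datetime is (or can be parsed to) a datetime64 column:
df['col_datetime'] = pd.to_datetime(df['col_datetime'])  # parse strings if needed
df['dow'] = df['col_datetime'].dt.dayofweek              # Monday=0 ... Sunday=6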