[panda_basic] #loc #iloc #drop #filter #sort_values #csv
panda_basic.py
import pandas as pd
import numpy as np
from pprint import pprint as pp
# Create an example dataframe
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy', 'Jason'],
'year': [2012, 2012, 2013, 2014, 2014, 2015],
'reports': [4, 24, 31, 2, 3, 0]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma', 'Moscow'])
##############################
## droprows, columns #
##############################
df
#drop a row based on index value
df.drop('Cochice', axis=0, inplace=True)
#drop a column 'reports'
df.drop('reports',axis=1,inplace=True)
#drop first two rows basedon num index 0:2
df.drop(df.index[0:2],axis=0,inplace=True)
#drop last row of the df
df.drop(df.index[-1],axis=0,inplace=True)
#drop last two rows
df.drop(df.index[-2:],axis=0,inplace=True)
#-------------------
# drop rows based on conditifon(s)
#--------------------
## !!NB this expects a continious monotonic index !!
df.drop(df.loc[df['year']==2012].index, axis=0,inplace=True)
#######################
## sort values values #
#######################
df.sort_values(by='name',ascending=False)
#by multiple criterias
df.sort_values(by=['reports','name'],ascending=False)
#######################
## filtering values #
#######################
#filter on a single value
msk=df['year']==2012
df.loc[msk]
#filter return sublist of columns
df.loc[msk,['name','year']]
#filter on more than 1 value
msk=df['year'].isin([2012,2014])
df.loc[msk]
#filter on more than 1 criteria
name_msk=df['name'].isin(['Jason','Molly'])
year_msk=df['year'].isin([2012,2013,2015])
gmask=(name_msk)&(year_msk)
df.loc[gmask,:]
#filter on true and false criteria
df.loc[(df['name']=='Jason')& ~(df['reports'].isnull()),:]
#######################
## create new columns #
#######################
data = {'name': ['Jason', 'Molly', 'Tina', np.nan, 'Amy', 'Jason'],
'year': [2012, 2012, 2013, 2014, 2014, 2015],
'born': [pd.NaT, pd.NaT, pd.Timestamp('1940-04-25'), pd.NaT, pd.Timestamp('1973-03-25'), pd.NaT],
'reports': [4, 24, 31, 2, 3, np.nan]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma', 'Moscow'])
#conditional value in new column
df['reports_in_2012'] = np.where(df['year']==2012, df['reports'], 0)
#conditional creatipon on null
df['is_born'] = np.where(df['born'].isnull(), 'N', 'Y')
#######################
## read write csv #
#######################
## reading
df=pd.read_csv(csvfile,header='infer')
#writing
df.to_csv(csvfile, header=True, index=False)