sumit
10/7/2019 - 5:56 PM

pandas

####### Series#############
df.astype({'col1': 'int32'}).dtypes
s = pd.Series(['a','b','c','d'], index=[100,101,102,103])
# setting a value by index label
s.loc[index] = val

# index of max value
s.idxmax()

# Return Series with duplicate values removed
pandas.Series.drop_duplicates

keep : {‘first’, ‘last’, False}, default ‘first’
first : Drop duplicates except for the first occurrence.
last : Drop duplicates except for the last occurrence.
False : Drop all duplicates.
take_last : deprecated


######## Dataframe #########
# Initializing
# Collect the rows first and build the frame in one go.
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# copied the whole frame on every iteration (quadratic cost).
rows = [{"name": "rahul", "age": 41} for mat in arr]
df = pandas.DataFrame(rows)

# initializing empty column
df["D"] = np.nan

# drop nan values
df = df.dropna()

# Drop rows whose column C is exactly equal to a particular string
# (to drop rows that merely contain it, use df[~df.C.str.contains('XYZ')])
new_df = df[df.C != 'XYZ']

# Summary
df.describe()

# reset index of dataframe https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
df = df.reset_index(drop=True)

# Apply function to each element of column
# ... and then this column must be transformed to float explicitly, convert_to_float is a function
def convert_to_float(x):
    """Convert ``x`` to a float, falling back to NaN when it cannot be parsed.

    Meant to be used element-wise, e.g. ``df[col].apply(convert_to_float)``.

    Args:
        x: Any value accepted by the builtin ``float`` (number, numeric
            string, ...).

    Returns:
        ``float(x)`` on success, ``np.nan`` when the value is not convertible.
    """
    try:
        # ``np.float`` was only an alias of the builtin ``float`` and was
        # removed in NumPy 1.24; with the old bare ``except`` the resulting
        # AttributeError was swallowed and every value turned into NaN.
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to NaN; anything else should surface.
        return np.nan
        
df['z-axis'] = df['z-axis'].apply(convert_to_float)

        
# index of max value in column
df.gps_accn.idxmax()

# Remove a column
df.drop('floor', axis=1)

# Round off with index
df.index = df.index.round("100ms")
# Round off column timestamp
# (the stray leading space on these lines was an "unexpected indent" error;
# dt.round returns a new Series, so assign it back if the result is needed)
df_mount.timestamp.dt.round("100ms")
 
# Round off columns
df_train = df_train.round({'x-axis': 4, 'y-axis': 4, 'z-axis': 4})

 # REGEX, Last column has a ";" character which must be removed ...
# Assign the result back instead of using inplace=True on a column selection:
# the in-place form mutates a temporary Series and, with pandas copy-on-write,
# leaves df itself unchanged.
df['z-axis'] = df['z-axis'].replace(to_replace=r';',
                                    value=r'',
                                    regex=True)
 
# drop duplicate rows with same value in 'A' column
df.drop_duplicates(subset='A', keep="last")

# drop duplicate rows with same value in index
df = df[~df.index.duplicated(keep='first')]

# make df.index monotonic increasing 
temp = temp.sort_index()

# get indices which are not monotonic increasing
# Report every position whose index value does not exceed its predecessor.
for i in range(1, temp.shape[0]):
    if temp.index[i] <= temp.index[i - 1]:
        # print() call: the Python 2 print statement is a SyntaxError on Python 3
        print(i, temp.index[i])
        
# get the row with nearest timestamp index 
# Index.get_loc lost its `method` argument in pandas 2.0; get_indexer takes a
# list of targets and returns an array of positions, hence the trailing [0].
t0 = df.index.get_indexer([datetime.datetime(2019, 3, 22, 17, 54, 24)], method='nearest')[0]
# Five rows starting at the nearest match
df.iloc[t0:t0+5]

# Count the null columns
train = pd.read_csv("train.csv")
null_columns=train.columns[train.isnull().any()]
train[null_columns].isnull().sum()

# Converting string column in integer
gender = {'male': 1,'female': 2} 
data.Gender = [gender[item] for item in data.Gender] 

# Encoding the label strings into int
# Transform the labels from String to Integer via LabelEncoder
le = preprocessing.LabelEncoder()  # print(le.classes_)
# Add a new column to the existing DataFrame with the encoded values
df[LABEL] = le.fit_transform(df['activity'].values.ravel())

# One hot encoding
# keras.utils.np_utils is no longer available in current Keras releases;
# to_categorical is exported from keras.utils directly.
from keras.utils import to_categorical
y_train_hot = to_categorical(y_train, num_classes)

# Splitting the data
# sometimes we split based on user-id so that our model is uniquely finding pattern instead of learning data
# e.g.
# Differentiate between test set and training set
df_test = df[df['user-id'] > 28]
df_train = df[df['user-id'] <= 28]

# Normalize training data
# Normalize features for training data set (values between 0 and 1)
# Suppress the chained-assignment warning for the next 3 operations
pd.options.mode.chained_assignment = None  # default='warn'
# Scale each axis column by its own maximum.
for axis_name in ('x-axis', 'y-axis', 'z-axis'):
    column = df_train[axis_name]
    df_train[axis_name] = column / column.max()

# Find label of a time window 
# Retrieve the most often used label in this segment # 
label = stats.mode(df[label_name][i: i + time_steps])[0][0]

# Graphs
# Show how many training examples exist for each of the six activities
df['activity'].value_counts().plot(kind='bar', title='Training Examples by Activity Type')

#iteration in rows
for row_no in range(df0.shape[0]):
  row = df0.iloc[row_no]
  
for index, row in df0.iterrows():
  print(row)
# iloc selects by integer position (row number, starting at 0)
# loc selects by index label
Example:
df.index =  [1, 2, 3, 4, 5, 6, 7, 8]
df['2'] = [35,2,3,4,3435,3,5,34]
df.iloc[1] # output : 2
df.loc[1,'2'] # output: 35