####### Series#############
df.astype({'col1': 'int32'}).dtypes
s = pd.Series(['a','b','c','d'], index=[100,101,102,103])
# setting a value by index label
s.loc[index] = val
# index of max value
s.idxmax()
# Return Series with duplicate values removed
pandas.Series.drop_duplicates
keep : {‘first’, ‘last’, False}, default ‘first’
first : Drop duplicates except for the first occurrence.
last : Drop duplicates except for the last occurrence.
False : Drop all duplicates.
take_last : deprecated (use keep='last' instead)
######## Dataframe #########
# Initializing
# Collect one dict per row and build the frame in a single call.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop copies the whole frame on every iteration.
rows = [{"name": "rahul", "age": 41} for mat in arr]
df = pandas.DataFrame(rows)
# initializing empty column
df["D"] = np.nan
# drop nan values
df = df.dropna()
# Drop the rows whose value in a particular column (here 'C') equals a particular string
new_df = df[df.C != 'XYZ']
# Summary
df.describe()
# reset index of dataframe https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
df = df.reset_index(drop=True)
# Apply function to each element of column
# ... and then this column must be transformed to float explicitly, convert_to_float is a function
def convert_to_float(x):
    """Convert ``x`` to a float, returning NaN when it cannot be converted.

    Args:
        x: Any value accepted by the builtin ``float`` (number, numeric
            string, ...).

    Returns:
        ``float(x)`` on success, otherwise ``np.nan``.
    """
    # ``np.float`` was only an alias of the builtin ``float`` and was removed
    # in NumPy 1.24; combined with a bare ``except`` the resulting
    # AttributeError was swallowed and every value silently became NaN.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Unparseable text, unsupported types (e.g. None) and integers too
        # large for a float all map to NaN.
        return np.nan
df['z-axis'] = df['z-axis'].apply(convert_to_float)
# index of max value in column
df.gps_accn.idxmax()
# Remove a column
df.drop('floor', axis=1)
# Round off with index
df.index = df.index.round("100ms")
# Round off column timestamp
df_mount.timestamp.dt.round("100ms")
# Round off columns
df_train = df_train.round({'x-axis': 4, 'y-axis': 4, 'z-axis': 4})
# REGEX, Last column has a ";" character which must be removed ...
# Strip every ";" from the column and assign the result back.
# Calling replace(..., inplace=True) on df['z-axis'] is a chained in-place
# operation on a selected column: pandas 2.x warns about it and, with
# Copy-on-Write enabled, df itself is left unchanged.
df['z-axis'] = df['z-axis'].replace(to_replace=r';',
                                    value=r'',
                                    regex=True)
# drop duplicate rows with same value in 'A' column
df.drop_duplicates(subset='A', keep="last")
# drop duplicate rows with same value in index
df = df[~df.index.duplicated(keep='first')]
# make df.index monotonic increasing
temp = temp.sort_index()
# get indices which are not monotonic increasing
# Print the position and label of every index entry that is not strictly
# greater than its predecessor.
for i in range(1, temp.shape[0]):
    if temp.index[i] <= temp.index[i - 1]:
        # print() call instead of the Python 2 print statement, matching the
        # print(row) usage elsewhere in these notes.
        print(i, temp.index[i])
# get the row with nearest timestamp index
# Index.get_loc(..., method='nearest') was deprecated in pandas 1.4 and the
# `method` argument was removed in 2.0; get_indexer provides the same lookup.
# NOTE(review): nearest-matching presumably needs a unique, sorted index -
# confirm for your data.
target = datetime.datetime(2019, 3, 22, 17, 54, 24)
t0 = df.index.get_indexer([target], method='nearest')[0]
df.iloc[t0:t0+5]
# Count the null columns
train = pd.read_csv("train.csv")
null_columns=train.columns[train.isnull().any()]
train[null_columns].isnull().sum()
# Converting string column in integer
gender = {'male': 1,'female': 2}
data.Gender = [gender[item] for item in data.Gender]
# Encoding the label strings into int
# Transform the labels from String to Integer via LabelEncoder
le = preprocessing.LabelEncoder() # print(le.classes_)
# Add a new column to the existing DataFrame with the encoded values
df[LABEL] = le.fit_transform(df['activity'].values.ravel())
# One hot encoding
# Import to_categorical directly: the keras.utils.np_utils module is no longer
# importable in recent Keras releases, while keras.utils.to_categorical is.
from keras.utils import to_categorical
y_train_hot = to_categorical(y_train, num_classes)
# Splitting the data
# Sometimes we split on user-id so that the model has to find general patterns instead of memorising the data
# e.g.
# Differentiate between test set and training set
df_test = df[df['user-id'] > 28]
df_train = df[df['user-id'] <= 28]
# Normalize training data
# Normalize features for the training data set (each column is divided by its maximum)
# Suppress the chained-assignment warning for the assignments below
# Turn off pandas' chained-assignment warning (the default setting is 'warn')
pd.options.mode.chained_assignment = None
# Scale each axis column by its own maximum value
for axis_name in ('x-axis', 'y-axis', 'z-axis'):
    column = df_train[axis_name]
    df_train[axis_name] = column / column.max()
# Find label of a time window
# Retrieve the most often used label in this segment #
# stats.mode returns length-1 arrays on older SciPy but plain scalars on
# SciPy >= 1.11 (keepdims defaults to False), where [0][0] raises an error.
# np.ravel normalises both shapes so the first mode can always be indexed.
label = np.ravel(stats.mode(df[label_name][i: i + time_steps])[0])[0]
# Graphs
# Show how many training examples exist for each of the six activities
df['activity'].value_counts().plot(kind='bar', title='Training Examples by Activity Type')
# Iterating over rows (by position with iloc, or with iterrows)
for row_no in range(df0.shape[0]):
row = df0.iloc[row_no]
for index, row in df0.iterrows():
print(row)
# iloc accesses rows by integer position (0-based)
# loc accesses rows by index label
# Example:
df.index = [1, 2, 3, 4, 5, 6, 7, 8]
df['2'] = [35,2,3,4,3435,3,5,34]
df.iloc[1] # output: the row at position 1 (the second row), whose '2' value is 2
df.loc[1,'2'] # output: 35 (row labelled 1, column '2')