[pandas_cols_mem_optimize] #category #downcast
import pandas as pd
# Downcasting columns to smaller float / int dtypes.
# NOTE: pass the whole Series to pd.to_numeric. Series.apply() would invoke it
# once per element, which is slow and does not reliably downcast the column.
# Downcast a float column to the smallest float dtype that fits its values.
df['col1'] = pd.to_numeric(df['col1'], downcast='float')
# Downcast an object/integer column to the smallest unsigned integer dtype.
# (Normally pd.read_csv detects numeric columns and converts them itself. The
# original note on when it does not is truncated -- presumably when the column
# has lots of missing or non-numeric values. TODO confirm.)
df['col2'] = pd.to_numeric(df['col2'], downcast='unsigned')
# check the number of unique values in each column before deciding to convert it to categorical
import seaborn as sns
# Load the sample 'planets' dataset and print its dtypes / memory summary.
df = sns.load_dataset('planets')
df.info()
# 'number' is an int column with only 7 distinct values, which makes it a
# good candidate for the category dtype.
df['number'].nunique()
df['number'] = df['number'].astype(pd.CategoricalDtype())
##############################################
# setting dtypes when reading csv #
##############################################
# Read only the first 20 rows so dtypes can be inspected and tuned cheaply.
# (`nrows` is the read_csv parameter for this; `nlines` is not accepted.)
df = pd.read_csv('./dataset_head.csv', nrows=20)
df.info()
# Pass the whole Series to pd.to_numeric so the downcast is chosen for the
# column as a whole instead of being applied value by value.
# NOTE(review): the dtypes are picked from a 20-row sample; if the full file
# holds larger values than the sample, the downcast dtype may be too narrow
# (overflow or a read error) -- verify the value ranges before relying on it.
df['col1'] = pd.to_numeric(df['col1'], downcast='float')
df['col2'] = pd.to_numeric(df['col2'], downcast='unsigned')
df['col3'] = df['col3'].astype('category')
# Build a {column name: dtype name} mapping from the optimized sample.
# dtype *names* are used (e.g. 'category'), so the categories themselves are
# inferred from the full file rather than fixed to those seen in the sample.
dtypes = df.dtypes
colnames = dtypes.index
types = [i.name for i in dtypes.values]
column_types = dict(zip(colnames, types))
# Re-read the full dataset, applying the optimized dtypes at parse time.
df_optimized = pd.read_csv('./dataset.csv', dtype=column_types)
#### converting strings to categories can save a lot of memory