slzdevsnp
8/14/2019 - 3:23 PM

[pandas_cols_mem_optimize] #category #downcast

[pandas_cols_mem_optimize] #category #downcast

import pandas as pd 

# Downcast numeric columns to the smallest dtype that can hold their values.
# pd.to_numeric is called on the whole Series on purpose: Series.apply would
# invoke it once per element, and the requested downcast is then lost when the
# per-element results are reassembled into a column.

# float column -> smallest float dtype that fits (float32 at minimum)
df['col1'] = pd.to_numeric(df['col1'], downcast='float')

# object/integer column -> smallest unsigned integer dtype that fits
# (pd.read_csv normally detects integer columns by itself; the original note
# is cut off after "issue can be if column has lots" -- presumably lots of
# missing or non-numeric entries, TODO confirm)
df['col2'] = pd.to_numeric(df['col2'], downcast='unsigned')

# Check how many distinct values a column holds before deciding whether it is
# worth storing as a categorical.

import seaborn as sns
df = sns.load_dataset('planets')
df.info()

# 'number' is an integer column with only a few distinct values (7 according
# to the original note), which makes it a candidate for the category dtype.
df['number'].nunique()

# Convert just that column; every other column keeps its current dtype.
df = df.astype({'number': 'category'})


##############################################
#    setting dtypes when reading csv         #
##############################################

# Parse only a small sample first so the optimized dtypes can be worked out
# cheaply. read_csv has no `nlines` argument; `nrows` is the row limit.
df = pd.read_csv('./dataset_head.csv', nrows=20)
df.info()

# Shrink each column on the sample. pd.to_numeric is applied to the whole
# Series (not element-wise through Series.apply) so the downcast dtype is
# actually kept on the column.
df['col1'] = pd.to_numeric(df['col1'], downcast='float')
df['col2'] = pd.to_numeric(df['col2'], downcast='unsigned')
df['col3'] = df['col3'].astype('category')

# Map every column name to the name of its optimized dtype
# (e.g. 'float32', 'uint8', 'category') in the form read_csv's `dtype` accepts.
column_types = {col: dtype.name for col, dtype in df.dtypes.items()}

# NOTE(review): the dtypes come from a 20-row sample; values further down the
# full file may not fit the downcast integer/float widths -- verify the sample
# is representative before relying on this.
df_optimized = pd.read_csv('./dataset.csv', dtype=column_types)
# Converting string columns to categories can save a lot of memory.