jmquintana79
11/5/2015 - 3:04 PM

I/O pandas

I/O between csv and pandas dataframe

import pandas
from datetime import datetime
 
## CSV to PANDAS

# basic
path = 'path_file_input.csv'
data_df = pandas.read_csv(path,sep=";",index_col=0,usecols=['col1','col2'])

# reading japanese characters
data_df = read_csv(path, encoding="SHIFT-JIS")

# formating type of columns
path = 'path_file_input.csv'
data_df = pandas.read_csv(path,sep=";",dtype={"column1":str,"column2":float})

# parse a single datetime column
def dateparse(x): return pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
data_df = pd.read_csv(path_input, parse_dates=['datetime'], date_parser=dateparse)

# parse date time in different columns before upload
path = 'path_file_input.csv'
parse = lambda x1,x2: datetime.strptime('%s %s'%(x1,x2), '%Y%m%d %H%M')
data_df = pd.read_csv(path,sep=";", parse_dates = [['column_DATE', 'column_TIME']], 
            index_col = 0, 
            date_parser=parse)
            

# parse dates after upload data
DF['date'] = pd.to_datetime(DF['date'], format='%d%b%Y')


## PANDAS to CSV

# basic
path = 'path_file_output.csv'
DF.to_csv(path,sep=";", index=False) # where DF is a Pandas dataframe
## EXCEL (.xlsx) to PANDAS
import pandas as pd
xl = pd.ExcelFile(path_input)
lsheet_names = xl.sheet_names
xl.parse(lsheet_names[0]).head()

Feather format:

Overview:

  • The primary reason for the existence of Feather is to have a data format using which data frames can be exchanged between Python and R.
  • Feather is a binary data format.
  • Using feather enables faster I/O speeds and less memory. Usa poca memoria RAM aunque si un poco mas que CSV, y ocupa menos espacio en disco que un CSV.
  • However, since it is an evolving format it is recommended to use it for quick loading and transformation related data processing rather than using it as a long term storage.
  • Por tanto, esta bien usar este formato para trabajar con Jupyter Notebook, etc, para cargas y descargas rapidas y constantes, no para guardar datos durante mucho tiempo.
import pandas as pd
import feather
# file name
pingInfoFilePath = "./serverpings.ftr";
# data
pingInfo    = {"servername": ["svr_et_1","svr_et_2","svr_wt_1","svr_wt_2","svr_nr_1","svr_nr_2","svr_st_1","svr_st_2"],

                "lastping":["12.20.15.122","12.20.11.395", "12.20.12.836","12.20.16.769","12.20.17.193","12.20.18.416","11.59.55.913","12.20.14.811"],

                "roundtriptime":[300, 400, 0, 200, 100, 500, 350, 0],

                "status":["PASS","PASS","FAIL","PASS","PASS","PASS","PASS","FAIL"]};
# pandas df createtion
dataFrame   = pd.DataFrame(data=pingInfo);
# save into feather file
dataFrame.to_feather(pingInfoFilePath);
# load / read feather file
readFrame = pd.read_feather(pingInfoFilePath, columns=None, use_threads=True);
# pandas 0.21 introduces new functions for Parquet:
df = pd.read_parquet('example_file.parquet', engine='pyarrow')
# or
df = pd.read_parquet('example_ifle.parquet', engine='fastparquet')