9/8/2017 - 9:40 AM

Import a CSV files to db

./ -d tabs --nocreate --truncate -e utf8 -db -u root -p -i -t

#!/usr/bin/env python

import os
import argparse
import shutil
import subprocess
import re
import string
import codecs


class Helper():

    def which(program):
        import os
        def is_exe(fpath):
            return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

        fpath, fname = os.path.split(program)
        if fpath:
            if is_exe(program):
                return program
            for path in os.environ["PATH"].split(os.pathsep):
                path = path.strip('"')
                exe_file = os.path.join(path, program)
                if is_exe(exe_file):
                    return exe_file

        return None

    ## separate as array keys to have data piped to next command
    def readSubOutput(call):
            ps = False;
            for idx, val in enumerate(call):
                if(idx == len(call)-1) :
                    if ps == False:
                        return subprocess.check_output(val)
                        return subprocess.check_output(val, stdin=ps.stdout)
                    ps = subprocess.Popen(val, stdout=subprocess.PIPE)
            return None
        except subprocess.CalledProcessError, e:
            print "Error - Last Call"
            print call[0]
            print  " Stdout output:\n"
            print e.output

# add arguments to script
parser = argparse.ArgumentParser(description='Create a table for csv and import data requires csvkit')
parser.add_argument("-i", "--host",type=str, help="The host for database.")
parser.add_argument("-u", "--username",type=str, help="The username for database.")
parser.add_argument("-p", "--password",type=str, help="The pasword for database.")
parser.add_argument("-db", "--db",type=str, help="The database name.")
parser.add_argument("-e", "--encoding",type=str, help="File char enconding.")
parser.add_argument("-d", "--delimiter",type=str, help="Delimiter.")
parser.add_argument("--nocreate", help="Create target table", action="store_true")
parser.add_argument("--truncate", help="Truncate table before load", action="store_true")
parser.add_argument("-t", "--tablename",type=str, help="Table name.")
parser.add_argument("filename", help="Import csv filename.")

args = parser.parse_args()

#requried program
programs = ["csvsql","csvstack","mysql"]
helper = Helper()

for x in programs:
    inpath = helper.which(x)
    if inpath == "None":
         print(sys.stderr, "Could not find " + x + " program required")

# default to the following if not provided
tablename = ("", args.tablename)[bool(args.tablename)]
encoding = ("utf8", args.encoding)[bool(args.encoding)]
delimiter = (",", args.delimiter)[bool(args.delimiter)]

#check if filenma of directory it is try to stack
filename = os.path.abspath(args.filename).strip()
files = []

if os.path.isdir(filename):
    for file in os.listdir(filename):
        if file.endswith(".csv") and file.find("stacked") == -1 :
            files.append(filename + "/" +  file)

    print "stacking files in " + args.filename
    filename += "/stacked.csv"
    f = , "w",encoding)
    exitcode =["csvstack",'-e',encoding] + files, stdout=f)

    #change new lines to match what we expect
    print "changing line endings"
    with , "r", encoding) as f:
        filename = filename + "-replaced.csv"
        print filename
        out = , "w",encoding)
        for line in f:
            out.write(re.sub("\n", "\r\n", line))

    #TODO: not a great way to do this butexitcode of csvstact
    # seems to be no use

    statinfo = os.stat(filename)
    if statinfo.st_size == 0:
        print "stack failed zero length stack file"

# if create table is specified use csvsql to get a create table statement
csvsqlCommand = ["csvsql" ,"-i","mysql","--table",tablename,"-e",encoding,"-d",delimiter]
if delimiter == "tabs":
	csvsqlCommand = ["csvsql" ,"-i","mysql","--table",tablename,"-e",encoding,"--tabs"]
	delimiter = "\\t"

sql =  helper.readSubOutput([

# csvsql is too consertive with columns widths so try to fix this
#TODO: can do something better than this
sql = re.sub("VARCHAR\(.*\)","VARCHAR(255)",sql)
sql = re.sub("DECIMAL NOT NULL","DECIMAL (12,4) NOT NULL",sql)
#construct a load data statement
loadDataSql =   ("LOAD DATA LOCAL INFILE '" + filename + "' INTO TABLE `" + tablename + "` "
                 "CHARACTER SET " + encoding + " "
                 "FIELDS TERMINATED BY '" + delimiter  +"' "
                 "ENCLOSED BY '\"' "
                 "ESCAPED BY '\' "
                 "LINES TERMINATED BY '\\r\\n' "
                 "IGNORE 1 LINES "
#TODO: not 100% safe but shoulde work for our puroses
f =, "r", "utf-8")
cols = f.readline()

#get rid of UTF-8 BOM
cols = cols.replace(u'\ufeff', '')
cols = re.sub("[\r\n]","",cols)
if delimiter == "\\t":
	cols = re.sub("[\t]",",",cols)
cols = string.split(cols,",")
cols = ["`" + string.strip(x.replace(" ","_")) + "`" for x in cols]
cols = " , ".join(cols)

loadDataSql = loadDataSql.replace("|cols|",cols)
#if no db provided just show sql needed to create table and load data otherwise run te commands on the db

if args.db == None :
    print sql
    print loadDataSql
    #check for fields required if inserting to db
    if == None :
        "Host required if inserting to DB"
    if args.tablename == None :
        "Tablename required if inserting to DB"

    #create table and or load data
    if args.nocreate != True:
        print "Creating table.."
        print helper.readSubOutput([

    if args.truncate == True:
        print "Truncating table "  + args.tablename
        truncateSql = "TRUNCATE TABLE " + args.tablename
        print helper.readSubOutput([

    print "Load data into file."
    print loadDataSql
    print helper.readSubOutput([
    print "Done";