ruanbekker
5/24/2016 - 5:53 AM

Generates CSV Data using the Faker library

Generates CSV Data using the Faker library

#!/usr/bin/python
# example usage: ./generate-csv-data.py --filename data --number-runs 10000 --number-reiterations 5

import csv
import sys
import time

from faker import Factory

# Help text printed when the command line is malformed. sys.argv[0] is the
# name the script was invoked with, so the messages mirror the real command.
errInvalidArgs = (
    "Usage: {0} --filename [STRING]  --number-runs [INT] "
    "--number-reiterations [INT] "
).format(sys.argv[0])
errEg = (
    " -> eg: {0} --filename dataset --number-runs 1000000 "
    "--number-reiterations 5"
).format(sys.argv[0])
errOutput = "Outputs: dataset-timestamp.txt"

def create_names(numberRuns, file_object, fake):
    """Write ``numberRuns`` generated records to ``file_object`` as CSV rows.

    Each row holds four fields, in this order: ``fake.uuid4()``,
    ``fake.first_name()``, ``fake.last_name()`` and ``fake.country()``.

    Args:
        numberRuns: Number of rows to write; zero or a negative value writes
            nothing.
        file_object: Writable text file-like object (anything with a
            ``write`` method) that receives the rows.
        fake: Object providing the ``uuid4``, ``first_name``, ``last_name``
            and ``country`` methods used above.
    """
    # csv.writer quotes any field that contains a comma, quote or newline, so
    # a value such as "Korea, Republic of" cannot split into extra columns.
    # lineterminator="\n" keeps the row ending the same as a plain write().
    writer = csv.writer(file_object, lineterminator="\n")
    writer.writerows(
        (fake.uuid4(), fake.first_name(), fake.last_name(), fake.country())
        for _ in range(numberRuns)
    )

def _exit_with_usage():
    """Print the usage, example and output hints, then exit with status -1."""
    print(errInvalidArgs)
    print(errEg)
    print(errOutput)
    sys.exit(-1)


if __name__ == "__main__":
    # Command-line entry point. Expected arguments, at fixed positions:
    #   --filename [STRING] --number-runs [INT] --number-reiterations [INT]
    # One "<filename>-<YYYYmmddHHMMSS>.txt" file is written per reiteration,
    # holding a header row followed by --number-runs generated records.

    # Reject the call if ANY flag is missing or misplaced (the checks are
    # joined with "or" so a single wrong flag is enough to fail).
    if (
        len(sys.argv) != 7
        or sys.argv[1] != "--filename"
        or sys.argv[3] != "--number-runs"
        or sys.argv[5] != "--number-reiterations"
    ):
        _exit_with_usage()

    # Parse both counts once, up front, so a non-numeric value produces the
    # usage text instead of a traceback part-way through the job.
    try:
        numberRuns = int(sys.argv[4])
        numCall = int(sys.argv[6])
    except ValueError:
        _exit_with_usage()

    # The generator does not depend on the iteration, so build it once.
    fake = Factory.create()

    for _ in range(numCall):
        # The timestamp has one-second resolution and the file is opened in
        # append mode, so reiterations that start within the same second
        # share one file (each adding its own header row).
        destFile = sys.argv[2] + "-" + time.strftime("%Y%m%d%H%M%S") + ".txt"
        print("Creating File: " + destFile)

        # Measure elapsed time with the clock rather than by subtracting
        # formatted timestamps, which is wrong across minute/hour boundaries.
        started = time.time()
        # "with" guarantees the file is closed even if generation fails.
        with open(destFile, "a") as file_object:
            # NOTE(review): header labels are kept as-is; the rows written by
            # create_names are uuid, first name, last name, country.
            file_object.write("uuid,username,name,country\n")
            create_names(numberRuns, file_object, fake)
        elapsed = time.time() - started

        print("Generated " + str(numberRuns) + " Records")
        print("Job took: %.2f seconds" % elapsed)

        # Guard the rate: a very quick run can report zero elapsed time.
        if elapsed > 0:
            print("That is %d Records per second!" % (numberRuns / elapsed))
        else:
            print("That is too fast to measure Records per second!")
        print("\n")