gerbal
5/1/2014 - 2:30 AM

generate_inserts.py

import random
import time


def randomMAC():
    """Return a random MAC address string beginning with 00:16:3e.

    The first three octets are fixed.  The fourth is drawn from
    0x00-0x7f and the last two from 0x00-0xff.  Each octet is rendered as
    two lowercase hex digits and the six octets are joined with colons.
    """
    octets = [0x00, 0x16, 0x3e]
    octets.append(random.randint(0x00, 0x7f))
    octets.append(random.randint(0x00, 0xff))
    octets.append(random.randint(0x00, 0xff))
    return ':'.join("%02x" % octet for octet in octets)


def randomIP():
    """Return a random dotted-quad IPv4 address string.

    Every octet is drawn from 1-255.  The first octet is redrawn until it
    is not one of 10, 127, 169, 172 or 192.
    """
    excluded_first_octets = (10, 127, 169, 172, 192)

    while True:
        leading = random.randrange(1, 256)
        if leading not in excluded_first_octets:
            break

    octets = [leading]
    for _ in range(3):
        octets.append(random.randrange(1, 256))
    return ".".join(str(octet) for octet in octets)


def strTimeProp(start, end, format, prop):
    """Return the time lying a given proportion of the way through an interval.

    start and end are strings holding times written in the strftime-style
    pattern given by format; together they delimit the interval
    [start, end].  prop is the fraction of that interval to advance past
    start (0 corresponds to start, 1 to end).  The result is rendered with
    the same format.
    """

    def to_epoch(text):
        # Parse the text and convert it to seconds since the epoch.
        return time.mktime(time.strptime(text, format))

    begin = to_epoch(start)
    span = to_epoch(end) - begin
    chosen = begin + prop * span
    return time.strftime(format, time.localtime(chosen))


def randomDate(start, end, prop):
    """Return the time at fraction prop of the way from start to end.

    start and end must be written as '%m/%d/%Y %I:%M %p' (for example
    "4/25/2014 12:00 am"); the result is rendered in that same layout.
    """
    date_layout = '%m/%d/%Y %I:%M %p'
    return strTimeProp(start, end, date_layout, prop)
    
# Pool of 80 random addresses, generated once when the module runs.
eightyIPs = [randomIP() for _ in range(80)]

# Lookup tables sampled with random.choice by the gen_* functions.

# Hosting providers and owners attached to each generated website.
hosts = ["RunPapa.com", "lucidhost.com",
         "hoststhemost.net", "freedomhostworks.su"]
site_owner = ["megacorp inc", "non-profic llc", "duetches web gbh"]

# A domain name is built as <site> + <tld>, so every suffix must carry its
# own leading dot; a bare "co.uk" would yield names such as "nileco.uk".
tld = [".com", ".gov", ".edu", ".org", ".net", ".io", ".hobbies",
       ".icann-messed-up-with-the-new-gTLDs", ".bogus", ".su", ".co.uk"]
sites = ["nile", "brootoople", "alohationary", "pinkinternets", "unc", "plums",
         "apples", "hotpants", "felinesrus", "email", "plantsforsale", "newtonianagronomy"]


# A page filename is built as <domain>/<page><extension>; the empty string
# gives a page with no extension at all.
file_extensions = [".exe", ".php", ".html",
                   ".asp", ".xml", ".pdf", ".?=1234", ""]
pages = [
    "search", "about", "index", "billing", "admin", "how-do-i-stop-the-burning", "sesamestreet",
    "courses", "watch", "404", "careers", "item", "pies", "shipping", "forks", "magicians-for-hire"]

# Device attributes.  Repeated entries make a value proportionally more
# likely to be picked by random.choice.
device = ["mobile", "tablet", "desktop", "desktop", "mobile"]
browser = ["firefox", "chrome", "safari", "ie6", "mosaic"]
OS = ["windows", "iOS", "OSX", "Linux", "Android"]

# Visit attributes.
referral = ["organic", "referral", "direct"]
city = ["springfield", "jonesboro", "paris", "miloud", "send-help",
        "southbend", "unicode-error", "townville", "citytown"]
country = ["USA", "France", "Belize", "The Moon",
           "United Kingdom", "Germany", "Canada"]


def gen_sites(count=11):
    """Build rows for the Website table, each with a unique random domain.

    Args:
        count: Number of rows to produce.  Defaults to 11, which is what
            the previously hard-coded ``len(return_sites) <= 10`` loop
            condition yielded.

    Returns:
        A list of ``[domain, owner, host]`` lists.  Each domain is an
        entry of ``sites`` concatenated with an entry of ``tld`` and
        appears at most once; owner and host are drawn from ``site_owner``
        and ``hosts``.

    Raises:
        ValueError: If ``count`` exceeds the number of distinct domains
            that ``sites`` and ``tld`` can form.  Without this check the
            rejection loop below would never terminate.
    """
    if count <= 0:
        return []

    distinct_domains = {name + suffix for name in sites for suffix in tld}
    if count > len(distinct_domains):
        raise ValueError(
            "cannot generate %d unique domains; only %d are possible"
            % (count, len(distinct_domains)))

    rows = []
    used_domains = set()
    while len(rows) < count:
        domain = random.choice(sites) + random.choice(tld)
        if domain in used_domains:
            # Already taken: draw again rather than emit a duplicate.
            continue
        used_domains.add(domain)
        rows.append(
            [domain, random.choice(site_owner), random.choice(hosts)])
    return rows


def gen_pages(sites_list):
    """Build rows for the web_Page table, ten pages for every site.

    sites_list holds rows whose first element is the site's domain name.
    For each site, ten distinct names are drawn from ``pages`` and each is
    paired with a random entry of ``file_extensions``.  The result is one
    flat list of ``[filename, page_name, domain]`` lists, where filename
    is ``domain + "/" + page_name + extension``.
    """
    all_rows = []
    for site_row in sites_list:
        site_rows = []
        seen_names = []
        while len(site_rows) < 10:
            name = random.choice(pages)
            extension = random.choice(file_extensions)
            if name in seen_names:
                # Page names must be unique within a site; draw again.
                continue
            path = site_row[0] + "/" + name + extension
            seen_names.append(name)
            site_rows.append([path, name, site_row[0]])
        all_rows.extend(site_rows)
    return all_rows


def gen_devices(count=50):
    """Build rows for the Device table.

    Args:
        count: Number of rows to produce; defaults to 50.

    Returns:
        A list of ``[mac_address, device_type, browser, os]`` lists.  The
        MAC address comes from ``randomMAC()``; the other three values are
        drawn from the ``device``, ``browser`` and ``OS`` tables.
    """
    return [
        [randomMAC(), random.choice(device), random.choice(browser),
         random.choice(OS)]
        for _ in range(count)
    ]


def gen_visit(devices_list, pages_list, sites_list):
    """Build rows for the visit table.

    Every page in pages_list receives between 0 and 19 visits.  Each visit
    is a list laid out as

        [date_time, duration, ip_address, traffic_source, city, country,
         device_id, browser, OS, filename]

    where device_id, browser and OS are elements 0, 2 and 3 of a row
    picked at random from devices_list, and filename is element 0 of the
    page row.  sites_list is accepted but not used.
    """
    window_start = "4/25/2014 12:00 am"
    window_end = "5/2/2014 5:00 pm"

    visits = []
    for page_row in pages_list:
        visit_count = random.randrange(20)
        for _ in range(visit_count):
            visitor = random.choice(devices_list)
            when = randomDate(window_start, window_end, random.random())
            # Exponentially distributed duration whose mean is itself
            # drawn from 1-20, truncated to a whole number.
            mean_duration = random.randint(1, 20)
            duration = str(int(random.expovariate(1.0 / mean_duration)))
            visits.append([
                when,
                duration,
                random.choice(eightyIPs),
                random.choice(referral),
                random.choice(city),
                random.choice(country),
                visitor[0],
                visitor[2],
                visitor[3],
                page_row[0],
            ])
    return visits


def output_sql(table_name, meta_array):
    """Render rows as one ``INSERT INTO ... VALUES(...)`` statement per row.

    Args:
        table_name: Table name, placed into each statement verbatim.
        meta_array: Iterable of rows, each an iterable of strings.

    Returns:
        The statements joined by newlines, each of the form
        ``INSERT INTO <table> VALUES('v1','v2',...);``.  An empty
        ``meta_array`` yields an empty string instead of a statement that
        would insert a single blank value.
    """
    statements = []
    for row in meta_array:
        # Double any embedded single quote so a value cannot end its own
        # string literal early and corrupt the statement.
        values = "','".join(value.replace("'", "''") for value in row)
        statements.append(
            "INSERT INTO " + table_name + " VALUES('" + values + "');")
    return "\n".join(statements)

# Generate the tables in dependency order: pages are built from sites, and
# visits are built from devices and pages.
db_sites_table = gen_sites()
db_pages_table = gen_pages(db_sites_table)
db_device_table = gen_devices()
db_visit_table = gen_visit(db_device_table, db_pages_table, db_sites_table)

websites_sql = output_sql("Website", db_sites_table)
webpage_sql = output_sql("web_Page", db_pages_table)
device_sql = output_sql("Device", db_device_table)
visit_sql = output_sql("visit", db_visit_table)

# Open in text mode: the statements are str, which a binary-mode ("wb+")
# handle rejects on Python 3.  newline="\n" writes the line endings exactly
# as built, and the with-block closes the file even if the write fails.
with open("output.txt", "w", newline="\n", encoding="utf-8") as populate_file:
    populate_file.write("\n".join(
        [websites_sql, webpage_sql, device_sql, visit_sql]))