# sdt_jsonBuilder_byId_export
# Sample Command : python sdt_jsonBuilder_byId_export.py /Users/me/myfiles/ bkuuid,puuid,win_time,campaign_id puuid,win_time puuid 123423,2343243 campaign_id multi limit=1000
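# Input files are expected to hold one tab-delimited SDT row per line. For example (illustrative
# values only), with a field list of "bkuuid,puuid,win_time,campaign_id" a matching line would be:
#   KUD6xF9sk81x<TAB>abc-123<TAB>1426538400<TAB>123423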
import sys
import datetime
import time
import os
import json
# Parse tab-delimited SDT log files and export their rows as JSON objects grouped by an ID field.
def sdt_jsonBuilder_byId_export(folder_name,field_list,field_list_export,group_key,campaignid_list,campaignfield,json_type,optional_limit):
filenames = []
print "\nCHECKING FILES IN FOLDER TO PARSE..."
# Checking file names to parse
for file_name in os.listdir(folder_name):
full_file_name = folder_name + "/" + file_name
if full_file_name.find(".") > -1:
if full_file_name.find('.log') == -1:
print "CHECKING FILE NAME : NOT OKAY (SKIPPING) : Does not contain .log extension : " + file_name
continue
print "CHECKING FILE NAME : OKAY : " + file_name
filenames.append(full_file_name)
# Write File
print "\nJOINING FILES INTO JSON FORMAT...please wait...\n"
# JSON formatter
    json_data = {}
group_key_column = field_list.index(group_key)
campaign_id_column = field_list.index(campaignfield)
# Check for limits
if optional_limit:
#Check type of limit
limit_type = optional_limit.split("=")[0]
limit_value = optional_limit.split("=")[1]
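        # e.g. (illustrative) "limit=1000" splits into limit_type "limit" / limit_value "1000",
        # and "id=abc,def" splits into limit_type "id" / limit_value "abc,def"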
# If Limiting by Number of IDs
if limit_type == "limit":
print "Note : Will only return the first ",limit_value," '",group_key,"'s\n"
limit_value = int(limit_value) + 1
# If Limiting by specific IDs
elif limit_type == "id":
limit_value = limit_value.split(',')
print "Note : Will only parse data for the following '",group_key.strip(),"'s : ",limit_value,"\n"
# LOOP THROUGH FILES
# Create error file
row_errors = []
row_errors.append(["file_name","row","field_list_length","line_fields","line\n"])
# Set up file counter
file_counter = 0
for fname in filenames:
with open(fname) as infile:
# Console Log
file_counter = file_counter + 1
print "CHECKING FILE ", file_counter,"/",len(filenames)," : ",fname.split("/")[-1]
# Create line number
line_number = 0
# Loop through each line in the file
for line in infile:
# Record Line Number
line_number = line_number + 1
                # Split the tab-delimited line into its columns
line_split = line.split('\t')
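                # e.g. a four-field line "a\tb\tc\td\n" splits into ["a", "b", "c", "d\n"]
                # (the trailing newline stays on the last column and is stripped later)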
# Ensure correct number of columns
if len(line_split) != len(field_list):
print "SKIPPING LINE : different number of fields than declared delimited field list (see error file for error)"
error_row = [fname.split("/")[-1],str(line_number),str(len(field_list)),str(len(line_split)),str(line_split)+"\n"]
row_errors.append(error_row)
else:
# Default : Line should be processed
process_line = True
# CHECK IF WE SHOULD PROCESS LINE
# Check if line qualifies for the campaign id we want (if we're checking for campaign IDs)
                    if isinstance(campaignid_list, list):
line_campaign_id = line_split[campaign_id_column]
if line_campaign_id not in campaignid_list:
process_line = False
# If limiting by group_key, check if Group Key present
if optional_limit and limit_type == "id":
#Force PUUIDs to lowercase to make life easier
if "partner_uuid" in group_key:
line_split[group_key_column] = line_split[group_key_column].lower()
if line_split[group_key_column] not in limit_value:
process_line = False
else:
print "Group Key found! '",line_split[group_key_column],"'"
# Check if we're going to process the line
                    if process_line:
# Ensure JSON export data has an entry for this group key
group_value = line_split[group_key_column]
if group_value not in json_data:
json_data[group_value] = []
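                        # json_data is shaped like {"<group value>": [<one object per matching row>, ...]}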
# Loop through line and populate JSON object for this line (and add it to entry in 'json_data')
json_line = {}
col_number = 0
# Loop through the columns in this line and process them
for column in line_split:
clean_column = column.replace("\n","")
# Check for CATS
if field_list[col_number] == "cats" and "cats" in field_list_export:
json_line[field_list[col_number]] = clean_column.split(",")
# Check for CATS_W_TIMESTAMPS
elif field_list[col_number] == "cats_w_timestamps" and "cats_w_timestamps" in field_list_export:
# Create an object with an attribute per timestamp
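                                # e.g. (illustrative) "1426538400:123,456;1426542000:789" becomes
                                #   {"1426538400": ["123", "456"], "1426542000": ["789"]}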
ts_split = clean_column.split(';')
cats_w_timestamps_obj = {}
for ts in ts_split:
this_ts = ts.split(":")[0]
this_cats = ts.split(":")[1].split(',')
cats_w_timestamps_obj[this_ts]=this_cats
json_line[field_list[col_number]] = cats_w_timestamps_obj
# Check for WIN_TIME
elif field_list[col_number] == "win_time" and "win_time" in field_list_export:
json_line["win_time_friendly"] = datetime.datetime.fromtimestamp(int(clean_column)).strftime('%Y-%m-%d_%H:%M:%S')
json_line[field_list[col_number]] = clean_column
elif field_list[col_number] in field_list_export:
json_line[field_list[col_number]] = clean_column
col_number = col_number + 1
                        # Record the source file name and line number on the object
json_line["file_line_number"] = line_number
json_line["file_name"] = fname.split('/')[-1]
# Add the new object for this line into the JSON export
json_data[group_value].append(json_line)
# Calculating time for files
    currenttime = datetime.datetime.fromtimestamp(int(time.time())).strftime('%Y-%m-%d_%H_%M_%S')
    # campaignid_list is either a list of IDs or the literal string "all" - build a file-name label for both
    campaignids_label = "-".join(campaignid_list) if isinstance(campaignid_list, list) else campaignid_list
# Create Error File (if needed)
if len(row_errors) > 1:
print "\nCreating Error File...\n"
        error_file_name = folder_name + "/" + "joined_file_campaignids=" + campaignids_label + "_" + json_type + "_" + currenttime + "_errors.tsv"
with open(error_file_name, 'w') as outfile:
for row in row_errors:
outfile.write("\t".join(row))
print "\nERROR FILE CREATED IN YOUR FOLDER"
print "\nERROR FILE NAME = " + error_file_name + "\n"
# Create JSON file
print "Writing JSON data to file...\n"
# Set file name
    new_file_name = folder_name + "/" + "joined_file_campaignids=" + campaignids_label + "_" + json_type + "_" + currenttime + ".json"
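    # e.g. (illustrative) joined_file_campaignids=123423-2343243_multi_2015-03-16_22_00_00.json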
with open(new_file_name, 'w') as outfile:
# Single Line
if json_type == "single":
if optional_limit:
print "\n NOTE : Max Export Customer Count cannot be done with 'single' json export - must use 'multi' argument"
json.dump(json_data,outfile)
#json.dump(json_data,outfile,indent=4)
# Multi-Line
if json_type == "multi":
# If there's a limit and it's ID type : export just that row
if optional_limit and limit_type == "id":
# Export data for each PUUID
for puuid in limit_value:
if puuid in json_data:
print "Exporting data for '",puuid,"'..."
export = {puuid:json_data[puuid]}
json.dump(export,outfile)
outfile.write("\n")
else:
print "'",puuid, "' not found in data so not exported..."
# Otherwise export the whole file/limit of IDs
else:
if optional_limit and limit_type == "limit":
current_customer_count = 0
for customer in json_data:
# Check for limit and break if hit
if optional_limit and limit_type == "limit":
current_customer_count = current_customer_count + 1
if current_customer_count == limit_value:
print "ID limit=",limit_value-1," reached : stopping export"
break
print "Exporting data for '",puuid,"'..."
export = {customer:json_data[customer]}
json.dump(export,outfile)
outfile.write("\n")
print "\nNEW FILE CREATED IN YOUR FOLDER"
print "\nNEW FILE NAME = " + new_file_name + "\n"
    sys.exit(0)
###
# Start Script
def main():
if len(sys.argv) < 8:
        print '\nUsage: "python ./sdt_jsonBuilder_byId_export.py {(1)FOLDER_OF_WHERE_FILES_ARE} {(2)SDT field list} {(3)output fields} {(4)group-key-field-name} {(5)comma-delimited campaign ids} {(6)campaign_id fieldname} {(7)"multi" or "single" JSON objects} {(8)optional "limit=XXXX" or "id=YYYYY"}" \n'
print "(1) Folder location where your files are"
print "(2) Comma-delimited SDT fields : support fields are 'obfs_bkuuid','partner_uuid', bkuuid','cats','win_time','cats_w_timestamps','tag_uri','pixel_url','referrer','primary_id','primary_id_wo_prefix','pixel_id','ip_address','price'"
print "(3) Comma-delimited SDT fields for output"
print "(4) Which field should we group records by (TIP! Use 'partner_uuid' for PUUIDs"
print "(5) Either ',' delimit your chosen campaign ids or set to 'all'"
print "(6) Confirm the name of the 'campaign_id' field in your SDT field names (above)"
print "(7)'multi' to have one object per set of grouped SDT rows or 'single' to put them in a giant JSON object (set 'multi' highly recommended)"
print "(8) #Optional Set 'limit=NUMBER' to only output first X objects (e.g. PUUIDs) or 'id=VALUE,' to only output an object for a specific PUUID"
print '\nAll files in folder must have either ".log" or no extension, e.g. "file_name.log" or "file_name"\n'
print 'For example, "python sdt_jsonBuilder_byId_export.py /Users/me/myfiles/ bkuuid,puuid,win_time puuid 123423,2343243 campaign_id multiple limit=1000\n'
sys.exit(1)
print "\nSCRIPT STARTING:\n"
my_folder_name = sys.argv[1]
field_list = sys.argv[2].split(',')
field_list_export = sys.argv[3].split(',')
group_key = sys.argv[4]
campaignid_list = sys.argv[5]
if campaignid_list != "all":
campaignid_list = campaignid_list.split(",")
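    # e.g. "123423,2343243" becomes ["123423", "2343243"]; the literal string "all" disables campaign filtering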
campaignfield = sys.argv[6]
json_type = sys.argv[7]
print "SEE ARGUMENTS BELOW:\n"
print "Folder Name : " + my_folder_name
print "Field List : ",field_list
print "Field List Export : ",field_list_export
print "Group Key : " + group_key
print "Campaign ID List : ",campaignid_list
print "Campaign ID Field : ",campaignfield
# Handle Optional Limit
if len(sys.argv) == 9:
# Allow forcing puuid to lower case to make life easier
if "partner_uuid" in group_key:
optional_limit = sys.argv[8].lower()
group_key_force_lower = True
else:
optional_limit = sys.argv[8]
group_key_force_lower = False
if ("limit=" not in optional_limit) and ("id=" not in optional_limit):
print "\nError : Optional limit syntax must be 'id=XXXXX' or 'limit=YYYYY'\n"
sys.exit(1)
print "Optional Limit = ",optional_limit
if group_key_force_lower:
print "\nINFO : 'partner_uuid' detected - forcing 'partner_uuid' values to lower case...\n"
else:
optional_limit = False
print ""
if my_folder_name:
sdt_jsonBuilder_byId_export(my_folder_name,field_list,field_list_export,group_key,campaignid_list,campaignfield,json_type,optional_limit)
if __name__ == '__main__':
main()