roshanoracle
1/30/2020 - 2:33 PM

SDT JSON Parser (by puuid) (Python 2.7)

# sdt_jsonBuilder_byId_export

# Sample Command : python sdt_jsonBuilder_byId_export.py /Users/me/myfiles/ bkuuid,puuid,win_time,campaign_id bkuuid,puuid,win_time puuid 123423,2343243 campaign_id multi limit=1000
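# Assumed input : tab-separated SDT log lines, one record per line, with columns matching the
# declared field list. Illustrative (hypothetical) line for fields bkuuid,puuid,win_time,campaign_id :
#   AAA111<TAB>abc-123<TAB>1580394780<TAB>123423
# "multi" output writes one standalone JSON object per group key per line, e.g. :
#   {"abc-123": [{"bkuuid": "AAA111", "win_time": "1580394780", "win_time_friendly": "...", "file_name": "my_file.log", "file_line_number": 1}]}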

import sys
from itertools import islice
import inspect
import datetime
import time 
import os
import json

# Build a JSON export (grouped by a chosen key) from tab-separated SDT log files.
def sdt_jsonBuilder_byId_export(folder_name,field_list,field_list_export,group_key,campaignid_list,campaignfield,json_type,optional_limit):  

  filenames = []

  print "\nCHECKING FILES IN FOLDER TO PARSE..."

  # Checking file names to parse
  for file_name in os.listdir(folder_name):

    full_file_name = folder_name + "/" + file_name
    
    if full_file_name.find(".") > -1:
      
      if full_file_name.find('.log') == -1:

        print "CHECKING FILE NAME : NOT OKAY (SKIPPING) : Does not contain .log extension : " + file_name

        continue    

    print "CHECKING FILE NAME : OKAY : " + file_name    
    filenames.append(full_file_name)
    
  # Write File  
  print "\nJOINING FILES INTO JSON FORMAT...please wait...\n"
  
  # JSON formatter
  json_data = {}
  group_key_column = field_list.index(group_key)
  campaign_id_column = field_list.index(campaignfield)

  # Check for limits 
  if optional_limit:

    #Check type of limit
    limit_type = optional_limit.split("=")[0]
    limit_value = optional_limit.split("=")[1]
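    # e.g. (illustrative) "limit=1000"        -> limit_type "limit", limit_value "1000"
    #      (illustrative) "id=abc-123,def456" -> limit_type "id",    limit_value "abc-123,def456"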

    # If Limiting by Number of IDs
    if limit_type == "limit":

      print "Note : Will only return the first ",limit_value," '",group_key,"'s\n"
      limit_value = int(limit_value) + 1

    # If Limiting by specific IDs
    elif limit_type == "id":      
            
      limit_value = limit_value.split(',')
      print "Note : Will only parse data for the following '",group_key.strip(),"'s : ",limit_value,"\n"

  # LOOP THROUGH FILES

  # Create error file
  row_errors = []
  row_errors.append(["file_name","row","field_list_length","line_fields","line\n"])
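  # Each error row records : source file name, line number, expected field count, actual field count, raw split line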

  # Set up file counter
  file_counter = 0

  for fname in filenames:      

    with open(fname) as infile:

      # Console Log
      file_counter = file_counter + 1
      print "CHECKING FILE ", file_counter,"/",len(filenames)," : ",fname.split("/")[-1]

      # Create line number
      line_number = 0

      # Loop through each line in the file
      for line in infile:
    
        # Record Line Number
        line_number = line_number + 1        

        # Split line into columns on tabs
        line_split = line.split('\t')

        # Ensure correct number of columns
        if len(line_split) != len(field_list):
          print "SKIPPING LINE : different number of fields than declared delimited field list (see error file for error)"          
          error_row = [fname.split("/")[-1],str(line_number),str(len(field_list)),str(len(line_split)),str(line_split)+"\n"]          
          row_errors.append(error_row)

        else:

          # Default : Line should be processed
          process_line = True

          # CHECK IF WE SHOULD PROCESS LINE
          # Check if line qualifies for the campaign id we want (if we're checking for campaign IDs)
          if isinstance(campaignid_list, list):
            
            line_campaign_id = line_split[campaign_id_column]

            if line_campaign_id not in campaignid_list:
                            
              process_line = False                    
          
          # If limiting by group_key, check if Group Key present
          if optional_limit and limit_type == "id":
            
            #Force PUUIDs to lowercase to make life easier
            if "partner_uuid" in group_key:
              
              line_split[group_key_column] = line_split[group_key_column].lower()

            if line_split[group_key_column] not in limit_value:

              process_line = False

            else:

              print "Group Key found! '",line_split[group_key_column],"'"

          # Check if we're going to process the line            
          if process_line:

            # Ensure JSON export data has an entry for this group key
            group_value = line_split[group_key_column]

            if group_value not in json_data:
              
              json_data[group_value] = []

            # Loop through line and populate JSON object for this line (and add it to entry in 'json_data')
            json_line = {}
            col_number = 0        

            # Loop through the columns in this line and process them
            for column in line_split:

              clean_column = column.replace("\n","")
                                            
              # Check for CATS
              if field_list[col_number] == "cats" and "cats" in field_list_export:

                json_line[field_list[col_number]] = clean_column.split(",")
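                # i.e. (illustrative) a cats column of "12,34,56" becomes ["12", "34", "56"]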

              # Check for CATS_W_TIMESTAMPS
              elif field_list[col_number] == "cats_w_timestamps" and "cats_w_timestamps" in field_list_export:

                # Create an object with an attribute per timestamp                
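                # Assumed column format : "<epoch>:<cat>,<cat>;<epoch>:<cat>"
                # e.g. (illustrative) "1580394780:12,34;1580394840:56" -> {"1580394780": ["12", "34"], "1580394840": ["56"]}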
                ts_split = clean_column.split(';')
                cats_w_timestamps_obj = {}
                for ts in ts_split:
                  this_ts, this_cats = ts.split(":", 1)
                  cats_w_timestamps_obj[this_ts] = this_cats.split(',')

                json_line[field_list[col_number]] = cats_w_timestamps_obj

              # Check for WIN_TIME
              elif field_list[col_number] == "win_time" and "win_time" in field_list_export:

                json_line["win_time_friendly"] = datetime.datetime.fromtimestamp(int(clean_column)).strftime('%Y-%m-%d_%H:%M:%S')
                json_line[field_list[col_number]] = clean_column


              elif field_list[col_number] in field_list_export:

                json_line[field_list[col_number]] = clean_column
                                    
              col_number = col_number + 1

            # Add source file name and line number to the object (once per line, after the column loop)
            json_line["file_line_number"] = line_number
            json_line["file_name"] = fname.split('/')[-1]

            # Add the new object for this line into the JSON export              
            json_data[group_value].append(json_line)

  # Calculating time for files  
  currenttime = datetime.datetime.fromtimestamp(int(time.time())).strftime('%Y-%m-%d_%H_%M_%S')

  # Campaign id label for output file names ('all' is used as-is, lists are '-' joined)
  campaign_label = "-".join(campaignid_list) if isinstance(campaignid_list, list) else campaignid_list

  # Create Error File (if needed)
  if len(row_errors) > 1:
    print "\nCreating Error File...\n"
    
    error_file_name = folder_name + "/" + "joined_file_campaignids=" + campaign_label + "_" + json_type + "_" + currenttime + "_errors.tsv"

    with open(error_file_name, 'w') as outfile:

      for row in row_errors:

        outfile.write("\t".join(row))  
    
    print "\nERROR FILE CREATED IN YOUR FOLDER"
    print "\nERROR FILE NAME = " + error_file_name + "\n"

  # Create JSON file
  print "Writing JSON data to file...\n"
  
  # Set file name
  new_file_name = folder_name + "/" + "joined_file_campaignids=" + campaign_label + "_" + json_type + "_" + currenttime + ".json"  

  with open(new_file_name, 'w') as outfile:

    # Single Line
    if json_type == "single":

      if optional_limit:
        print "\n NOTE : Max Export Customer Count cannot be done with 'single' json export - must use 'multi' argument"
      
      json.dump(json_data,outfile)
      #json.dump(json_data,outfile,indent=4)

    # Multi-Line
    if json_type == "multi":
                      
      # If there's a limit and it's ID type : export just that row
      if optional_limit and limit_type == "id":

        # Export data for each PUUID
        for puuid in limit_value:

          if puuid in json_data:

            print "Exporting data for '",puuid,"'..."
            export = {puuid:json_data[puuid]}              
            json.dump(export,outfile)  
            outfile.write("\n")

          else:

            print "'",puuid, "' not found in data so not exported..."
            

      # Otherwise export the whole file/limit of IDs
      else:

        if optional_limit and limit_type == "limit":
          current_customer_count = 0  

        for customer in json_data:                    

          # Check for limit and break if hit
          if optional_limit and limit_type == "limit":

            current_customer_count = current_customer_count + 1

            if current_customer_count == limit_value:
              print "ID limit=",limit_value-1," reached : stopping export"
              break

          print "Exporting data for '",puuid,"'..."
          export = {customer:json_data[customer]}              
          json.dump(export,outfile)  
          outfile.write("\n")
  
  print "\nNEW FILE CREATED IN YOUR FOLDER"
  print "\nNEW FILE NAME = " + new_file_name + "\n"
  sys.exit(0)


###
# Start Script
def main():


  if len(sys.argv) < 8:
    print '\nTo run : python ./sdt_jsonBuilder_byId_export.py {(1) folder where your files are} {(2) SDT field list} {(3) output fields} {(4) group-key field name} {(5) comma-delimited campaign ids} {(6) campaign_id field name} {(7) "multi" or "single" JSON objects} {(8) optional "limit=XXXX" or "id=YYYYY"}\n'
    
    print "(1) Folder location where your files are"
    print "(2) Comma-delimited SDT fields : support fields are 'obfs_bkuuid','partner_uuid', bkuuid','cats','win_time','cats_w_timestamps','tag_uri','pixel_url','referrer','primary_id','primary_id_wo_prefix','pixel_id','ip_address','price'"
    print "(3) Comma-delimited SDT fields for output"
    print "(4) Which field should we group records by (TIP! Use 'partner_uuid' for PUUIDs"
    print "(5) Either ',' delimit your chosen campaign ids or set to 'all'"
    print "(6) Confirm the name of the 'campaign_id' field in your SDT field names (above)"
    print "(7)'multi' to have one object per set of grouped SDT rows or 'single' to put them in a giant JSON object (set 'multi' highly recommended)"    
    print "(8) #Optional Set 'limit=NUMBER' to only output first X objects (e.g. PUUIDs) or 'id=VALUE,' to only output an object for a specific PUUID"    
    
    print '\nAll files in folder must have either ".log" or no extension, e.g. "file_name.log" or "file_name"\n'
    print 'For example : "python sdt_jsonBuilder_byId_export.py /Users/me/myfiles/ bkuuid,puuid,win_time,campaign_id bkuuid,puuid,win_time puuid 123423,2343243 campaign_id multi limit=1000"\n'
    sys.exit(1)

  print "\nSCRIPT STARTING:\n"

  my_folder_name = sys.argv[1]
  field_list = sys.argv[2].split(',')
  field_list_export = sys.argv[3].split(',')
  group_key = sys.argv[4]  
  campaignid_list = sys.argv[5]
  if campaignid_list != "all":
    campaignid_list = campaignid_list.split(",")
  campaignfield = sys.argv[6]
  json_type = sys.argv[7]

  print "SEE ARGUMENTS BELOW:\n"
  print "Folder Name : " + my_folder_name
  print "Field List : ",field_list
  print "Field List Export : ",field_list_export
  print "Group Key : " + group_key  
  print "Campaign ID List : ",campaignid_list
  print "Campaign ID Field : ",campaignfield
  
  # Handle Optional Limit
  if len(sys.argv) == 9:    
    
    # Allow forcing puuid to lower case to make life easier
    if "partner_uuid" in group_key:
      
      optional_limit = sys.argv[8].lower()
      group_key_force_lower = True

    else:

      optional_limit = sys.argv[8]
      group_key_force_lower = False

    if ("limit=" not in optional_limit) and ("id=" not in optional_limit):
      print "\nError : Optional limit syntax must be 'id=XXXXX' or 'limit=YYYYY'\n"
      sys.exit(1)
    print "Optional Limit = ",optional_limit
    if group_key_force_lower:
      print "\nINFO : 'partner_uuid' detected - forcing 'partner_uuid' values to lower case...\n"
  else:
    optional_limit = False

  print ""    
  
  if my_folder_name:
    sdt_jsonBuilder_byId_export(my_folder_name,field_list,field_list_export,group_key,campaignid_list,campaignfield,json_type,optional_limit)

if __name__ == '__main__':
  main()