# ibanez270dx
# 5/19/2015 - 1:15 AM
#
# SKHIP work in progress
#
# SKHIP work in progress
#
require 'io/console'
require 'optparse'
require 'ostruct'
require 'fileutils'

# Program identity, shown in the usage banner and by --version.
NAME    = 'Safari Keyword History Index Parser'
VERSION = 'v0.0.1'
# Moment the script started; feeds the default file names and the final
# elapsed-time report.
TIME    = Time.now

# Render +str+ centred inside a three-line box drawn with double-line box
# characters, sized to the width of the terminal attached to STDOUT.
#
# When STDOUT is not a terminal (output piped or redirected) the window size
# cannot be queried; an 80-column box is drawn instead of raising.
#
# @param str [String] text to place on the middle line
# @return [String] the boxed text, terminated by a newline
def box_me_up(str)
  columns = begin
    STDOUT.winsize[-1].to_i
  rescue SystemCallError
    0 # e.g. Errno::ENOTTY when STDOUT is a pipe or a file
  end
  columns = 80 unless columns > 0 # a reported width of 0 is unusable too

  # Two columns are taken by the corners, four by "║ " and " ║".
  border = '═' * [columns - 2, 0].max
  "╔#{border}╗\n║ #{str.center(columns - 4, ' ')} ║\n╚#{border}╝\n"
end

# Report the exception currently being handled and terminate.
#
# Prints the option parser's banner followed by the message of the active
# exception ($!) and a hint to use --help, then exits with status 1.
# Reads the top-level @option_parser instance variable.
def show_error_and_exit
  failure = $!
  $stdout.puts(@option_parser.banner)
  $stdout.puts("  #{failure}\n    use --help for more information\n\n")
  exit(1)
end

################################################################################
# Command Line Options
################################################################################

# Width of the terminal on STDOUT, used to size the separator line below.
# Querying it raises a SystemCallError (e.g. Errno::ENOTTY) when output is
# piped or redirected, so fall back to 80 columns in that case.
terminal_columns = begin
  STDOUT.winsize[-1].to_i
rescue SystemCallError
  0
end
terminal_columns = 80 unless terminal_columns > 0

# We set default values here.
options = OpenStruct.new
options.name = 'HistoryIndex'
options.path = "/Users/#{ENV['USER']}/Library/Safari/"
options.time = TIME.strftime('%Y%m%d%H%M%S')
options.line = '─' * terminal_columns
options.stdo = true

# File names derived from the base name and the start timestamp.
options.input  = "#{options.name}.sk"
options.output = "#{options.name}-#{options.time}.txt"
options.backup = "#{options.name}-#{options.time}.backup"
options.dump   = "#{options.name}-#{options.time}.dump"

# Start parsing those options
@option_parser = OptionParser.new do |opts|
  opts.program_name = NAME
  opts.version = VERSION
  opts.banner = "#{box_me_up("#{NAME} #{VERSION}")}
    Usage: ruby skhip.rb [options]\n\n"

  opts.separator "  Specific options:"
  opts.on('-i', '--input FILE', 'Path to HistoryIndex.sk') do |input|
    # Open and immediately close the file: a missing or unreadable path
    # still raises during option parsing, but no handle is left open.
    File.open(input, &:close)
    options.input  = input
    # File.basename also copes with a bare file name (no directory part),
    # for which the previous slash-anchored regexp produced nil.
    options.name   = File.basename(input, '.*')
    options.output = "#{options.name}-#{options.time}.txt"
  end

  opts.separator ""
  opts.on('-o', '--output FILE', 'Relative output location') do |output|
    options.output = output
  end

  opts.separator ""
  opts.on('-d', '--use-dump [FILE]',
          'Skip dumping process by specifying an existing dump file.',
          'Leave blank to use default path.') do |dump|
    # The argument is optional: a bare -d yields nil, which must not
    # overwrite the default dump path.
    if dump
      File.open(dump, &:close)
      options.dump = dump
    end
  end

  opts.separator ""
  opts.on_tail("-h", "--help", "What you're looking at :P") do
    puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    puts opts.program_name
    puts opts.version
    exit
  end
end

# Parse ARGV; anything left over after the switches is treated as an error.
# Only StandardError is rescued so that the SystemExit raised by --help and
# --version ends the program normally instead of being reported as a failure.
begin
  @option_parser.parse!
  raise OptionParser::ParseError, 'arguments provided without switches!' unless ARGV.empty?
rescue StandardError
  show_error_and_exit
end

################################################################################
# Setup
################################################################################

# Produce (or reuse) the xxd dump and collect the text column of every line.
#
# * If a file already exists at options.dump it is used as-is; nothing is
#   appended to it.
# * Otherwise the input is copied to options.backup and that backup is
#   dumped with xxd. The command is run without a shell (argument list plus
#   an :out redirection), so paths containing spaces or shell metacharacters
#   are passed through untouched.
#
# Any failure is reported through show_error_and_exit.
begin
  unless File.exist?(options.dump)
    # Make a copy of the HistoryIndex.sk file
    FileUtils.cp options.input, options.backup

    # Dump it to ASCII chars. system returns false/nil when xxd fails or
    # cannot be started, which would otherwise go unnoticed.
    system('xxd', '-b', '-c', '10', options.backup, out: options.dump) ||
      raise("xxd failed to dump #{options.backup}")
  end

  # Read in all the fragments: the last whitespace-separated field of each
  # dump line.
  # NOTE(review): if a line's text column itself contains a space, only the
  # part after the final space is kept - confirm that loss is acceptable.
  @acc = IO.foreach(options.dump).map { |line| line.split(' ').last }
rescue StandardError
  show_error_and_exit
end

# Make it one big string (empty when the dump had no lines)
dumped = @acc.join

################################################################################
# Parsing
################################################################################

# Ordered substitutions applied to the raw dump. Each entry pairs a pattern
# with the text that replaces every match of it.
parser = [
  # A run of 256 or more dots becomes a segment marker that later stages
  # look for by name.
  { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' },

  # Looks like there's some code in there. It's the only place other than the
  # URLs that has single dots. Everything from a '~' up to a following 'big'
  # (greedy) is collapsed into a single '*'.
  { regexp: /~(.)+big/, replacement: '*' },

  # Remove any single dot that has a non-dot character on both sides.
  { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }
]

# ------------------------------------------------------------------------------
# Disabled experiments, kept for reference
# ------------------------------------------------------------------------------

# replace sets of 3 dots with a single dot
# parser << { regexp: /(?<=[^\.])\.{3}(?=[^\.])/, replacement: ',' }

# remove single non-word characters (between two dots)
# parser << { regexp: /(?<=\.)[](?=\.)/, replacement: '..' }

# remove single "stand-alone" characters
# parser << { regexp: /(?:\.{2,}|\n)[^-\.]{1,2}(?=\.{2,}|\n)/, replacement: '' }

# gonna assume that URL's HTTP part needs some slashes
# parser << { regexp: /http\.\.(?=[\w])/, replacement: 'http://' }

# put a newline before each URL
# parser << { regexp: /(\/?\.{1,})(?=https?)/, replacement: " " }

# Make a big o' line breaky thing if there's a lot of dots
# parser << { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: options.line }

# clean up the ends of the URLs
# parser << { regexp: /\.{2,}(\w|{|}|\\|\d|;)+\n/, replacement: "\n" }

# replace all dot sequences longer than one with a comma
# parser << { regexp: /\.{2,}/, replacement: "," }

# add a newline after "html"... just makes things easier :P
# parser << { regexp: /(?<=html)()[^\n]/, replacement: "\n" }

# surround the headers with a border
# parser << { regexp: /()(?=IA\w+)/, replacement: options.line }
# parser << { regexp: /(?:IADefault)(?:I\w+|T\w+)(\n)/, replacement: options.line }

# Put line breaks in the remaining word blocks
# parser << { regexp: //, replacement: "\n" }

# Execute! Rules run in list order and rewrite the dump in place.
parser.each do |rule|
  # puts rule.inspect
  dumped.gsub!(rule[:regexp], rule[:replacement])
end

# Debug output: show the dump after all substitutions.
puts dumped

################################################################################
# Tokenize that shit
################################################################################
# Walk the cleaned dump one character at a time and cut it into tokens.
#
# * A token ends at the first dot of a run of dots.
# * Tokens longer than one character are kept in @tokens (a bare "http" is
#   stored as "http://"); shorter ones go to @rejected.
# * Characters that are neither token characters nor dots go to @rejected.
last_char = ''
@token    = ''
@tokens   = []
@rejected = []

# Files the token collected so far under @tokens or @rejected and starts a
# new, empty token.
flush_token = lambda do
  if @token.length > 1
    @tokens << (@token == 'http' ? 'http://' : @token)
  else
    @rejected << @token
  end
  @token = ''
end

dumped.each_char do |raw|
  char = raw.strip # whitespace collapses to "" and ends up in @rejected

  # end of a word, add to array
  flush_token.call if char == '.' && last_char != '.'

  # NOTE(review): inside a character class "|" is a literal pipe, so pipes are
  # accepted as token characters - confirm that is intended before removing.
  if char =~ /[\w|\-|\+|&|\=|\?]/
    @token << char
  elsif char != '.'
    @rejected << char
  end
  last_char = char
end

# Input that does not end in a dot leaves a token in progress; keep it
# instead of silently dropping it.
flush_token.call unless @token.empty?


puts @tokens.inspect
# puts @rejected.inspect

################################################################################
# Iterate through tokens to create URL's and newlines
################################################################################

# Turn the token stream into printable entries in @words:
#
# * tokens from "http://" onwards are gathered into one Array per URL,
# * a segment marker outside a URL becomes the separator line,
# * tokens containing "IADefault" are boxed,
# * every other token is passed through unchanged.
@words = []
is_url = false
tmp = [] # parts of the URL currently being collected

@tokens.each do |token|
  segment_marker = token == 'SKHIP-PARSER-SEGMENT'

  # First decide whether this token closes and/or opens a URL.
  if token =~ /^http/ && is_url
    # A new URL begins while one is open: store the finished one.
    @words << tmp
    tmp = []
  elsif segment_marker && is_url
    # A segment marker terminates the open URL.
    @words << tmp
    tmp = []
    is_url = false
  elsif token == 'http://'
    is_url = true
  end

  # Then file the token itself.
  if is_url
    tmp << token
  elsif segment_marker
    @words << options.line
  elsif token =~ /IADefault/
    @words << box_me_up(token)
  else
    @words << token
  end
end

# A URL still being collected when the tokens run out would otherwise be lost.
@words << tmp unless tmp.empty?

puts @words.inspect

# URL parts are joined with dots; the first dot of the joined string (the one
# following the URL's leading token) is then removed.
@words.each do |word|
  puts word.is_a?(Array) ? word.join('.').sub('.', '') : word
end

################################################################################
# Filter Weird Artifacts
################################################################################
# artifacts = []


# collect regexp's and their corresponding replacements
# artifacts << { regexp: /\.\=\=/, replacement: '' }
# artifacts << { regexp: /http0/, replacement: 'http:' }

# artifacts << /http\n.+\n/
# artifacts << /z\.{+\n/
# artifacts << /E\.F\.\w/
# artifacts << /Bud2/
# artifacts << /.?\.["|-]/

# Execute!
# artifacts.each do |artifact|
  # regexp = Regexp.new artifact
  # words.match(regexp).to_a.each do |match|
    # (@removals || @removals=[]) << match.to_s
  # end
  # words.gsub! regexp, ''
# end

# Remove double spaces
# words.gsub!(/\n{2,}/,"\n")
#
# artifacts.each do |a|
#   dumped.gsub!(r[:regexp], r[:replacement])
# end

# puts words

# puts box_me_up('Artifact Removals:')
# @removals.each { |x| puts x }

# output = File.open("history_index_output.txt", 'w+')

# words.each do |word|
#   output.puts word
# end
#
# output.close

# Report the wall-clock time since TIME was recorded, rounded to two decimals.
elapsed_seconds = (Time.now - TIME).round(2)
puts "\n\nFinished! Parsing took #{elapsed_seconds} seconds\n\n"
require 'io/console'
require 'optparse'
require 'ostruct'
require 'fileutils'

# Program identity, shown in the usage banner and by --version.
NAME    = 'Safari Keyword History Index Parser'
VERSION = 'v0.0.1'
# Moment the script started; feeds the default file names and the final
# elapsed-time report.
TIME    = Time.now

# Render +str+ centred inside a three-line box drawn with double-line box
# characters, sized to the width of the terminal attached to STDOUT.
#
# When STDOUT is not a terminal (output piped or redirected) the window size
# cannot be queried; an 80-column box is drawn instead of raising.
#
# @param str [String] text to place on the middle line
# @return [String] the boxed text, terminated by a newline
def box_me_up(str)
  columns = begin
    STDOUT.winsize[-1].to_i
  rescue SystemCallError
    0 # e.g. Errno::ENOTTY when STDOUT is a pipe or a file
  end
  columns = 80 unless columns > 0 # a reported width of 0 is unusable too

  # Two columns are taken by the corners, four by "║ " and " ║".
  border = '═' * [columns - 2, 0].max
  "╔#{border}╗\n║ #{str.center(columns - 4, ' ')} ║\n╚#{border}╝\n"
end

# Report the exception currently being handled and terminate.
#
# Prints the option parser's banner followed by the message of the active
# exception ($!) and a hint to use --help, then exits with status 1.
# Reads the top-level @option_parser instance variable.
def show_error_and_exit
  failure = $!
  $stdout.puts(@option_parser.banner)
  $stdout.puts("  #{failure}\n    use --help for more information\n\n")
  exit(1)
end

################################################################################
# Command Line Options
################################################################################

# Width of the terminal on STDOUT, used to size the separator line below.
# Querying it raises a SystemCallError (e.g. Errno::ENOTTY) when output is
# piped or redirected, so fall back to 80 columns in that case.
terminal_columns = begin
  STDOUT.winsize[-1].to_i
rescue SystemCallError
  0
end
terminal_columns = 80 unless terminal_columns > 0

# We set default values here.
options = OpenStruct.new
options.name = 'HistoryIndex'
options.path = "/Users/#{ENV['USER']}/Library/Safari/"
options.time = TIME.strftime('%Y%m%d%H%M%S')
options.line = '─' * terminal_columns
options.stdo = true

# File names derived from the base name and the start timestamp.
options.input  = "#{options.name}.sk"
options.output = "#{options.name}-#{options.time}.txt"
options.backup = "#{options.name}-#{options.time}.backup"
options.dump   = "#{options.name}-#{options.time}.dump"

# Start parsing those options
@option_parser = OptionParser.new do |opts|
  opts.program_name = NAME
  opts.version = VERSION
  opts.banner = "#{box_me_up("#{NAME} #{VERSION}")}
    Usage: ruby skhip.rb [options]\n\n"

  opts.separator "  Specific options:"
  opts.on('-i', '--input FILE', 'Path to HistoryIndex.sk') do |input|
    # Open and immediately close the file: a missing or unreadable path
    # still raises during option parsing, but no handle is left open.
    File.open(input, &:close)
    options.input  = input
    # File.basename also copes with a bare file name (no directory part),
    # for which the previous slash-anchored regexp produced nil.
    options.name   = File.basename(input, '.*')
    options.output = "#{options.name}-#{options.time}.txt"
  end

  opts.separator ""
  opts.on('-o', '--output FILE', 'Relative output location') do |output|
    options.output = output
  end

  opts.separator ""
  opts.on('-d', '--use-dump [FILE]',
          'Skip dumping process by specifying an existing dump file.',
          'Leave blank to use default path.') do |dump|
    # The argument is optional: a bare -d yields nil, which must not
    # overwrite the default dump path.
    if dump
      File.open(dump, &:close)
      options.dump = dump
    end
  end

  opts.separator ""
  opts.on_tail("-h", "--help", "What you're looking at :P") do
    puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    puts opts.program_name
    puts opts.version
    exit
  end
end

# Parse ARGV; anything left over after the switches is treated as an error.
# Only StandardError is rescued so that the SystemExit raised by --help and
# --version ends the program normally instead of being reported as a failure.
begin
  @option_parser.parse!
  raise OptionParser::ParseError, 'arguments provided without switches!' unless ARGV.empty?
rescue StandardError
  show_error_and_exit
end

################################################################################
# Setup
################################################################################

# Produce (or reuse) the xxd dump and collect the text column of every line.
#
# * If a file already exists at options.dump it is used as-is; nothing is
#   appended to it.
# * Otherwise the input is copied to options.backup and that backup is
#   dumped with xxd. The command is run without a shell (argument list plus
#   an :out redirection), so paths containing spaces or shell metacharacters
#   are passed through untouched.
#
# Any failure is reported through show_error_and_exit.
begin
  unless File.exist?(options.dump)
    # Make a copy of the HistoryIndex.sk file
    FileUtils.cp options.input, options.backup

    # Dump it to ASCII chars. system returns false/nil when xxd fails or
    # cannot be started, which would otherwise go unnoticed.
    system('xxd', '-b', '-c', '10', options.backup, out: options.dump) ||
      raise("xxd failed to dump #{options.backup}")
  end

  # Read in all the fragments: the last whitespace-separated field of each
  # dump line.
  # NOTE(review): if a line's text column itself contains a space, only the
  # part after the final space is kept - confirm that loss is acceptable.
  @acc = IO.foreach(options.dump).map { |line| line.split(' ').last }
rescue StandardError
  show_error_and_exit
end

# Make it one big string (empty when the dump had no lines)
dumped = @acc.join

################################################################################
# Parsing
################################################################################

# Ordered substitutions applied to the raw dump. Each entry pairs a pattern
# with the text that replaces every match of it.
parser = [
  # A run of 256 or more dots becomes a segment marker that later stages
  # look for by name.
  { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' },

  # Looks like there's some code in there. It's the only place other than the
  # URLs that has single dots. Everything from a '~' up to a following 'big'
  # (greedy) is collapsed into a single '*'.
  { regexp: /~(.)+big/, replacement: '*' },

  # Remove any single dot that has a non-dot character on both sides.
  { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }
]

# Execute! Rules run in list order and rewrite the dump in place.
parser.each do |rule|
  # puts rule.inspect
  dumped.gsub!(rule[:regexp], rule[:replacement])
end

################################################################################
# Tokenize that shit
################################################################################
# Walk the cleaned dump one character at a time and cut it into tokens.
#
# * A token ends at the first dot of a run of dots.
# * Tokens longer than one character are kept in @tokens (a bare "http" is
#   stored as "http://"); shorter ones go to @rejected.
# * Characters that are neither token characters nor dots go to @rejected.
last_char = ''
@token    = ''
@tokens   = []
@rejected = []

# Files the token collected so far under @tokens or @rejected and starts a
# new, empty token.
flush_token = lambda do
  if @token.length > 1
    @tokens << (@token == 'http' ? 'http://' : @token)
  else
    @rejected << @token
  end
  @token = ''
end

dumped.each_char do |raw|
  char = raw.strip # whitespace collapses to "" and ends up in @rejected

  # end of a word, add to array
  flush_token.call if char == '.' && last_char != '.'

  # NOTE(review): inside a character class "|" is a literal pipe, so pipes are
  # accepted as token characters - confirm that is intended before removing.
  if char =~ /[\w|\-|\+|&|\=|\?]/
    @token << char
  elsif char != '.'
    @rejected << char
  end
  last_char = char
end

# Input that does not end in a dot leaves a token in progress; keep it
# instead of silently dropping it.
flush_token.call unless @token.empty?

################################################################################
# Iterate through tokens to create URL's and newlines
################################################################################

# Turn the token stream into printable entries in @words:
#
# * tokens from "http://" onwards are gathered into one Array per URL,
# * a segment marker outside a URL becomes the separator line,
# * tokens containing "IADefault" are boxed,
# * every other token is passed through unchanged.
@words = []
is_url = false
tmp = [] # parts of the URL currently being collected

@tokens.each do |token|
  segment_marker = token == 'SKHIP-PARSER-SEGMENT'

  # First decide whether this token closes and/or opens a URL.
  if token =~ /^http/ && is_url
    # A new URL begins while one is open: store the finished one.
    @words << tmp
    tmp = []
  elsif segment_marker && is_url
    # A segment marker terminates the open URL.
    @words << tmp
    tmp = []
    is_url = false
  elsif token == 'http://'
    is_url = true
  end

  # Then file the token itself.
  if is_url
    tmp << token
  elsif segment_marker
    @words << options.line
  elsif token =~ /IADefault/
    @words << box_me_up(token)
  else
    @words << token
  end
end

# A URL still being collected when the tokens run out would otherwise be lost.
@words << tmp unless tmp.empty?

################################################################################
# Print it out
################################################################################

# Print one entry per line. Array entries (collected URL parts) are joined
# with dots and the first dot of the joined string is removed; everything
# else is printed as-is.
@words.each do |word|
  if word.is_a?(Array)
    puts word.join('.').sub('.', '')
  else
    puts word
  end
end

# Report the wall-clock time since TIME was recorded, rounded to two decimals.
elapsed_seconds = (Time.now - TIME).round(2)
puts "\n\nFinished! Parsing took #{elapsed_seconds} seconds\n\n"