straight-shoota
7/31/2017 - 1:34 PM

Faster Command Line Tools in Crystal

Faster Command Line Tools in Crystal

data: ngrams.tsv

ngrams.tsv:
	curl https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-0.gz | gunzip > ngrams.tsv

benchmark: data
	crystal cli_bench.cr --release --no-debug -- ngrams.tsv 1 2

build: bin/crystal_nuts bin/crystal_int bin/crystal_array_with_backup bin/crystal_bench

.PHONY: bin/crystal_bench
bin/crystal_bench: bin
	crystal build cli_bench.cr --release --no-debug -o $@

.PHONY: bin/crystal_array_with_backup
bin/crystal_array_with_backup: bin
	crystal build cli.cr --release --no-debug -o $@ -Darray_with_backup

.PHONY: bin/crystal_int
bin/crystal_int: bin
	crystal build cli.cr --release --no-debug -o $@
	
.PHONY: bin/crystal_nuts
bin/crystal_nuts: bin
	crystal build cli_nuts.cr --release --no-debug -o $@
	
bin:
	mkdir bin

clean:
	rm ngrams.tsv
	rm -r bin

This tries to solve the challenge of a fast parser and aggregator for a delimter-separated data file in Crystal.

Here are some benchmarks on my fairly old laptop with Ubuntu on Linux (so total speed is quite low) and not comparable with the results from the linked benchmarks:

                             Naive   0.14  (  7.37s ) (± 6.05%)  1.58× slower
                            Simple   0.12  (  8.05s ) (± 5.71%)  1.73× slower
                         IntParser    0.2  (  5.06s ) (± 4.63%)  1.09× slower
                             Array   0.21  (  4.83s ) (± 6.63%)  1.04× slower
                   ArrayWithBackup   0.21  (  4.66s ) (± 1.50%)       fastest
          Naive with cached stream   0.13  (  7.62s ) (± 4.11%)  1.64× slower
         Simple with cached stream   0.14  (  7.08s ) (± 1.51%)  1.52× slower
      IntParser with cached stream   0.19  (  5.35s ) (±10.71%)  1.15× slower
          Array with cached stream    0.2  (  4.98s ) (± 7.27%)  1.07× slower
ArrayWithBackup with cached stream    0.2  (  4.94s ) (± 7.10%)  1.06× slower

Running Crystal IntParser against V4: Stop using strings from the Go implementation yields the following results on my machine:

$ time ./crystal_int ngrams.tsv 1 2
max_key: 2006 sum: 22569013

real    0m6.689s
user    0m5.016s
sys     0m1.031s

$ time ./go_v4 ngrams.tsv 1 2
max_key: 2006 sum: 22569013

real    0m9.679s
user    0m8.016s
sys     0m1.438s
require "./faster"

file_path = ARGV[0]? || "./ngrams.tsv"
key_index = ARGV[1]?.try(&.to_i) || 1
value_index = ARGV[2]?.try(&.to_i) || 2

File.open(file_path) do |file|
  result = {% if flag?("array_with_backup") %}Faster::ArrayWithBackup{% else %}Faster::IntParser{% end %}.process_data(file, key_index, value_index)
  puts "max_key: #{result[0]} sum: #{result[1]}"
end
require "./faster"
require "benchmark"

file_path = ARGV[0]? || "./ngrams.tsv"
key_index = ARGV[1]?.try(&.to_i) || 1
value_index = ARGV[2]?.try(&.to_i) || 2

IMPLEMENTATIONS = %w(Naive Simple IntParser Array ArrayWithBackup)

file = File.open(file_path)
Benchmark.ips(25, 0) do |x|
  {% for implementation in IMPLEMENTATIONS %}
    x.report({{ implementation }}) do
      File.open(file_path) do |file|
        Faster::{{implementation.id}}.process_data(file, key_index, value_index)
      end
    end
  {% end %}
  {% for implementation in IMPLEMENTATIONS %}
    x.report("{{ implementation.id }} with cached stream") do
      file.rewind
      Faster::{{implementation.id}}.process_data(file, key_index, value_index)
    end
  {% end %}
end

at_exit do
  file.close
end
require "./faster"
require "benchmark"

file_path = ARGV[0]? || "./ngrams.tsv"
key_index = ARGV[1]?.try(&.to_i) || 1
value_index = ARGV[2]?.try(&.to_i) || 2

data = IO::Memory.new
time_loading = Benchmark.measure do
  File.open(file_path) do |file|
    IO.copy(file, data)
    data.rewind
  end
end

puts "Time to load data into memory:"
puts time_loading

time_executing = Benchmark.measure do
  result = {% if flag?("array_with_backup") %}Faster::ArrayWithBackup{% else %}Faster::IntParser{% end %}.process_data(data, key_index, value_index)
  puts "max_key: #{result[0]} sum: #{result[1]}"
end

puts
puts "Time to process data:"
puts time_executing
module Faster
  DELIMITER = '\t'
  NL = '\n'

  module Naive
    def self.process_lines(io, key_index, value_index)
      io.each_line do |line|
        fields = line.split(DELIMITER)
        yield fields[key_index], fields[value_index]
      end
    end

    def self.process_data(io, key_index, value_index)
      sums = {} of Int32 => Int32
      process_lines(io, key_index, value_index) do |key, value|
        key = key.to_i
        sums[key] = sums.fetch(key, 0) + value.to_i
      end
      sums.max_by &.[](1)
    end
  end

  module Simple
    def self.process_lines(io, key_index, value_index)
      field_index = 0
      key = IO::Memory.new
      value = IO::Memory.new

      io.each_char do |c|
        case c
        when NL
          yield key.to_s, value.to_s
          key.clear
          value.clear
          field_index = 0
        when DELIMITER
          field_index += 1
        else
          case field_index
          when key_index
            key << c
          when value_index
            value << c
          end
        end
      end
    end

    def self.process_data(io, key_index, value_index)
      sums = {} of Int32 => Int32
      process_lines(io, key_index, value_index) do |key, value|
        key = key.to_i
        sums[key] = sums.fetch(key, 0) + value.to_i
      end
      sums.max_by &.[](1)
    end
  end

  module IntParser
    def self.process_lines(io, key_index, value_index)
      field_index = 0
      key = 0
      value = 0

      io.each_char do |c|
        case c
        when '\n'
          yield key, value
          key = 0
          value = 0
          field_index = 0
        when DELIMITER
          field_index += 1
        else
          case field_index
          when key_index
            key = key * 10 + c.to_i
          when value_index
            value = value * 10 + c.to_i
          end
        end
      end
    end

    def self.process_data(io, key_index, value_index)
      sums = {} of Int32 => Int32
      process_lines(io, key_index, value_index) do |key, value|
        sums[key] = sums.fetch(key, 0) + value
      end
      sums.max_by &.[](1)
    end
  end

  module Array
    def self.process_lines(io, key_index, value_index)
      field_index = 0
      key = 0
      value = 0

      io.each_char do |c|
        case c
        when '\n'
          yield key, value
          key = 0
          value = 0
          field_index = 0
        when DELIMITER
          field_index += 1
        else
          case field_index
          when key_index
            key = key * 10 + c.to_i
          when value_index
            value = value * 10 + c.to_i
          end
        end
      end
    end

    def self.process_data(io, key_index, value_index)
      sums = ::Array(Int32).new(4096, 0)
      process_lines(io, key_index, value_index) do |key, value|
        sums[key] += value
      end

      max = {-1, 0}
      sums.each_with_index do |value, key|
        max = {key, value} if value > max[1]
      end

      max
    end
  end

  module ArrayWithBackup
    ARRAY_SIZE = 4096

    def self.process_lines(io, key_index, value_index)
      field_index = 0
      key = 0
      value = 0

      io.each_char do |c|
        case c
        when '\n'
          yield key, value
          key = 0
          value = 0
          field_index = 0
        when DELIMITER
          field_index += 1
        else
          case field_index
          when key_index
            key = key * 10 + c.to_i
          when value_index
            value = value * 10 + c.to_i
          end
        end
      end
    end

    def self.process_data(io, key_index, value_index)
      sums = ::Array(Int32).new(ARRAY_SIZE, 0)
      sums_hash = {} of Int32 => Int32
      process_lines(io, key_index, value_index) do |key, value|
        if(key >= ARRAY_SIZE)
          sums_hash[key] = sums_hash.fetch(key, 0) + value
        else
          sums[key] += value
        end
      end

      max = {-1, 0}
      max = sums_hash.max_by &.[](1) unless sums_hash.empty?
      sums.each_with_index do |value, key|
        max = {key, value} if value > max[1]
      end

      max
    end
  end
end