forresty
2/28/2015 - 1:32 PM

to_utf8.rb

#!/usr/bin/env ruby

# Encodes a single Unicode code point as its UTF-8 byte sequence.
#
# The encoding length is chosen from the code point's magnitude:
#   0x0000..0x7F        -> 1 byte   0xxxxxxx
#   0x0080..0x7FF       -> 2 bytes  110xxxxx 10xxxxxx
#   0x0800..0xFFFF      -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
#   0x10000..0x10FFFF   -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
#
# Values in the surrogate range are not special-cased; they are encoded
# like any other 3-byte value.
#
# @param codepoint [Integer] the code point to encode (0..0x10FFFF)
# @return [Array<Integer>] the encoded bytes, most significant first
# @raise [RuntimeError] if codepoint matches none of the ranges above
def to_utf8(codepoint)
  case codepoint
  when 0..0x7F
    # ASCII is encoded as itself.
    [codepoint]
  when 0x80..0x7FF
    [
      0b11000000 + (codepoint >> 6),
      0b10000000 + (codepoint & 0b111111)
    ]
  when 0x800..0xFFFF
    [
      0b11100000 + (codepoint >> 12),
      0b10000000 + ((codepoint >> 6) & 0b111111),
      0b10000000 + (codepoint & 0b111111)
    ]
  when 0x10000..0x10FFFF
    # Upper bound is the last Unicode code point (U+10FFFF), so that
    # plane 16 (0x100000..0x10FFFF) is accepted as well.
    [
      0b11110000 + (codepoint >> 18),
      0b10000000 + ((codepoint >> 12) & 0b111111),
      0b10000000 + ((codepoint >> 6) & 0b111111),
      0b10000000 + (codepoint & 0b111111)
    ]
  else
    raise 'codepoint too large'
  end
end

# Self-check: for every code point from 0 through 0xFFFFF, compare the
# bytes produced by to_utf8 with the bytes Ruby's Array#pack('U*') yields,
# and abort with a diagnostic on the first mismatch.
0.upto(0xFFFFF) do |cp|
  native = [cp].pack('U*').bytes
  encoded = to_utf8(cp)
  next if native == encoded

  raise "WA
        codepoint: #{cp}
          to_utf8: #{encoded}
      ruby native: #{native}
    "
end

# Reached only when every code point in the range matched.
puts "success!"