retrography
3/30/2013 - 6:13 PM

Extract annotations from PDFs with pdf-reader gem

Extract annotations from PDFs with pdf-reader gem

require 'pdf/reader/page_layout'

# Content-stream receiver that collects, for every markup it was given, the
# glyphs painted inside that markup while a page is walked. Once the walk is
# done, #set_markup_texts turns each markup's glyphs into a UTF-8 string and
# stores it on the markup.
#
# Each markup must respond to +contains?(x, y)+ and +text=+.
class MarkupReceiver
  extend Forwardable

  SPACE = " "

  attr_reader :state, :content, :options

  # Graphics-state, matrix, text-object, text-state and text-positioning
  # operators are all handled by the page state object.
  def_delegators :@state,
                 :save_graphics_state, :restore_graphics_state,
                 :concatenate_matrix,
                 :begin_text_object, :end_text_object,
                 :set_character_spacing, :set_horizontal_text_scaling,
                 :set_text_font_and_size, :font_size,
                 :set_text_leading, :set_text_rendering_mode,
                 :set_text_rise, :set_word_spacing,
                 :move_text_position, :move_text_position_and_set_leading,
                 :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line

  # markups - the markup objects whose covered text should be collected.
  def initialize(markups)
    @markups = markups
  end

  # Called when a new page starts: resets the page state, the content list and
  # the per-markup glyph buckets, and remembers the page's MediaBox.
  def page=(page)
    @state = PDF::Reader::PageState.new(page)
    @content = []
    @characters = Hash.new { |buckets, markup| buckets[markup] = [] }
    @mediabox = page.attributes[:MediaBox]
  end

  # Lays out the glyphs collected for each markup and assigns the resulting
  # string to that markup's +text+.
  def set_markup_texts
    @characters.each_pair do |markup, runs|
      markup.text = PDF::Reader::PageLayout.new(runs, @mediabox).to_s
    end
  end

  #####################################################
  # Text Showing Operators
  #####################################################

  # Tj (AWAY)
  def show_text(string)
    internal_show_text(string)
  end

  # TJ [(A) 120 (WA) 20 (Y)] - strings are shown, every other element is
  # passed to the page state as a positioning adjustment.
  def show_text_with_positioning(params)
    params.each do |param|
      case param
      when String then internal_show_text(param)
      else @state.process_glyph_displacement(0, param, false)
      end
    end
  end

  # '
  def move_to_next_line_and_show_text(str)
    @state.move_to_start_of_next_line
    show_text(str)
  end

  # "
  def set_spacing_next_line_show_text(aw, ac, string)
    @state.set_word_spacing(aw)
    @state.set_character_spacing(ac)
    move_to_next_line_and_show_text(string)
  end

  #####################################################
  # XObjects
  #####################################################

  # Form XObjects are walked with this same receiver; any other kind of
  # XObject is ignored.
  def invoke_xobject(label)
    @state.invoke_xobject(label) do |xobj|
      xobj.walk(self) if xobj.is_a?(PDF::Reader::FormXObject)
    end
  end

  private

  # Paints +string+ glyph by glyph: every non-space glyph is offered to the
  # markups, then the page state is advanced by the glyph's width so the next
  # glyph is positioned correctly.
  def internal_show_text(string)
    font = @state.current_font
    raise PDF::Reader::MalformedPDFError, "current font is invalid" if font.nil?

    font.unpack(string).each do |glyph_code|
      x, y = @state.trm_transform(0, 0)
      chars = font.to_utf8(glyph_code)
      width = font.glyph_width(glyph_code) / 1000.0
      scaled_width = width * @state.font_size
      is_space = chars == SPACE

      record_glyph(x, y, scaled_width, chars) unless is_space
      @state.process_glyph_displacement(width, 0, is_space)
    end
  end

  # Adds a text run for the glyph to the bucket of every markup that contains
  # the glyph's origin.
  def record_glyph(x, y, width, chars)
    @markups.each do |markup|
      next unless markup.contains?(x, y)

      @characters[markup] << PDF::Reader::TextRun.new(x, y, width, @state.font_size, chars)
    end
  end

end
require 'pdf-reader'
require './markup_receiver'

# Open the PDF named by the first command-line argument. When no path was
# given, exit with a usage message instead of handing nil to PDF::Reader.
doc = PDF::Reader.new(ARGV.fetch(0) { abort "Usage: #{$PROGRAM_NAME} FILE.pdf" })

# The document's object table, kept in a global so the top-level helper
# methods in this script can resolve annotation references.
$objects = doc.objects

# Tells whether +object+ is a note annotation, i.e. its /Subtype is /Text or
# /FreeText.
#
# The /Type entry of an annotation dictionary is optional in PDF, so a
# dictionary without it is accepted; only an explicit /Type other than /Annot
# is rejected. Anything that is not a Hash (nil, for instance) is not a note.
#
# Returns true or false.
def is_note?(object)
  return false unless object.is_a?(Hash)

  [nil, :Annot].include?(object[:Type]) && [:Text, :FreeText].include?(object[:Subtype])
end

# Tells whether +object+ is a text-markup annotation handled by this script,
# i.e. its /Subtype is /Highlight or /Underline.
#
# The /Type entry of an annotation dictionary is optional in PDF, so a
# dictionary without it is accepted; only an explicit /Type other than /Annot
# is rejected. Anything that is not a Hash (nil, for instance) is not a markup.
#
# Returns true or false.
def is_markup?(object)
  return false unless object.is_a?(Hash)

  [nil, :Annot].include?(object[:Type]) && [:Highlight, :Underline].include?(object[:Subtype])
end

# Resolves the page's /Annots entry into a flat list of annotation objects.
# A page without /Annots yields an empty list.
def annots_on_page(page)
  annot_refs = page.attributes[:Annots]
  annot_refs = [] unless annot_refs
  lookup_all(annot_refs).flatten
end

# Looks up every reference in +refs+ and returns the results in order.
# A single reference is treated as a one-element list and nil as an empty one.
def lookup_all(refs)
  [*refs].map { |ref| lookup(ref) }
end

# Resolves +ref+ through the global object table ($objects).
#
# A Hash is taken to be a direct (inline) dictionary rather than a reference
# and is returned unchanged, so it is never used as a key into the object
# table. When the resolved object is an Array, its elements are resolved in
# turn via lookup_all.
#
# Returns the resolved object, or an array of resolved objects.
def lookup(ref)
  return ref if ref.is_a?(Hash)

  object = $objects[ref]
  return object unless object.is_a?(Array)

  lookup_all(object)
end

# Returns the annotations on +page+ for which is_note? is true.
def notes_on_page(page)
  annots_on_page(page).select { |annot| is_note?(annot) }
end

# Builds a Markup for every markup annotation on +page+ and, when there is at
# least one, walks the page with a MarkupReceiver so that each Markup gets its
# +text+ filled in.
#
# Returns the array of Markup objects (empty when the page has none; the page
# is not walked in that case).
def markups_on_page(page)
  markups = annots_on_page(page)
            .select { |annot| is_markup?(annot) }
            .map { |annot| Markup.new(annot) }
  return markups if markups.empty?

  receiver = MarkupReceiver.new(markups)
  page.walk(receiver)
  receiver.set_markup_texts
  markups
end

# Wraps the attribute dictionary of a markup annotation and answers geometric
# questions about the areas listed in its /QuadPoints entry. +text+ is filled
# in from outside with the page text found inside those areas.
class Markup
  attr_reader :attributes
  attr_accessor :text

  # attributes - the annotation dictionary (a Hash with Symbol keys such as
  #              :QuadPoints and :C).
  def initialize(attributes)
    @attributes = attributes
  end

  # One quadrilateral of a markup, treated as its axis-aligned bounding box.
  #
  # The corners are derived from the minimum and maximum coordinates rather
  # than from the position of each point in sorted order, so a quad whose
  # edges are slightly skewed still yields a usable box.
  class Rectangle
    attr_reader :quad_points

    # points - array of [x, y] pairs (the quad's corners, in any order).
    def initialize(points)
      @quad_points = points.sort
      @min_x, @max_x = points.map(&:first).minmax
      @min_y, @max_y = points.map(&:last).minmax
    end

    def bottom_left
      [@min_x, @min_y]
    end

    def top_left
      [@min_x, @max_y]
    end

    def bottom_right
      [@max_x, @min_y]
    end

    def top_right
      [@max_x, @max_y]
    end

    # coords - an [x, y] pair. Returns true when the point lies inside the
    # bounding box, edges included.
    def contains?(coords)
      x, y = *coords
      x.between?(@min_x, @max_x) && y.between?(@min_y, @max_y)
    end

    # Returns true when the box's bottom edge lies between +bottom+ and +top+
    # (inclusive).
    def within?(bottom, top)
      @min_y.between?(bottom, top)
    end

  end

  # Returns one Rectangle per complete group of eight numbers in /QuadPoints.
  # A missing /QuadPoints entry gives an empty list and an incomplete trailing
  # group is ignored. The list is built once and reused, because contains? is
  # called for every glyph on the page.
  def rectangles
    @rectangles ||= Array(attributes[:QuadPoints])
                    .each_slice(8)
                    .select { |quad| quad.size == 8 }
                    .map { |quad| Rectangle.new(quad.each_slice(2).to_a) }
  end

  # Returns the annotation colour (/C) as a "#RRGGBB"-style hex string, or nil
  # when the annotation has no /C entry.
  def color
    components = attributes[:C]
    rgb_to_hex(components) if components
  end

  # Returns true when the point (x, y) lies inside any of the rectangles.
  def contains?(x, y)
    rectangles.any? { |r| r.contains?([x, y]) }
  end

  # Returns true when any rectangle's bottom edge lies between +bottom+ and
  # +top+.
  def within?(bottom, top)
    rectangles.any? { |r| r.within?(bottom, top) }
  end

  # rgb - array of colour components in the range 0..1.
  #
  # Returns "#" followed by two upper-case hex digits per component. Each
  # component is rounded to the nearest of 0..255 (truncating would turn 0.5
  # into 7F instead of 80).
  def rgb_to_hex(rgb)
    "#" + rgb.map { |component| (component * 255).round.to_s(16).rjust(2, "0").upcase }.join
  end
end

# Print, for every page that has notes or markups, a heading followed by the
# note contents ("*" items) and the marked-up text ("-" items), then two blank
# lines. Pages with neither are skipped.
doc.pages.each do |page|
  notes = notes_on_page(page)
  markups = markups_on_page(page)
  next if notes.empty? && markups.empty?

  puts "# Page #{page.number}"
  # Interpolation prints an empty item when a note has no :Contents or a
  # markup has no text, instead of raising on nil.
  notes.each { |note| puts "  * #{note[:Contents]}" }
  markups.each { |markup| puts "  - #{markup.text}" }
  puts
  puts
end