class PDF::Reader::PageLayout

Takes a collection of TextRun objects and renders them into a single string that best approximates the way they'd appear on a rendered PDF page.

mediabox should be a four-number array describing the dimensions of the page to be rendered, as given by the page's MediaBox attribute.

Constants

DEFAULT_FONT_SIZE

Public Class Methods

new(runs, mediabox) click to toggle source
# File lib/pdf/reader/page_layout.rb, line 14
# Builds a layout for a single page of text.
#
# runs     - collection of TextRun objects; they are passed through
#            merge_runs before being stored.
# mediabox - four-number array. The page width is mediabox[2] - mediabox[0]
#            and the page height is mediabox[3] - mediabox[1].
#
# Raises ArgumentError when mediabox is nil.
def initialize(runs, mediabox)
  if mediabox.nil?
    raise ArgumentError, "a mediabox must be provided"
  end

  @runs = merge_runs(runs)

  font_size = mean(@runs.map { |run| run.font_size }) || DEFAULT_FONT_SIZE
  # mean returns 0 for an empty collection, and row_count divides by this
  # value, so fall back to the default instead of keeping a zero.
  font_size = DEFAULT_FONT_SIZE if font_size == 0
  @mean_font_size = font_size

  @mean_glyph_width = mean(@runs.map { |run| run.mean_character_width }) || 0

  @page_width = mediabox[2] - mediabox[0]
  @page_height = mediabox[3] - mediabox[1]

  # Smallest x among the runs (nil when there are no runs).
  @x_offset = @runs.map { |run| run.x }.sort.first
end

Public Instance Methods

to_s() click to toggle source
# File lib/pdf/reader/page_layout.rb, line 26
# Renders the runs onto a grid of row_count rows by col_count columns of
# spaces and returns the result as a single newline-joined string.
#
# Each run's x (measured from @x_offset) and y are scaled to a column and a
# row; runs that fall outside the grid are skipped. Blank rows at the top
# and bottom are removed and trailing whitespace is stripped from each row.
#
# Returns "" when there are no runs or when the grid has no rows.
def to_s
  return "" if @runs.empty?
  # With zero rows there is nothing to draw on, and indexing the empty
  # grid would hand nil to local_string_insert.
  return "" if row_count == 0

  page = Array.new(row_count) { " " * col_count }
  @runs.each do |run|
    x_pos = ((run.x - @x_offset) / col_multiplier).round
    y_pos = row_count - (run.y / row_multiplier).round
    next unless y_pos.between?(0, row_count) && x_pos.between?(0, col_count)

    # y_pos is 1-based (1 is the top row). A run whose baseline sits in the
    # top half-row of the page gets y_pos 0; without the clamp that would
    # index page[-1] and draw the text on the bottom row.
    row_index = [y_pos - 1, 0].max
    local_string_insert(page[row_index], run.text, x_pos)
  end
  interesting_rows(page).map(&:rstrip).join("\n")
end

Private Instance Methods

col_count() click to toggle source
# File lib/pdf/reader/page_layout.rb, line 63
# Number of character columns in the rendered grid: the page width divided
# by the mean glyph width, padded by 5% and rounded down. The value is
# computed once and cached in @col_count.
def col_count
  return @col_count if @col_count

  glyphs_per_row = @page_width / @mean_glyph_width
  @col_count = (glyphs_per_row * 1.05).floor
end
col_multiplier() click to toggle source
# File lib/pdf/reader/page_layout.rb, line 71
# Page units covered by one character column: the page width divided by
# col_count, as a Float. The value is computed once and cached in
# @col_multiplier.
def col_multiplier
  return @col_multiplier if @col_multiplier

  width = @page_width.to_f
  @col_multiplier = width / col_count.to_f
end
each_line() { |y, collection| ... } click to toggle source
# File lib/pdf/reader/page_layout.rb, line 83
# Sorts the runs, buckets them by the integer part of their y value, and
# yields each bucket.
#
# Yields the integer y and the array of runs that share it.
# Returns an array holding the block's result for each bucket.
def each_line(&block)
  by_line = {}
  @runs.sort.each do |run|
    (by_line[run.y.to_i] ||= []) << run
  end

  by_line.map do |y, collection|
    yield y, collection
  end
end
group_chars_into_runs(chars) click to toggle source
# File lib/pdf/reader/page_layout.rb, line 101
# Walks chars in order and folds each one into the run before it whenever
# that run reports mergable? for it; otherwise the item starts a new run.
#
# chars - ordered array of objects responding to mergable?(other) and
#         +(other). The array is left unmodified.
#
# Returns a new array of the resulting runs.
def group_chars_into_runs(chars)
  chars.each_with_object([]) do |char, runs|
    if !runs.empty? && runs.last.mergable?(char)
      runs[-1] = runs.last + char
    else
      runs << char
    end
  end
end
interesting_rows(rows) click to toggle source

Given an array of strings, return a new array with the empty rows at the beginning and end removed.

interesting_rows([ "", "one", "two", "" ])
=> [ "one", "two" ]
# File lib/pdf/reader/page_layout.rb, line 48
# Trims blank rows from the top and bottom of a list of rows.
#
# A row counts as blank when it is empty or holds only whitespace. Blank
# rows that sit between rows with text are kept.
#
#   interesting_rows([ "", "one", "two", "" ])
#   => [ "one", "two" ]
#
# rows - array of strings.
#
# Returns a new Array; it is empty when every row is blank.
def interesting_rows(rows)
  first_line_with_text = rows.index { |row| !row.strip.empty? }
  return [] if first_line_with_text.nil?

  last_line_with_text = rows.rindex { |row| !row.strip.empty? }
  rows[first_line_with_text..last_line_with_text]
end
local_string_insert(haystack, needle, index) click to toggle source
# File lib/pdf/reader/page_layout.rb, line 115
# Overwrites part of haystack, in place, with a copy of needle starting at
# index. The characters previously at those positions are replaced, so the
# length of haystack is unchanged when needle fits inside it.
#
# haystack - String that is modified.
# needle   - String to write.
# index    - Integer offset in haystack where needle starts.
#
# Returns the copy of needle.
def local_string_insert(haystack, needle, index)
  replacement = String.new(needle)
  # An empty needle at index 0 would give the range 0..-1, which Ruby reads
  # as "the whole string" and would blank out the haystack.
  return replacement if replacement.empty?

  haystack[Range.new(index, index + replacement.length - 1)] = replacement
end
mean(collection) click to toggle source
# File lib/pdf/reader/page_layout.rb, line 75
# Arithmetic mean of a collection of numbers.
#
# Returns the integer 0 for an empty collection, otherwise a Float.
def mean(collection)
  count = collection.size
  return 0 if count == 0

  total = collection.inject(0) { |sum, value| sum + value }
  total / count.to_f
end
merge_runs(runs) click to toggle source

Take a collection of TextRun objects and merge any that are in close proximity.

# File lib/pdf/reader/page_layout.rb, line 93
# Buckets the runs by the integer part of their y value, sorts each bucket,
# hands it to group_chars_into_runs, and returns all resulting runs as one
# flat, sorted array.
def merge_runs(runs)
  lines = runs.group_by { |run| run.y.to_i }
  merged_lines = lines.values.map do |line|
    group_chars_into_runs(line.sort)
  end
  merged_lines.flatten.sort
end
row_count() click to toggle source
# File lib/pdf/reader/page_layout.rb, line 59
# Number of text rows in the rendered grid: the page height divided by the
# mean font size, rounded down. The value is computed once and cached in
# @row_count.
def row_count
  return @row_count if @row_count

  rows = @page_height / @mean_font_size
  @row_count = rows.floor
end
row_multiplier() click to toggle source
# File lib/pdf/reader/page_layout.rb, line 67
# Page units covered by one text row: the page height divided by
# row_count, as a Float. The value is computed once and cached in
# @row_multiplier.
def row_multiplier
  return @row_multiplier if @row_multiplier

  height = @page_height.to_f
  @row_multiplier = height / row_count.to_f
end