diff --git a/lib/pdf/reader/page_text_receiver.rb b/lib/pdf/reader/page_text_receiver.rb
index 5c1c1575..827de6fe 100644
--- a/lib/pdf/reader/page_text_receiver.rb
+++ b/lib/pdf/reader/page_text_receiver.rb
@@ -49,6 +49,10 @@ def page=(page)
         @page = page
         @content = []
         @characters = []
+        @actual_text = nil
+        @actual_text_consumed = false
+        # one entry per open BMC/BDC level; true when that level set @actual_text
+        @marked_content_stack = []
       end
 
       def runs(opts = {})
@@ -121,6 +125,41 @@ def set_spacing_next_line_show_text(aw, ac, string) # "
         move_to_next_line_and_show_text(string)
       end
 
+      #####################################################
+      # Marked Content
+      #####################################################
+      # BMC operator: opens a marked-content level that has no property list.
+      # It is tracked so that its matching EMC does not close an enclosing span.
+      def begin_marked_content(tag)
+        begin_marked_content_with_pl(tag, nil)
+      end
+
+      # BDC operator: opens a marked-content level with a property list. When the
+      # list is a Hash with an :ActualText entry, that text replaces every glyph
+      # shown until the matching EMC (PDF 32000-1 §14.9.4).
+      def begin_marked_content_with_pl(tag, properties)
+        actual_text = properties[:ActualText] if properties.is_a?(Hash)
+
+        # An enclosing ActualText span already replaces everything nested inside
+        # it, so only the outermost span may claim @actual_text.
+        owns_actual_text = !!actual_text && @actual_text.nil?
+        if owns_actual_text
+          @actual_text = actual_text
+          @actual_text_consumed = false
+        end
+        @marked_content_stack << owns_actual_text
+      end
+
+      # EMC operator: closes the innermost open level. The replacement text is
+      # cleared only when the level being closed is the one that supplied it; an
+      # unbalanced EMC pops nil from the empty stack and is ignored.
+      def end_marked_content
+        return unless @marked_content_stack.pop
+
+        @actual_text = nil
+        @actual_text_consumed = false
+      end
+
       #####################################################
       # XObjects
       #####################################################
@@ -149,6 +188,18 @@ def internal_show_text(string)
 
           utf8_chars = @state.current_font.to_utf8(glyph_code)
 
+          # Use ActualText from marked content if available (PDF 32000-1 §14.9.4).
+          # ActualText replaces all text within a BDC/EMC span.
+          if @actual_text
+            if !@actual_text_consumed
+              text = @actual_text
+              utf8_chars = PDF::Reader::EncodingUtils.string_to_utf8(text)
+              @actual_text_consumed = true
+            else
+              utf8_chars = ""
+            end
+          end
+
           # apply to glyph displacment for the current glyph so the next
           # glyph will appear in the correct position
           glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
diff --git a/spec/data/actual_text_marked_content.pdf b/spec/data/actual_text_marked_content.pdf
new file mode 100644
index 00000000..1ecd7dc5
Binary files /dev/null and b/spec/data/actual_text_marked_content.pdf differ
diff --git a/spec/integration_spec.rb b/spec/integration_spec.rb
index 3febb5e3..99f89621 100644
--- a/spec/integration_spec.rb
+++ b/spec/integration_spec.rb
@@ -1024,6 +1024,17 @@
     end
   end
 
+  context "PDF with ActualText in marked content" do
+    let(:filename) { pdf_spec_file("actual_text_marked_content") }
+
+    it "extracts text correctly using ActualText" do
+      PDF::Reader.open(filename) do |reader|
+        page = reader.page(1)
+        expect(page.text).to include("21.09.2023")
+      end
+    end
+  end
+
   context "PDF that uses a standatd font and a ligature" do
     let(:filename) { pdf_spec_file("standard_font_with_a_difference") }
 
diff --git a/spec/integrity.yml b/spec/integrity.yml
index 32095211..a3ac4dcd 100644
--- a/spec/integrity.yml
+++ b/spec/integrity.yml
@@ -125,6 +125,9 @@ data/encrypted_version5_revision6_256bit_aes_user_pass_apples_unenc_metadata.pdf
 data/extended_eof.pdf:
   :bytes: 61721
   :md5: 02bd4cfbc79b4a295754fda2705b6181
+data/actual_text_marked_content.pdf:
+  :bytes: 6651
+  :md5: 3fa1e5422adb93486eaad8a75bd3323c
 data/font_sizes.pdf:
   :bytes: 1062
   :md5: c8d4bd87fa2d9b16c9c41501f37905c3