From c1964560b667a54ba4eb3bd9fbc7ed26c4cc86a2 Mon Sep 17 00:00:00 2001
From: Amit Dovev
Date: Tue, 8 Nov 2022 08:02:09 +0200
Subject: [PATCH] pdfrenderer.cpp: Ignore non-text blocks

Fix #3957.
---
 src/api/pdfrenderer.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/api/pdfrenderer.cpp b/src/api/pdfrenderer.cpp
index 774558aed7..81cf2e24d8 100644
--- a/src/api/pdfrenderer.cpp
+++ b/src/api/pdfrenderer.cpp
@@ -25,6 +25,7 @@
 
 #include <allheaders.h>
 #include <tesseract/baseapi.h>
+#include <tesseract/publictypes.h> // for PTIsTextType()
 #include <tesseract/renderer.h>
 #include <cmath>
 #include <cstring>
@@ -354,6 +355,12 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
   const std::unique_ptr<ResultIterator> res_it(api->GetIterator());
   while (!res_it->Empty(RIL_BLOCK)) {
     if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+      auto block_type = res_it->BlockType();
+      if (!PTIsTextType(block_type)) {
+        // ignore non-text blocks
+        res_it->Next(RIL_BLOCK);
+        continue;
+      }
       pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink
       old_fontsize = 0;      // Every block will declare its fontsize
       new_block = true;      // Every block will declare its affine matrix