From 5e5fa92528c6ab79030b9db3ed12f5510d808dc4 Mon Sep 17 00:00:00 2001
From: grantbuster
Date: Tue, 31 Oct 2023 09:54:52 -0400
Subject: [PATCH 1/2] protect against pop with few lines and poppler to temp dir

---
 elm/pdf.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/elm/pdf.py b/elm/pdf.py
index 4ee06cf..1be1606 100644
--- a/elm/pdf.py
+++ b/elm/pdf.py
@@ -6,6 +6,7 @@
 import subprocess
 import numpy as np
 import requests
+import tempfile
 import copy
 from PyPDF2 import PdfReader
 import logging
@@ -224,7 +225,7 @@ def is_double_col(self, separator=' '):
             n_cols[i] = len(columns)
         return np.median(n_cols) >= 2
 
-    def clean_poppler(self, fp_out, layout=True):
+    def clean_poppler(self, layout=True):
         """Clean the pdf using the poppler pdftotxt utility
 
         Requires the `pdftotext` command line utility from this software:
@@ -232,8 +233,6 @@
 
         Parameters
         ----------
-        fp_out : str
-            Filepath to output .txt file
         layout : bool
             Layout flag for poppler pdftotxt utility: "maintain original
             physical layout". Layout=True works well for single column text,
@@ -246,21 +245,24 @@
             Joined cleaned pages
         """
 
-        args = ['pdftotext', f"{self.fp}", f"{fp_out}"]
-        if layout:
-            args.insert(1, '-layout')
+        with tempfile.TemporaryDirectory() as td:
+            fp_out = os.path.join(td, 'poppler_out.txt')
+            args = ['pdftotext', f"{self.fp}", f"{fp_out}"]
+            if layout:
+                args.insert(1, '-layout')
 
-        if not os.path.exists(os.path.dirname(fp_out)):
-            os.makedirs(os.path.dirname(fp_out), exist_ok=True)
+            if not os.path.exists(os.path.dirname(fp_out)):
+                os.makedirs(os.path.dirname(fp_out), exist_ok=True)
 
-        stdout = subprocess.run(args, check=True, stdout=subprocess.PIPE)
-        if stdout.returncode == 0:
-            logger.info(f'Saved to disk: {fp_out}')
-        else:
-            raise RuntimeError(stdout)
+            stdout = subprocess.run(args, check=True, stdout=subprocess.PIPE)
+            if stdout.returncode != 0:
+                msg = ('Poppler raised return code {}: {}'
+                       .format(stdout.returncode, stdout))
+                logger.exception(msg)
+                raise RuntimeError(msg)
 
-        with open(fp_out, 'r') as f:
-            clean_txt = f.read()
+            with open(fp_out, 'r') as f:
+                clean_txt = f.read()
 
         # break on poppler page break
         self.pages = clean_txt.split('\x0c')
@@ -399,7 +401,7 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
         for ip, page in enumerate(self.pages):
             page = page.split(split_on)
             for i, iheader in enumerate(iheaders):
-                if tests[i]:
+                if tests[i] and len(page) > np.abs(iheader):
                     _ = page.pop(iheader)
             page = split_on.join(page)

From 32b6b13e52e33ed96ff4451d8f6b5ab649583419 Mon Sep 17 00:00:00 2001
From: grantbuster
Date: Tue, 31 Oct 2023 10:40:56 -0400
Subject: [PATCH 2/2] refactor header cleaning to get nominal headers from a typical page

---
 elm/pdf.py | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/elm/pdf.py b/elm/pdf.py
index 1be1606..55fd03f 100644
--- a/elm/pdf.py
+++ b/elm/pdf.py
@@ -335,6 +335,35 @@ def combine_pages(pages):
         full = full.replace('•', '-')
         return full
 
+    def _get_nominal_headers(self, split_on, iheaders):
+        """Get nominal headers from a standard page. Aim for a "typical" page
+        that is likely to have a normal header, not the first or last.
+
+        Parameters
+        ----------
+        split_on : str
+            Chars to split lines of a page on
+        iheaders : list | tuple
+            Integer indices to look for headers after splitting a page into
+            lines based on split_on. Indices must be ordered from the start
+            of the page (headers) to the end (footers), e.g., (0, 1, -2, -1).
+
+        Returns
+        -------
+        headers : list
+            List of headers where each entry is a string header
+        """
+
+        headers = [None] * len(iheaders)
+        page_lens = np.array([len(p) for p in self.pages])
+        median_len = np.median(page_lens)
+        ipage = np.argmin(np.abs(page_lens - median_len))
+        page = self.pages[ipage]
+        for i, ih in enumerate(iheaders):
+            headers[i] = page.split(split_on)[ih]
+
+        return headers
+
     def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
                       iheaders=(0, 1, -2, -1)):
         """Clean headers/footers that are duplicated across pages
@@ -360,13 +389,9 @@
             Clean text with all pages joined
         """
         logger.info('Cleaning headers')
-        headers = [None] * len(iheaders)
+        headers = self._get_nominal_headers(split_on, iheaders)
         tests = np.zeros((len(self.pages), len(headers)))
 
-        page = self.pages[-1]
-        for i, ih in enumerate(iheaders):
-            headers[i] = page.split(split_on)[ih]
-
         for ip, page in enumerate(self.pages):
            for ih, header in zip(iheaders, headers):
                pheader = ''
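
The pattern in PATCH 1/2 routes the poppler output through a throwaway directory so no intermediate .txt file is left on disk, and lets check=True plus the explicit returncode test surface failures. A minimal standalone sketch of that pattern, assuming only that poppler's `pdftotext` binary is on the PATH; the `pdf_to_text` helper name and the example file path are hypothetical, not part of the patch:

    import os
    import subprocess
    import tempfile

    def pdf_to_text(fp_pdf, layout=True):
        """Run poppler's pdftotext on fp_pdf and return the extracted text."""
        with tempfile.TemporaryDirectory() as td:
            # The output file lives only as long as the temp dir.
            fp_out = os.path.join(td, 'poppler_out.txt')
            args = ['pdftotext', fp_pdf, fp_out]
            if layout:
                args.insert(1, '-layout')
            # check=True raises CalledProcessError on a nonzero return code
            subprocess.run(args, check=True)
            with open(fp_out, 'r') as f:
                return f.read()

    # Usage (hypothetical file): text = pdf_to_text('./report.pdf')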
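
The key technique in PATCH 2/2 is choosing a "typical" page by median character length before pulling candidate header lines, since first and last pages often lack the repeated header/footer. A minimal standalone sketch of that selection logic, using hypothetical `pages` data that is not part of the patch:

    import numpy as np

    # Hypothetical pages: the first page has no header; later pages share one.
    pages = [
        "Title Page",
        "Report Header\nBody text for the second page.",
        "Report Header\nBody text for the third page goes here.",
        "Report Header\nReferences",
        "Report Header\nAppendix tables.",
    ]

    # Pick the page whose character count is closest to the median length;
    # a "typical" page like this is most likely to carry the repeated header.
    page_lens = np.array([len(p) for p in pages])
    ipage = np.argmin(np.abs(page_lens - np.median(page_lens)))

    # Pull candidate header lines from that page by index, e.g. the first line.
    iheaders = (0,)
    headers = [pages[ipage].split('\n')[ih] for ih in iheaders]
    print(headers)  # ['Report Header']

clean_headers then tests these nominal headers against every page and, per the guard added in PATCH 1/2, only pops a candidate line when the page actually has enough lines (len(page) > np.abs(iheader)).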