From 8e8e41d2b033926be8c7e0d4948004e404ec1775 Mon Sep 17 00:00:00 2001 From: Slater Podgorny Date: Mon, 24 Jun 2024 17:09:51 -0700 Subject: [PATCH 1/2] Additional column detection, pdf download for papers and fact sheets --- elm/pdf.py | 6 +++--- elm/utilities/parse.py | 16 +++++++++++++--- elm/web/rhub.py | 15 ++++++++++----- examples/research_hub/retrieve_docs.py | 6 ++++-- tests/utilities/test_utilities_parse.py | 14 ++++++++++++++ 5 files changed, 44 insertions(+), 13 deletions(-) diff --git a/elm/pdf.py b/elm/pdf.py index 4f04fab..b45bce3 100644 --- a/elm/pdf.py +++ b/elm/pdf.py @@ -206,7 +206,7 @@ async def clean_txt_async(self, ignore_error=None, rate_limit=40e3): return clean_pages - def is_double_col(self, separator=' '): + def is_double_col(self, text, separator=' '): """Does the text look like it has multiple vertical text columns? Parameters @@ -219,7 +219,7 @@ def is_double_col(self, separator=' '): out : bool True if more than one vertical text column """ - return is_multi_col(self.full, separator=separator) + return is_multi_col(text, separator=separator) def clean_poppler(self, layout=True): """Clean the pdf using the poppler pdftotxt utility @@ -365,7 +365,7 @@ def convert_to_txt(self, txt_fp): Text string containing contents from pdf """ text = self.clean_poppler(layout=True) - if self.is_double_col(): + if self.is_double_col(text): text = self.clean_poppler(layout=False) text = self.clean_headers(char_thresh=0.6, page_thresh=0.8, split_on='\n', diff --git a/elm/utilities/parse.py b/elm/utilities/parse.py index a01db0c..e3a28ed 100644 --- a/elm/utilities/parse.py +++ b/elm/utilities/parse.py @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -def is_multi_col(text, separator=" "): +def is_multi_col(text, separator=" ", threshold_ratio=0.35): """Does the text look like it has multiple vertical text columns? Parameters @@ -23,14 +23,24 @@ def is_multi_col(text, separator=" "): columns. separator : str Heuristic split string to look for spaces between columns + threshold_ratio : float + Portion of lines containing the separator at which point + the text should be classified as multi-column. Returns ------- out : bool True if more than one vertical text column """ - n_cols = [len(line.strip().split(separator)) for line in text.split("\n")] - return np.median(n_cols) >= 2 + lines = text.split("\n") + total_lines = len(lines) + + gap_lines = [line for line in lines if separator in line.strip()] + cols = len(gap_lines) + + ratio = cols / total_lines + + return ratio >= threshold_ratio def remove_blank_pages(pages): diff --git a/elm/web/rhub.py b/elm/web/rhub.py index 12104fc..df1d3ca 100644 --- a/elm/web/rhub.py +++ b/elm/web/rhub.py @@ -564,10 +564,13 @@ def id(self): id : str Publication Number. """ - group = self.get('keywordGroups')[0] - cont = group.get('keywordContainers')[0] - id = cont.get('freeKeywords')[0].get('freeKeywords')[0] - id = id.replace('/', '-') + try: + group = self.get('keywordGroups')[0] + cont = group.get('keywordContainers')[0] + id = cont.get('freeKeywords')[0].get('freeKeywords')[0] + id = id.replace('/', '-') + except TypeError: + id = self.get('externalId') return id @@ -690,7 +693,9 @@ def download(self, pdf_dir, txt_dir): pdf_url = self.links[1] abstract = self.abstract - if category != 'Technical Report': + pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet'] + + if category not in pdf_categories: fn = self.id.replace('/', '-') + '.txt' fp = os.path.join(txt_dir, fn) if not os.path.exists(fp): diff --git a/examples/research_hub/retrieve_docs.py b/examples/research_hub/retrieve_docs.py index 72c7373..6deadb2 100644 --- a/examples/research_hub/retrieve_docs.py +++ b/examples/research_hub/retrieve_docs.py @@ -67,15 +67,17 @@ publications.download(PDF_DIR, TXT_DIR) pubs_meta = publications.meta() + pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet'] + pubs_meta['fn'] = pubs_meta.apply(lambda row: row['id'] + '.pdf' - if row['category'] == 'Technical Report' + if row['category'] in pdf_categories and row['pdf_url'] is not None and row['pdf_url'].endswith('.pdf') else row['id'] + '.txt', axis=1) pubs_meta['fp'] = pubs_meta.apply(lambda row: PDF_DIR + row['id'] + '.pdf' - if row['category'] == 'Technical Report' + if row['category'] in pdf_categories and row['pdf_url'] is not None and row['pdf_url'].endswith('.pdf') else TXT_DIR + row['fn'], axis=1) diff --git a/tests/utilities/test_utilities_parse.py b/tests/utilities/test_utilities_parse.py index 3826c59..1ca0baf 100644 --- a/tests/utilities/test_utilities_parse.py +++ b/tests/utilities/test_utilities_parse.py @@ -79,6 +79,20 @@ def test_is_multi_col(): double column! """ ) + assert is_multi_col( + """ + Text that has multiple + columns and also has + lines without columns. + """ + ) + assert not is_multi_col( + """ + Text that is mostly single + column but might have some + weird spacing like this. + """ + ) def test_remove_blank_pages(): From 252d02873b853f7403cf5b2ad3307465c581a78c Mon Sep 17 00:00:00 2001 From: Slater Podgorny Date: Tue, 2 Jul 2024 15:05:32 -0700 Subject: [PATCH 2/2] Remove is_double_col() from PDFtoTXT --- elm/pdf.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/elm/pdf.py b/elm/pdf.py index b45bce3..0f05630 100644 --- a/elm/pdf.py +++ b/elm/pdf.py @@ -206,21 +206,6 @@ async def clean_txt_async(self, ignore_error=None, rate_limit=40e3): return clean_pages - def is_double_col(self, text, separator=' '): - """Does the text look like it has multiple vertical text columns? - - Parameters - ---------- - separator : str - Heuristic split string to look for spaces between columns - - Returns - ------- - out : bool - True if more than one vertical text column - """ - return is_multi_col(text, separator=separator) - def clean_poppler(self, layout=True): """Clean the pdf using the poppler pdftotxt utility @@ -351,13 +336,15 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n', self.full = combine_pages(self.pages) return self.full - def convert_to_txt(self, txt_fp): + def convert_to_txt(self, txt_fp, separator=' '): """Function to convert contents of pdf document to txt file. Parameters ---------- txt_fp: str Directory for output txt file. + separator : str + Heuristic split string to look for spaces between columns Returns ------- @@ -365,7 +352,7 @@ def convert_to_txt(self, txt_fp): Text string containing contents from pdf """ text = self.clean_poppler(layout=True) - if self.is_double_col(text): + if is_multi_col(text, separator=separator): text = self.clean_poppler(layout=False) text = self.clean_headers(char_thresh=0.6, page_thresh=0.8, split_on='\n',