From 8e8e41d2b033926be8c7e0d4948004e404ec1775 Mon Sep 17 00:00:00 2001
From: Slater Podgorny <podgornyslater@gmail.com>
Date: Mon, 24 Jun 2024 17:09:51 -0700
Subject: [PATCH 1/2] Additional column detection, pdf download for papers and
 fact sheets

---
 elm/pdf.py                              |  6 +++---
 elm/utilities/parse.py                  | 16 +++++++++++++---
 elm/web/rhub.py                         | 15 ++++++++++-----
 examples/research_hub/retrieve_docs.py  |  6 ++++--
 tests/utilities/test_utilities_parse.py | 14 ++++++++++++++
 5 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/elm/pdf.py b/elm/pdf.py
index 4f04fab..b45bce3 100644
--- a/elm/pdf.py
+++ b/elm/pdf.py
@@ -206,7 +206,7 @@ async def clean_txt_async(self, ignore_error=None, rate_limit=40e3):
 
         return clean_pages
 
-    def is_double_col(self, separator='    '):
+    def is_double_col(self, text, separator='    '):
         """Does the text look like it has multiple vertical text columns?
 
         Parameters
@@ -219,7 +219,7 @@ def is_double_col(self, separator='    '):
         out : bool
             True if more than one vertical text column
         """
-        return is_multi_col(self.full, separator=separator)
+        return is_multi_col(text, separator=separator)
 
     def clean_poppler(self, layout=True):
         """Clean the pdf using the poppler pdftotxt utility
@@ -365,7 +365,7 @@ def convert_to_txt(self, txt_fp):
             Text string containing contents from pdf
         """
         text = self.clean_poppler(layout=True)
-        if self.is_double_col():
+        if self.is_double_col(text):
             text = self.clean_poppler(layout=False)
         text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
                                   split_on='\n',
diff --git a/elm/utilities/parse.py b/elm/utilities/parse.py
index a01db0c..e3a28ed 100644
--- a/elm/utilities/parse.py
+++ b/elm/utilities/parse.py
@@ -13,7 +13,7 @@
 logger = logging.getLogger(__name__)
 
 
-def is_multi_col(text, separator="    "):
+def is_multi_col(text, separator="    ", threshold_ratio=0.35):
     """Does the text look like it has multiple vertical text columns?
 
     Parameters
@@ -23,14 +23,24 @@ def is_multi_col(text, separator="    "):
         columns.
     separator : str
         Heuristic split string to look for spaces between columns
+    threshold_ratio : float
+        Minimum fraction of all lines that must contain the
+        separator for the text to be classified as multi-column.
 
     Returns
     -------
     out : bool
         True if more than one vertical text column
     """
-    n_cols = [len(line.strip().split(separator)) for line in text.split("\n")]
-    return np.median(n_cols) >= 2
+    lines = text.split("\n")
+    total_lines = len(lines)
+
+    gap_lines = [line for line in lines if separator in line.strip()]
+    cols = len(gap_lines)
+
+    ratio = cols / total_lines
+
+    return ratio >= threshold_ratio
 
 
 def remove_blank_pages(pages):
diff --git a/elm/web/rhub.py b/elm/web/rhub.py
index 12104fc..df1d3ca 100644
--- a/elm/web/rhub.py
+++ b/elm/web/rhub.py
@@ -564,10 +564,13 @@ def id(self):
         id : str
             Publication Number.
         """
-        group = self.get('keywordGroups')[0]
-        cont = group.get('keywordContainers')[0]
-        id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
-        id = id.replace('/', '-')
+        try:
+            group = self.get('keywordGroups')[0]
+            cont = group.get('keywordContainers')[0]
+            id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
+            id = id.replace('/', '-')
+        except TypeError:
+            id = self.get('externalId')
 
         return id
 
@@ -690,7 +693,9 @@ def download(self, pdf_dir, txt_dir):
         pdf_url = self.links[1]
         abstract = self.abstract
 
-        if category != 'Technical Report':
+        pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']
+
+        if category not in pdf_categories:
             fn = self.id.replace('/', '-') + '.txt'
             fp = os.path.join(txt_dir, fn)
             if not os.path.exists(fp):
diff --git a/examples/research_hub/retrieve_docs.py b/examples/research_hub/retrieve_docs.py
index 72c7373..6deadb2 100644
--- a/examples/research_hub/retrieve_docs.py
+++ b/examples/research_hub/retrieve_docs.py
@@ -67,15 +67,17 @@
     publications.download(PDF_DIR, TXT_DIR)
     pubs_meta = publications.meta()
 
+    pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']
+
     pubs_meta['fn'] = pubs_meta.apply(lambda row:
                                       row['id'] + '.pdf'
-                                      if row['category'] == 'Technical Report'
+                                      if row['category'] in pdf_categories
                                       and row['pdf_url'] is not None
                                       and row['pdf_url'].endswith('.pdf')
                                       else row['id'] + '.txt', axis=1)
     pubs_meta['fp'] = pubs_meta.apply(lambda row:
                                       PDF_DIR + row['id'] + '.pdf'
-                                      if row['category'] == 'Technical Report'
+                                      if row['category'] in pdf_categories
                                       and row['pdf_url'] is not None
                                       and row['pdf_url'].endswith('.pdf')
                                       else TXT_DIR + row['fn'], axis=1)
diff --git a/tests/utilities/test_utilities_parse.py b/tests/utilities/test_utilities_parse.py
index 3826c59..1ca0baf 100644
--- a/tests/utilities/test_utilities_parse.py
+++ b/tests/utilities/test_utilities_parse.py
@@ -79,6 +79,20 @@ def test_is_multi_col():
         double column!
         """
     )
+    assert is_multi_col(
+        """
+        Text that    has multiple
+        columns and    also has
+        lines without columns.
+        """
+    )
+    assert not is_multi_col(
+        """
+        Text that is mostly single
+        column but might have some
+        weird spacing     like this.
+        """
+    )
 
 
 def test_remove_blank_pages():

From 252d02873b853f7403cf5b2ad3307465c581a78c Mon Sep 17 00:00:00 2001
From: Slater Podgorny <podgornyslater@gmail.com>
Date: Tue, 2 Jul 2024 15:05:32 -0700
Subject: [PATCH 2/2] Remove is_double_col() from PDFtoTXT

---
 elm/pdf.py | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/elm/pdf.py b/elm/pdf.py
index b45bce3..0f05630 100644
--- a/elm/pdf.py
+++ b/elm/pdf.py
@@ -206,21 +206,6 @@ async def clean_txt_async(self, ignore_error=None, rate_limit=40e3):
 
         return clean_pages
 
-    def is_double_col(self, text, separator='    '):
-        """Does the text look like it has multiple vertical text columns?
-
-        Parameters
-        ----------
-        separator : str
-            Heuristic split string to look for spaces between columns
-
-        Returns
-        -------
-        out : bool
-            True if more than one vertical text column
-        """
-        return is_multi_col(text, separator=separator)
-
     def clean_poppler(self, layout=True):
         """Clean the pdf using the poppler pdftotxt utility
 
@@ -351,13 +336,15 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
         self.full = combine_pages(self.pages)
         return self.full
 
-    def convert_to_txt(self, txt_fp):
+    def convert_to_txt(self, txt_fp, separator='    '):
         """Function to convert contents of pdf document to txt file.
 
         Parameters
         ----------
         txt_fp: str
             Directory for output txt file.
+        separator : str
+            Heuristic split string to look for spaces between columns
 
         Returns
         -------
@@ -365,7 +352,7 @@ def convert_to_txt(self, txt_fp):
             Text string containing contents from pdf
         """
         text = self.clean_poppler(layout=True)
-        if self.is_double_col(text):
+        if is_multi_col(text, separator=separator):
             text = self.clean_poppler(layout=False)
         text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
                                   split_on='\n',