Merge pull request #22 from NREL/sp/additional_pdfs

Improve multi-column text detection and download PDFs for papers and fact sheets (in addition to technical reports)
NREL · Jul 3, 2024 · 906eeba · 906eeba
2 parents 21bd276 + 252d028
commit 906eeba
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 27 deletions.
diff --git a/elm/pdf.py b/elm/pdf.py
@@ -206,21 +206,6 @@ async def clean_txt_async(self, ignore_error=None, rate_limit=40e3):
 
         return clean_pages
 
-    def is_double_col(self, separator='    '):
-        """Does the text look like it has multiple vertical text columns?
-
-        Parameters
-        ----------
-        separator : str
-            Heuristic split string to look for spaces between columns
-
-        Returns
-        -------
-        out : bool
-            True if more than one vertical text column
-        """
-        return is_multi_col(self.full, separator=separator)
-
     def clean_poppler(self, layout=True):
         """Clean the pdf using the poppler pdftotxt utility
 
@@ -351,21 +336,23 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
         self.full = combine_pages(self.pages)
         return self.full
 
-    def convert_to_txt(self, txt_fp):
+    def convert_to_txt(self, txt_fp, separator='    '):
         """Function to convert contents of pdf document to txt file.
 
         Parameters
         ----------
         txt_fp: str
             Directory for output txt file.
+        separator : str
+            Heuristic split string to look for spaces between columns
 
         Returns
         -------
         text : str
             Text string containing contents from pdf
         """
         text = self.clean_poppler(layout=True)
-        if self.is_double_col():
+        if is_multi_col(text, separator=separator):
             text = self.clean_poppler(layout=False)
         text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
                                   split_on='\n',

diff --git a/elm/utilities/parse.py b/elm/utilities/parse.py
@@ -13,7 +13,7 @@
 logger = logging.getLogger(__name__)
 
 
-def is_multi_col(text, separator="    "):
+def is_multi_col(text, separator="    ", threshold_ratio=0.35):
     """Does the text look like it has multiple vertical text columns?
 
     Parameters
@@ -23,14 +23,24 @@ def is_multi_col(text, separator="    "):
         columns.
     separator : str
         Heuristic split string to look for spaces between columns
+    threshold_ratio : float
+        Minimum fraction of lines (0 to 1) that must contain the
+        separator for the text to be classified as multi-column.
 
     Returns
     -------
     out : bool
         True if more than one vertical text column
     """
-    n_cols = [len(line.strip().split(separator)) for line in text.split("\n")]
-    return np.median(n_cols) >= 2
+    lines = text.split("\n")
+    total_lines = len(lines)
+
+    gap_lines = [line for line in lines if separator in line.strip()]
+    cols = len(gap_lines)
+
+    ratio = cols / total_lines
+
+    return ratio >= threshold_ratio
 
 
 def remove_blank_pages(pages):

diff --git a/elm/web/rhub.py b/elm/web/rhub.py
@@ -564,10 +564,13 @@ def id(self):
         id : str
             Publication Number.
         """
-        group = self.get('keywordGroups')[0]
-        cont = group.get('keywordContainers')[0]
-        id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
-        id = id.replace('/', '-')
+        try:
+            group = self.get('keywordGroups')[0]
+            cont = group.get('keywordContainers')[0]
+            id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
+            id = id.replace('/', '-')
+        except TypeError:
+            id = self.get('externalId')
 
         return id
 
@@ -690,7 +693,9 @@ def download(self, pdf_dir, txt_dir):
         pdf_url = self.links[1]
         abstract = self.abstract
 
-        if category != 'Technical Report':
+        pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']
+
+        if category not in pdf_categories:
             fn = self.id.replace('/', '-') + '.txt'
             fp = os.path.join(txt_dir, fn)
             if not os.path.exists(fp):

diff --git a/examples/research_hub/retrieve_docs.py b/examples/research_hub/retrieve_docs.py
@@ -67,15 +67,17 @@
     publications.download(PDF_DIR, TXT_DIR)
     pubs_meta = publications.meta()
 
+    pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']
+
     pubs_meta['fn'] = pubs_meta.apply(lambda row:
                                       row['id'] + '.pdf'
-                                      if row['category'] == 'Technical Report'
+                                      if row['category'] in pdf_categories
                                       and row['pdf_url'] is not None
                                       and row['pdf_url'].endswith('.pdf')
                                       else row['id'] + '.txt', axis=1)
     pubs_meta['fp'] = pubs_meta.apply(lambda row:
                                       PDF_DIR + row['id'] + '.pdf'
-                                      if row['category'] == 'Technical Report'
+                                      if row['category'] in pdf_categories
                                       and row['pdf_url'] is not None
                                       and row['pdf_url'].endswith('.pdf')
                                       else TXT_DIR + row['fn'], axis=1)

diff --git a/tests/utilities/test_utilities_parse.py b/tests/utilities/test_utilities_parse.py
@@ -79,6 +79,20 @@ def test_is_multi_col():
         double column!
         """
     )
+    assert is_multi_col(
+        """
+        Text that    has multiple
+        columns and    also has
+        lines without columns.
+        """
+    )
+    assert not is_multi_col(
+        """
+        Text that is mostly single
+        column but might have some
+        weird spacing     like this.
+        """
+    )
 
 
 def test_remove_blank_pages():