Add convert_to_txt() to PDFtoTXT

NREL · Jun 3, 2024 · dc1ca4c · dc1ca4c
1 parent cd6df33
commit dc1ca4c
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 37 deletions.
diff --git a/elm/pdf.py b/elm/pdf.py
@@ -350,3 +350,28 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
                                    iheaders=iheaders)
         self.full = combine_pages(self.pages)
         return self.full
+
+    def convert_to_txt(self, txt_fp):
+        """Convert the contents of the pdf document to a txt file.
+
+        Parameters
+        ----------
+        txt_fp : str
+            File path of the output txt file (written as UTF-8).
+
+        Returns
+        -------
+        text : str
+            Text string containing contents from pdf
+        """
+        # Extraction results are discarded: clean_headers() returns the text.
+        self.clean_poppler(layout=True)
+        if self.is_double_col():
+            self.clean_poppler(layout=False)
+        text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
+                                  split_on='\n',
+                                  iheaders=[0, 1, 3, -3, -2, -1])
+        with open(txt_fp, 'w', encoding='utf-8') as f:
+            f.write(text)
+        logger.info(f'Saved: {txt_fp}')
+        return text
diff --git a/elm/web/rhub.py b/elm/web/rhub.py
@@ -17,7 +17,8 @@ class ProfilesRecord(dict):
     """Class to handle a single profiles as dictionary data.
     This class requires setting an 'RHUB_API_KEY' environment
     variable to access the Pure Web Service. The API key can be
-    obtained...
+    obtained by contacting an NREL library representative:
+    [email protected].
     """
     def __init__(self, record):
         """
@@ -304,7 +305,8 @@ class ProfilesList(list):
     """Class to retrieve and handle multiple profiles from an API URL.
     This class requires setting an 'RHUB_API_KEY' environment
     variable to access the Pure Web Service. The API key can be
-    obtained...
+    obtained by contacting an NREL library representative:
+    [email protected].
     """
     def __init__(self, url, n_pages=1):
         """
@@ -447,7 +449,8 @@ class PublicationsRecord(dict):
     """Class to handle a single publication as dictionary data.
     This class requires setting an 'RHUB_API_KEY' environment
     variable to access the Pure Web Service. The API key can be
-    obtained...
+    obtained by contacting an NREL library representative:
+    [email protected].
     """
     def __init__(self, record):
         """
@@ -664,7 +667,8 @@ class PublicationsList(list):
     """Class to retrieve and handle multiple publications from an API URL.
     This class requires setting an 'RHUB_API_KEY' environment
     variable to access the Pure Web Service. The API key can be
-    obtained...
+    obtained by contacting an NREL library representative:
+    [email protected].
     """
     def __init__(self, url, n_pages=1):
         """

diff --git a/examples/research_hub/retrieve_docs.py b/examples/research_hub/retrieve_docs.py
@@ -52,59 +52,31 @@
                     f'apiKey={rhub_api_key}')
 
 
-def convert_pdf(pdf_fp, txt_fp):
-    """Function to convert pdf document to txt file.
-
-    Parameters
-    ----------
-    pdf_fp : str
-        pdf to convert to text
-    txt_fp: str
-        Directory for output txt file.
-
-    Returns
-    -------
-    text : str
-        Text string containing contents from pdf
-    """
-
-    pdf_obj = PDFtoTXT(pdf_fp)
-    text = pdf_obj.clean_poppler(layout=True)
-    if pdf_obj.is_double_col():
-        text = pdf_obj.clean_poppler(layout=False)
-    text = pdf_obj.clean_headers(char_thresh=0.6, page_thresh=0.8,
-                                 split_on='\n',
-                                 iheaders=[0, 1, 3, -3, -2, -1])
-    with open(txt_fp, 'w') as f:
-        f.write(text)
-    logger.info(f'Saved: {txt_fp}')
-
-    return text
-
-
 if __name__ == '__main__':
     os.makedirs(PDF_DIR, exist_ok=True)
     os.makedirs(TXT_DIR, exist_ok=True)
     os.makedirs(EMBED_DIR, exist_ok=True)
 
-    profiles = ProfilesList(PROFILES_URL, n_pages=5)
+    profiles = ProfilesList(PROFILES_URL, n_pages=10)
     logger.info("Starting download for researcher profiles.")
     profiles.download(TXT_DIR)
     profiles_meta = profiles.meta()
 
-    publications = PublicationsList(PUBLICATIONS_URL, n_pages=5)
+    publications = PublicationsList(PUBLICATIONS_URL, n_pages=20)
     logger.info("Starting download for publications.")
     publications.download(PDF_DIR, TXT_DIR)
     pubs_meta = publications.meta()
 
     pubs_meta['fn'] = pubs_meta.apply(lambda row:
                                       row['id'] + '.pdf'
                                       if row['category'] == 'Technical Report'
+                                      and row['pdf_url'] is not None
                                       and row['pdf_url'].endswith('.pdf')
                                       else row['id'] + '.txt', axis=1)
     pubs_meta['fp'] = pubs_meta.apply(lambda row:
                                       PDF_DIR + row['id'] + '.pdf'
                                       if row['category'] == 'Technical Report'
+                                      and row['pdf_url'] is not None
                                       and row['pdf_url'].endswith('.pdf')
                                       else TXT_DIR + row['fn'], axis=1)
 
@@ -137,7 +109,8 @@ def convert_pdf(pdf_fp, txt_fp):
 
         else:
             try:
-                text = convert_pdf(fp, txt_fp)
+                pdf_obj = PDFtoTXT(fp)
+                text = pdf_obj.convert_to_txt(txt_fp)
             except Exception as e:
                 failed_count += 1
                 logger.info(f'Could not convert {fp} to pdf.')