Skip to content

Commit

Permalink
Add convert_to_txt() to PDFtoTXT
Browse files Browse the repository at this point in the history
  • Loading branch information
spodgorny9 committed Jun 3, 2024
1 parent cd6df33 commit dc1ca4c
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 37 deletions.
25 changes: 25 additions & 0 deletions elm/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,28 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
iheaders=iheaders)
self.full = combine_pages(self.pages)
return self.full

def convert_to_txt(self, txt_fp):
    """Convert the contents of the pdf document to a txt file.

    Parameters
    ----------
    txt_fp : str
        File path of the output txt file (passed directly to ``open``).

    Returns
    -------
    text : str
        Text string containing contents from pdf, as returned by
        ``clean_headers`` and written to ``txt_fp``.
    """
    # First pass with layout=True. NOTE(review): presumably
    # is_double_col() inspects the result of this pass, so keep this
    # call ahead of the check -- confirm against clean_poppler().
    text = self.clean_poppler(layout=True)
    if self.is_double_col():
        # Re-run without layout when the document is flagged as
        # double-column.
        text = self.clean_poppler(layout=False)

    # clean_headers() returns the recombined full text (self.full), which
    # replaces the value returned by clean_poppler() above.
    text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
                              split_on='\n',
                              iheaders=[0, 1, 3, -3, -2, -1])

    # Write as UTF-8 explicitly: the default encoding is platform
    # dependent, and text extracted from PDFs commonly contains non-ASCII
    # characters that would raise UnicodeEncodeError under e.g. cp1252.
    with open(txt_fp, 'w', encoding='utf-8') as f:
        f.write(text)
        logger.info(f'Saved: {txt_fp}')

    return text
12 changes: 8 additions & 4 deletions elm/web/rhub.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ class ProfilesRecord(dict):
"""Class to handle a single profiles as dictionary data.
This class requires setting an 'RHUB_API_KEY' environment
variable to access the Pure Web Service. The API key can be
obtained...
obtained by contacting an NREL library representative:
[email protected].
"""
def __init__(self, record):
"""
Expand Down Expand Up @@ -304,7 +305,8 @@ class ProfilesList(list):
"""Class to retrieve and handle multiple profiles from an API URL.
This class requires setting an 'RHUB_API_KEY' environment
variable to access the Pure Web Service. The API key can be
obtained...
obtained by contacting an NREL library representative:
[email protected].
"""
def __init__(self, url, n_pages=1):
"""
Expand Down Expand Up @@ -447,7 +449,8 @@ class PublicationsRecord(dict):
"""Class to handle a single publication as dictionary data.
This class requires setting an 'RHUB_API_KEY' environment
variable to access the Pure Web Service. The API key can be
obtained...
obtained by contacting an NREL library representative:
[email protected].
"""
def __init__(self, record):
"""
Expand Down Expand Up @@ -664,7 +667,8 @@ class PublicationsList(list):
"""Class to retrieve and handle multiple publications from an API URL.
This class requires setting an 'RHUB_API_KEY' environment
variable to access the Pure Web Service. The API key can be
obtained...
obtained by contacting an NREL library representative:
[email protected].
"""
def __init__(self, url, n_pages=1):
"""
Expand Down
39 changes: 6 additions & 33 deletions examples/research_hub/retrieve_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,59 +52,31 @@
f'apiKey={rhub_api_key}')


def convert_pdf(pdf_fp, txt_fp):
    """Convert a pdf document to a txt file.

    Parameters
    ----------
    pdf_fp : str
        File path of the pdf to convert to text.
    txt_fp : str
        File path of the output txt file (passed directly to ``open``).

    Returns
    -------
    text : str
        Text string containing contents from pdf, as returned by
        ``PDFtoTXT.clean_headers`` and written to ``txt_fp``.
    """
    pdf_obj = PDFtoTXT(pdf_fp)

    # First pass with layout=True. NOTE(review): presumably
    # is_double_col() inspects the result of this pass, so keep this
    # call ahead of the check -- confirm against PDFtoTXT.
    text = pdf_obj.clean_poppler(layout=True)
    if pdf_obj.is_double_col():
        # Re-run without layout when the document is flagged as
        # double-column.
        text = pdf_obj.clean_poppler(layout=False)

    # The value returned by clean_headers() replaces the clean_poppler()
    # result and is what gets written and returned.
    text = pdf_obj.clean_headers(char_thresh=0.6, page_thresh=0.8,
                                 split_on='\n',
                                 iheaders=[0, 1, 3, -3, -2, -1])

    # Write as UTF-8 explicitly: the default encoding is platform
    # dependent, and text extracted from PDFs commonly contains non-ASCII
    # characters that would raise UnicodeEncodeError under e.g. cp1252.
    with open(txt_fp, 'w', encoding='utf-8') as f:
        f.write(text)
        logger.info(f'Saved: {txt_fp}')

    return text


if __name__ == '__main__':
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(TXT_DIR, exist_ok=True)
os.makedirs(EMBED_DIR, exist_ok=True)

profiles = ProfilesList(PROFILES_URL, n_pages=5)
profiles = ProfilesList(PROFILES_URL, n_pages=10)
logger.info("Starting download for researcher profiles.")
profiles.download(TXT_DIR)
profiles_meta = profiles.meta()

publications = PublicationsList(PUBLICATIONS_URL, n_pages=5)
publications = PublicationsList(PUBLICATIONS_URL, n_pages=20)
logger.info("Starting download for publications.")
publications.download(PDF_DIR, TXT_DIR)
pubs_meta = publications.meta()

pubs_meta['fn'] = pubs_meta.apply(lambda row:
row['id'] + '.pdf'
if row['category'] == 'Technical Report'
and row['pdf_url'] is not None
and row['pdf_url'].endswith('.pdf')
else row['id'] + '.txt', axis=1)
pubs_meta['fp'] = pubs_meta.apply(lambda row:
PDF_DIR + row['id'] + '.pdf'
if row['category'] == 'Technical Report'
and row['pdf_url'] is not None
and row['pdf_url'].endswith('.pdf')
else TXT_DIR + row['fn'], axis=1)

Expand Down Expand Up @@ -137,7 +109,8 @@ def convert_pdf(pdf_fp, txt_fp):

else:
try:
text = convert_pdf(fp, txt_fp)
pdf_obj = PDFtoTXT(fp)
text = pdf_obj.convert_to_txt(txt_fp)
except Exception as e:
failed_count += 1
logger.info(f'Could not convert {fp} to pdf.')
Expand Down

0 comments on commit dc1ca4c

Please sign in to comment.