Skip to content

Commit

Permalink
Merge pull request #22 from NREL/sp/additional_pdfs
Browse files Browse the repository at this point in the history
Additional column detection, pdf download for papers and fact sheets
  • Loading branch information
spodgorny9 authored Jul 3, 2024
2 parents 21bd276 + 252d028 commit 906eeba
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 27 deletions.
21 changes: 4 additions & 17 deletions elm/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,21 +206,6 @@ async def clean_txt_async(self, ignore_error=None, rate_limit=40e3):

return clean_pages

def is_double_col(self, separator=' '):
"""Does the text look like it has multiple vertical text columns?
Parameters
----------
separator : str
Heuristic split string to look for spaces between columns
Returns
-------
out : bool
True if more than one vertical text column
"""
return is_multi_col(self.full, separator=separator)

def clean_poppler(self, layout=True):
"""Clean the pdf using the poppler pdftotxt utility
Expand Down Expand Up @@ -351,21 +336,23 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
self.full = combine_pages(self.pages)
return self.full

def convert_to_txt(self, txt_fp):
def convert_to_txt(self, txt_fp, separator=' '):
"""Function to convert contents of pdf document to txt file.
Parameters
----------
txt_fp: str
Directory for output txt file.
separator : str
Heuristic split string to look for spaces between columns
Returns
-------
text : str
Text string containing contents from pdf
"""
text = self.clean_poppler(layout=True)
if self.is_double_col():
if is_multi_col(text, separator=separator):
text = self.clean_poppler(layout=False)
text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
split_on='\n',
Expand Down
16 changes: 13 additions & 3 deletions elm/utilities/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
logger = logging.getLogger(__name__)


def is_multi_col(text, separator=" "):
def is_multi_col(text, separator=" ", threshold_ratio=0.35):
"""Does the text look like it has multiple vertical text columns?
Parameters
Expand All @@ -23,14 +23,24 @@ def is_multi_col(text, separator=" "):
columns.
separator : str
Heuristic split string to look for spaces between columns
threshold_ratio : float
Minimum fraction of lines that must contain the separator for
the text to be classified as multi-column.
Returns
-------
out : bool
True if more than one vertical text column
"""
n_cols = [len(line.strip().split(separator)) for line in text.split("\n")]
return np.median(n_cols) >= 2
lines = text.split("\n")
total_lines = len(lines)

gap_lines = [line for line in lines if separator in line.strip()]
cols = len(gap_lines)

ratio = cols / total_lines

return ratio >= threshold_ratio


def remove_blank_pages(pages):
Expand Down
15 changes: 10 additions & 5 deletions elm/web/rhub.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,10 +564,13 @@ def id(self):
id : str
Publication Number.
"""
group = self.get('keywordGroups')[0]
cont = group.get('keywordContainers')[0]
id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
id = id.replace('/', '-')
try:
group = self.get('keywordGroups')[0]
cont = group.get('keywordContainers')[0]
id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
id = id.replace('/', '-')
except TypeError:
id = self.get('externalId')

return id

Expand Down Expand Up @@ -690,7 +693,9 @@ def download(self, pdf_dir, txt_dir):
pdf_url = self.links[1]
abstract = self.abstract

if category != 'Technical Report':
pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']

if category not in pdf_categories:
fn = self.id.replace('/', '-') + '.txt'
fp = os.path.join(txt_dir, fn)
if not os.path.exists(fp):
Expand Down
6 changes: 4 additions & 2 deletions examples/research_hub/retrieve_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,17 @@
publications.download(PDF_DIR, TXT_DIR)
pubs_meta = publications.meta()

pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']

pubs_meta['fn'] = pubs_meta.apply(lambda row:
row['id'] + '.pdf'
if row['category'] == 'Technical Report'
if row['category'] in pdf_categories
and row['pdf_url'] is not None
and row['pdf_url'].endswith('.pdf')
else row['id'] + '.txt', axis=1)
pubs_meta['fp'] = pubs_meta.apply(lambda row:
PDF_DIR + row['id'] + '.pdf'
if row['category'] == 'Technical Report'
if row['category'] in pdf_categories
and row['pdf_url'] is not None
and row['pdf_url'].endswith('.pdf')
else TXT_DIR + row['fn'], axis=1)
Expand Down
14 changes: 14 additions & 0 deletions tests/utilities/test_utilities_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,20 @@ def test_is_multi_col():
double column!
"""
)
assert is_multi_col(
"""
Text that has multiple
columns and also has
lines without columns.
"""
)
assert not is_multi_col(
"""
Text that is mostly single
column but might have some
weird spacing like this.
"""
)


def test_remove_blank_pages():
Expand Down

0 comments on commit 906eeba

Please sign in to comment.