Skip to content

Commit

Permalink
Remove is_double_col() from PDFtoTXT
Browse files: browse the repository at this point in the history
  • Loading branch information
spodgorny9 committed Jul 2, 2024
1 parent 8e8e41d commit 252d028
Showing 1 changed file with 4 additions and 17 deletions.
21 changes: 4 additions & 17 deletions elm/pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -206,21 +206,6 @@ async def clean_txt_async(self, ignore_error=None, rate_limit=40e3):

return clean_pages

def is_double_col(self, text, separator=' '):
"""Does the text look like it has multiple vertical text columns?
Parameters
----------
separator : str
Heuristic split string to look for spaces between columns
Returns
-------
out : bool
True if more than one vertical text column
"""
return is_multi_col(text, separator=separator)

def clean_poppler(self, layout=True):
"""Clean the pdf using the poppler pdftotxt utility
Expand Down Expand Up @@ -351,21 +336,23 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
self.full = combine_pages(self.pages)
return self.full

def convert_to_txt(self, txt_fp):
def convert_to_txt(self, txt_fp, separator=' '):
"""Function to convert contents of pdf document to txt file.
Parameters
----------
txt_fp: str
Directory for output txt file.
separator : str
Heuristic split string to look for spaces between columns
Returns
-------
text : str
Text string containing contents from pdf
"""
text = self.clean_poppler(layout=True)
if self.is_double_col(text):
if is_multi_col(text, separator=separator):
text = self.clean_poppler(layout=False)
text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
split_on='\n',
Expand Down

0 comments on commit 252d028

Please sign in to comment.