Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to read file object #376

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion camelot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

from .__version__ import __version__
from .io import read_pdf
from .io import read_pdf, read_file_obj
from .plotting import PlotMethods


Expand Down
104 changes: 61 additions & 43 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,38 +24,50 @@ class PDFHandler(object):

Parameters
----------
filepath : str
Filepath or URL of the PDF file.
file_path : str
File path or URL of the PDF file.
file_obj : file object
File object of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
Password for decryption.

"""

def __init__(self, filepath, pages="1", password=None):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath = filepath
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")

def __init__(self, file_path="", file_obj="", pages='1', password=None):
if password is None:
self.password = ""
self.password = ''
else:
self.password = password
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.pages = self._get_pages(self.filepath, pages)
self.password = self.password.encode('ascii')

if file_path:
if is_url(file_path):
file_path = download_url(file_path)
if not file_path.lower().endswith('.pdf'):
raise NotImplementedError("File format not supported")

self.file_path = file_path
self.file_obj = file_obj

if self.file_path:
self.pages = self._get_pages(filepath=file_path, pages=pages)
elif self.file_obj:
self.pages = self._get_pages(fileObj=file_obj, pages=pages)
else:
raise ValueError("You must have either file_path or file_obj not empty")

def _get_pages(self, filepath, pages):
def _get_pages(self, filepath="", pages="1", fileObj=""):
"""Converts pages string to list of ints.

Parameters
----------
filepath : str
Filepath or URL of the PDF file.
fileObj : file object
File object of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -70,7 +82,10 @@ def _get_pages(self, filepath, pages):
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
infile = PdfFileReader(open(filepath, "rb"), strict=False)
if filepath:
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
if fileObj:
infile = PdfFileReader(fileObj, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
if pages == "all":
Expand All @@ -89,7 +104,7 @@ def _get_pages(self, filepath, pages):
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))

def _save_page(self, filepath, page, temp):
def _save_page(self, file_obj, page, temp):
"""Saves specified page from PDF into a temporary directory.

Parameters
Expand All @@ -102,38 +117,37 @@ def _save_page(self, filepath, page, temp):
Tmp directory.

"""
with open(filepath, "rb") as fileobj:
infile = PdfFileReader(fileobj, strict=False)
infile = PdfFileReader(file_obj, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1)
outfile = PdfFileWriter()
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1)
outfile = PdfFileWriter()
p = infile.getPage(0)
if rotation == "anticlockwise":
p.rotateClockwise(90)
elif rotation == "clockwise":
p.rotateCounterClockwise(90)
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
outfile = PdfFileWriter()
p = infile.getPage(0)
if rotation == "anticlockwise":
p.rotateClockwise(90)
elif rotation == "clockwise":
p.rotateCounterClockwise(90)
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.write(f)

def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
Expand Down Expand Up @@ -162,7 +176,11 @@ def parse(
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
if self.file_path != "":
with open(self.file_path, "rb") as file_obj:
self._save_page(file_obj=file_obj, page=p, temp=tempdir)
if self.file_obj != "":
self._save_page(file_obj=self.file_obj, page=p, temp=tempdir)
pages = [
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
]
Expand Down
96 changes: 95 additions & 1 deletion camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def read_pdf(
warnings.simplefilter("ignore")

validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
p = PDFHandler(file_path=filepath, pages=pages, password=password)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,
Expand All @@ -117,3 +117,97 @@ def read_pdf(
**kwargs
)
return tables


def read_file_obj(file_obj, pages='1', password=None, flavor='lattice', suppress_stdout=False, layout_kwargs=None,
                  **kwargs):
    """Read a PDF from an already-open file object and return extracted tables.

    Note: kwargs annotated with ^ can only be used with flavor='stream'
    and kwargs annotated with * can only be used with flavor='lattice'.

    Parameters
    ----------
    file_obj : file object
        File object of the PDF file, e.g. the result of
        ``open(path, 'rb')``. The caller remains responsible for
        closing it.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    flavor : str (default: 'lattice')
        The parsing method to use ('lattice' or 'stream').
        Lattice is used by default.
    suppress_stdout : bool, optional (default: False)
        If True, warnings raised while parsing are ignored. The flag
        is also forwarded to the handler's ``parse`` call.
    layout_kwargs : dict, optional (default: None)
        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
        ``None`` is treated as an empty dict.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns^ : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    row_tol^ : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol^ : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
    copy_text* : list, optional (default: None)
        {'h', 'v'}
        Direction in which text in a spanning cell will be copied
        over.
    shift_text* : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Direction in which text in a spanning cell will flow.
    line_tol* : int, optional (default: 2)
        Tolerance parameter used to merge close vertical and horizontal
        lines.
    joint_tol* : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize* : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant* : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations* : int, optional (default: 0)
        Number of times for erosion/dilation is applied.
        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    resolution* : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.

    Returns
    -------
    tables : camelot.core.TableList

    Raises
    ------
    NotImplementedError
        If ``flavor`` is neither 'lattice' nor 'stream'.

    """
    # Reject an unknown flavor before any other work is done.
    if flavor not in ('lattice', 'stream'):
        raise NotImplementedError("Unknown flavor specified."
                                  " Use either 'lattice' or 'stream'")

    # Build a fresh dict per call instead of sharing one mutable default
    # object across every invocation of this function.
    if layout_kwargs is None:
        layout_kwargs = {}

    with warnings.catch_warnings():
        # The filter change is scoped to this block; the previous warning
        # configuration is restored when the context manager exits.
        if suppress_stdout:
            warnings.simplefilter("ignore")

        validate_input(kwargs, flavor=flavor)
        p = PDFHandler(file_obj=file_obj, pages=pages, password=password)
        kwargs = remove_extra(kwargs, flavor=flavor)
        tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
                         layout_kwargs=layout_kwargs, **kwargs)
        return tables
11 changes: 11 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@ def test_password():
assert df.equals(tables[0].df)


def test_file_object():
    """Tables can be read from an open file object with either password."""
    df = pd.DataFrame(data_stream)

    filename = os.path.join(testdir, "health_protected.pdf")
    # Use a context manager so the handle is closed even when an assertion
    # fails; the original bare open() leaked the descriptor.
    with open(filename, "rb") as file_object:
        tables = camelot.read_file_obj(file_object, password="ownerpass", flavor="stream")
        assert df.equals(tables[0].df)

        # The same handle is deliberately reused for the second read.
        tables = camelot.read_file_obj(file_object, password="userpass", flavor="stream")
        assert df.equals(tables[0].df)

def test_stream():
df = pd.DataFrame(data_stream)

Expand Down