atlanhq · simonghrt · Sep 13, 2019
diff --git a/camelot/__init__.py b/camelot/__init__.py
@@ -3,7 +3,7 @@
 import logging
 
 from .__version__ import __version__
-from .io import read_pdf
+from .io import read_pdf, read_file_obj
 from .plotting import PlotMethods
 
 

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -24,38 +24,50 @@ class PDFHandler(object):
 
     Parameters
     ----------
-    filepath : str
-        Filepath or URL of the PDF file.
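+    file_path : str, optional (default: '')
+        File path or URL of the PDF file.
+    file_obj : file-like object, optional (default: '')
+        File object of the PDF file. Either file_path or file_obj must be given.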
     pages : str, optional (default: '1')
         Comma-separated page numbers.
         Example: '1,3,4' or '1,4-end' or 'all'.
     password : str, optional (default: None)
         Password for decryption.
 
     """
-
-    def __init__(self, filepath, pages="1", password=None):
-        if is_url(filepath):
-            filepath = download_url(filepath)
-        self.filepath = filepath
-        if not filepath.lower().endswith(".pdf"):
-            raise NotImplementedError("File format not supported")
-
+    def __init__(self, file_path="", file_obj="", pages='1', password=None):
         if password is None:
-            self.password = ""
+            self.password = ''
         else:
             self.password = password
             if sys.version_info[0] < 3:
-                self.password = self.password.encode("ascii")
-        self.pages = self._get_pages(self.filepath, pages)
+                self.password = self.password.encode('ascii')
+
+        if file_path:
+            if is_url(file_path):
+                file_path = download_url(file_path)
+            if not file_path.lower().endswith('.pdf'):
+                raise NotImplementedError("File format not supported")
+
+        self.file_path = file_path
+        self.file_obj = file_obj
+
+        if self.file_path:
+            self.pages = self._get_pages(filepath=file_path, pages=pages)
+        elif self.file_obj:
+            self.pages = self._get_pages(fileObj=file_obj, pages=pages)
+        else:
+            raise ValueError("You must have either file_path or file_obj not empty")
 
-    def _get_pages(self, filepath, pages):
+    def _get_pages(self, filepath="", pages="1", fileObj=""):
         """Converts pages string to list of ints.
 
         Parameters
         ----------
         filepath : str
             Filepath or URL of the PDF file.
+        fileObj : file-like object, optional (default: '')
+            File object of the PDF file, read when filepath is empty.
         pages : str, optional (default: '1')
             Comma-separated page numbers.
             Example: '1,3,4' or '1,4-end' or 'all'.
@@ -70,7 +82,10 @@ def _get_pages(self, filepath, pages):
         if pages == "1":
             page_numbers.append({"start": 1, "end": 1})
         else:
-            infile = PdfFileReader(open(filepath, "rb"), strict=False)
+            if filepath:
+                infile = PdfFileReader(open(filepath, 'rb'), strict=False)
+            if fileObj:
+                infile = PdfFileReader(fileObj, strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
             if pages == "all":
@@ -89,7 +104,7 @@ def _get_pages(self, filepath, pages):
             P.extend(range(p["start"], p["end"] + 1))
         return sorted(set(P))
 
-    def _save_page(self, filepath, page, temp):
+    def _save_page(self, file_obj, page, temp):
         """Saves specified page from PDF into a temporary directory.
 
         Parameters
@@ -102,38 +117,37 @@ def _save_page(self, filepath, page, temp):
             Tmp directory.
 
         """
-        with open(filepath, "rb") as fileobj:
-            infile = PdfFileReader(fileobj, strict=False)
+        infile = PdfFileReader(file_obj, strict=False)
+        if infile.isEncrypted:
+            infile.decrypt(self.password)
+        fpath = os.path.join(temp, "page-{0}.pdf".format(page))
+        froot, fext = os.path.splitext(fpath)
+        p = infile.getPage(page - 1)
+        outfile = PdfFileWriter()
+        outfile.addPage(p)
+        with open(fpath, "wb") as f:
+            outfile.write(f)
+        layout, dim = get_page_layout(fpath)
+        # fix rotated PDF
+        chars = get_text_objects(layout, ltype="char")
+        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
+        vertical_text = get_text_objects(layout, ltype="vertical_text")
+        rotation = get_rotation(chars, horizontal_text, vertical_text)
+        if rotation != "":
+            fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
+            os.rename(fpath, fpath_new)
+            infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
-            fpath = os.path.join(temp, "page-{0}.pdf".format(page))
-            froot, fext = os.path.splitext(fpath)
-            p = infile.getPage(page - 1)
             outfile = PdfFileWriter()
+            p = infile.getPage(0)
+            if rotation == "anticlockwise":
+                p.rotateClockwise(90)
+            elif rotation == "clockwise":
+                p.rotateCounterClockwise(90)
             outfile.addPage(p)
             with open(fpath, "wb") as f:
                 outfile.write(f)
-            layout, dim = get_page_layout(fpath)
-            # fix rotated PDF
-            chars = get_text_objects(layout, ltype="char")
-            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
-            vertical_text = get_text_objects(layout, ltype="vertical_text")
-            rotation = get_rotation(chars, horizontal_text, vertical_text)
-            if rotation != "":
-                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
-                os.rename(fpath, fpath_new)
-                infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
-                if infile.isEncrypted:
-                    infile.decrypt(self.password)
-                outfile = PdfFileWriter()
-                p = infile.getPage(0)
-                if rotation == "anticlockwise":
-                    p.rotateClockwise(90)
-                elif rotation == "clockwise":
-                    p.rotateCounterClockwise(90)
-                outfile.addPage(p)
-                with open(fpath, "wb") as f:
-                    outfile.write(f)
 
     def parse(
         self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
@@ -162,7 +176,11 @@ def parse(
         tables = []
         with TemporaryDirectory() as tempdir:
             for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
+                if self.file_path != "":
+                    with open(self.file_path, "rb") as file_obj:
+                        self._save_page(file_obj=file_obj, page=p, temp=tempdir)
+                if self.file_obj != "":
+                    self._save_page(file_obj=self.file_obj, page=p, temp=tempdir)
             pages = [
                 os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
             ]

diff --git a/camelot/io.py b/camelot/io.py
@@ -108,7 +108,7 @@ def read_pdf(
             warnings.simplefilter("ignore")
 
         validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(file_path=filepath, pages=pages, password=password)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,
@@ -117,3 +117,97 @@ def read_pdf(
             **kwargs
         )
         return tables
+
+
+def read_file_obj(file_obj, pages='1', password=None, flavor='lattice', suppress_stdout=False, layout_kwargs={},
+                 **kwargs):
+    """Read a PDF from a file object and return extracted tables.
+    Note: kwargs annotated with ^ can only be used with flavor='stream'
+    and kwargs annotated with * can only be used with flavor='lattice'.
+    Parameters
+    ----------
+    file_obj : file-like object
+        File object of the PDF file (e.g. opened in 'rb' mode); passed to PDFHandler.
+    pages : str, optional (default: '1')
+        Comma-separated page numbers.
+        Example: '1,3,4' or '1,4-end' or 'all'.
+    password : str, optional (default: None)
+        Password for decryption.
+    flavor : str (default: 'lattice')
+        The parsing method to use ('lattice' or 'stream').
+        Lattice is used by default.
+    suppress_stdout : bool, optional (default: False)
+        If True, warnings are ignored; the flag is also passed to PDFHandler.parse.
+    layout_kwargs : dict, optional (default: {})
+        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+    table_areas : list, optional (default: None)
+        List of table area strings of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
+    columns^ : list, optional (default: None)
+        List of column x-coordinates strings where the coordinates
+        are comma-separated.
+    split_text : bool, optional (default: False)
+        Split text that spans across multiple cells.
+    flag_size : bool, optional (default: False)
+        Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
+    row_tol^ : int, optional (default: 2)
+        Tolerance parameter used to combine text vertically,
+        to generate rows.
+    column_tol^ : int, optional (default: 0)
+        Tolerance parameter used to combine text horizontally,
+        to generate columns.
+    process_background* : bool, optional (default: False)
+        Process background lines.
+    line_scale* : int, optional (default: 15)
+        Line size scaling factor. The larger the value the smaller
+        the detected lines. Making it very large will lead to text
+        being detected as lines.
+    copy_text* : list, optional (default: None)
+        {'h', 'v'}
+        Direction in which text in a spanning cell will be copied
+        over.
+    shift_text* : list, optional (default: ['l', 't'])
+        {'l', 'r', 't', 'b'}
+        Direction in which text in a spanning cell will flow.
+    line_tol* : int, optional (default: 2)
+        Tolerance parameter used to merge close vertical and horizontal
+        lines.
+    joint_tol* : int, optional (default: 2)
+        Tolerance parameter used to decide whether the detected lines
+        and points lie close to each other.
+    threshold_blocksize* : int, optional (default: 15)
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    threshold_constant* : int, optional (default: -2)
+        Constant subtracted from the mean or weighted mean.
+        Normally, it is positive but may be zero or negative as well.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    iterations* : int, optional (default: 0)
+        Number of times for erosion/dilation is applied.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+    resolution* : int, optional (default: 300)
+        Resolution used for PDF to PNG conversion.
+    Returns
+    -------
+    tables : camelot.core.TableList
+    """
+    if flavor not in ['lattice', 'stream']:
+        raise NotImplementedError("Unknown flavor specified."
+                                  " Use either 'lattice' or 'stream'")
+
+    with warnings.catch_warnings():
+        if suppress_stdout:
+            warnings.simplefilter("ignore")
+
+        validate_input(kwargs, flavor=flavor)
+        p = PDFHandler(file_obj=file_obj, pages=pages, password=password)
+        kwargs = remove_extra(kwargs, flavor=flavor)
+        tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
+                         layout_kwargs=layout_kwargs, **kwargs)
+        return tables
diff --git a/tests/test_common.py b/tests/test_common.py
@@ -32,6 +32,17 @@ def test_password():
     assert df.equals(tables[0].df)
 
 
+def test_file_object():
+    """Tables are read from an open file object with the owner or user password."""
+    df = pd.DataFrame(data_stream)
+    filename = os.path.join(testdir, "health_protected.pdf")
+    # Open via a context manager so the handle is closed even if an assert fails.
+    with open(filename, "rb") as fobj:
+        for password in ("ownerpass", "userpass"):
+            tables = camelot.read_file_obj(fobj, password=password, flavor="stream")
+            assert df.equals(tables[0].df)
+
+
 def test_stream():
     df = pd.DataFrame(data_stream)