Align with tabula-java 1.0.1 (#46)

* Add .venv in .gitignore * Bump up tabula-java to v1.0.1 * Bump up pandas * Remove deprecated function `read_pdf_table()` * Deprecate spreadsheet and nospreadsheet options * Remove deprecated function * Update README.md * Fix tox.ini * Avoid pkg_resources.VersionConflict on travis * Upgrade setuptools
chezou · Aug 8, 2017 · 14b70ee · 14b70ee
1 parent 6bcd36b
commit 14b70ee
Show file tree

Hide file tree

Showing 11 changed files with 45 additions and 37 deletions.
diff --git a/.gitignore b/.gitignore
@@ -59,6 +59,7 @@ target/
 .python-version
 
 venv
+.venv
 *~
 
-.vscode
+.vscode
diff --git a/.travis.yml b/.travis.yml
@@ -3,7 +3,11 @@ language: python
 python:
   - 2.7
   - 3.5
+before_install:
+  - pip install --upgrade setuptools
 install:
+# Avoid pkg_resources.VersionConflict; see also: https://github.com/ryanhiebert/tox-travis/issues/26
+  - pip install tox
   - pip install tox-travis
   - pip install coverage coveralls
 script:

diff --git a/README.md b/README.md
@@ -26,6 +26,12 @@ I confirmed working on macOS and Ubuntu. I can't fully support Windows environme
 pip install tabula-py
 ```
 
+If you want to become a contributor, you can install the development dependencies for tabula-py as follows:
+
+```
+pip install -r requirements.txt -c constraints.txt
+```
+
 ## Example
 
 tabula-py enables you to extract tables from a PDF into a DataFrame or JSON. It can also extract tables from a PDF and save them to a file as CSV, TSV or JSON.
@@ -58,10 +64,10 @@ See [example notebook](./examples/tabula_example.ipynb)
 - area (`list` of `float`, optional):
   - Portion of the page to analyze(top,left,bottom,right).
   - Example: [269.875, 12.75, 790.5, 561]. Default is entire page
-- spreadsheet (bool, optional):
-  - Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)
-- nospreadsheet (bool, optional):
-  - Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)
+- lattice (bool, optional):
+  - [`spreadsheet` option is deprecated] Force PDF to be extracted using lattice-mode extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet). 
+- stream (bool, optional):
+  - [`nospreadsheet` option is deprecated] Force PDF to be extracted using stream-mode extraction (if there are no ruling lines separating each cell)
 - password (str, optional):
   - Password to decrypt document. Default is empty
 - silent (bool, optional):
@@ -126,7 +132,7 @@ For example, using macOS's preview, I got area information of this [PDF](https:/
 
 
 ```
-java -jar ./target/tabula-0.9.0-jar-with-dependencies.jar -p all -a $y1,$x1,$y2,$x2 -o $csvfile $filename
+java -jar ./target/tabula-1.0.1-jar-with-dependencies.jar -p all -a $y1,$x1,$y2,$x2 -o $csvfile $filename
 ```
 
 given
@@ -143,7 +149,7 @@ x2 = left + width
 I confirmed with tabula-java:
 
 ```
-java -jar ./tabula/tabula-0.9.1-jar-with-dependencies.jar -a "337.29,226.49,472.85,384.91" table.pdf
+java -jar ./tabula/tabula-1.0.1-jar-with-dependencies.jar -a "337.29,226.49,472.85,384.91" table.pdf
 ```
 
 Without the `-r` (same as `--spreadsheet`) option, it does not work properly.

diff --git a/constraints.txt b/constraints.txt
diff --git a/requirements.txt b/requirements.txt
@@ -1,11 +1,4 @@
-mccabe==0.5.3
-numpy==1.11.3
-pandas==0.19.2
-python-dateutil==2.6.0
-pytz==2016.10
-requests==2.12.4
-six==1.10.0
-flake8==3.2.1
-pycodestyle==2.2.0
-pyflakes==1.3.0
-pytest==3.0.5
+requests
+pandas
+pytest
+flake8
diff --git a/tabula/__init__.py b/tabula/__init__.py
@@ -1,4 +1,3 @@
-from .wrapper import read_pdf_table
 from .wrapper import read_pdf
 from .wrapper import convert_into
 from .wrapper import convert_into_by_batch
diff --git a/...la/tabula-0.9.2-jar-with-dependencies.jar → ...la/tabula-1.0.1-jar-with-dependencies.jar b/...la/tabula-0.9.2-jar-with-dependencies.jar → ...la/tabula-1.0.1-jar-with-dependencies.jar
diff --git a/tabula/util.py b/tabula/util.py
@@ -11,4 +11,8 @@ def newFunc(*args, **kwargs):
     newFunc.__name__ = func.__name__
     newFunc.__doc__ = func.__doc__
     newFunc.__dict__.update(func.__dict__)
-    return newFunc
+    return newFunc
+
+def deprecated_option(option):
+    warnings.warn("Call to deprecated option {}.".format(option),
+                  category=DeprecationWarning, stacklevel=2)
diff --git a/tabula/wrapper.py b/tabula/wrapper.py
@@ -16,9 +16,9 @@
 import requests
 import pandas as pd
 import numpy as np
-from .util import deprecated
+from .util import deprecated_option
 
-JAR_NAME = "tabula-0.9.2-jar-with-dependencies.jar"
+JAR_NAME = "tabula-1.0.1-jar-with-dependencies.jar"
 JAR_DIR = os.path.abspath(os.path.dirname(__file__))
 JAR_PATH = os.path.join(JAR_DIR, JAR_NAME)
 
@@ -97,10 +97,6 @@ def read_pdf(input_path,
         return pd.read_csv(io.BytesIO(output), **pandas_options)
 
 
-# Set alias for future rename from `read_pdf_table` to `read_pdf`
-read_pdf_table = deprecated(read_pdf)
-
-
 def convert_into(input_path, output_path, output_format='csv', java_options=None, **kwargs):
     '''Convert tables from PDF into a file.
 
@@ -287,12 +283,12 @@ def build_options(kwargs=None):
         area (:obj:`list` of :obj:`float`, optional):
             Portion of the page to analyze(top,left,bottom,right).
             Example: [269.875,12.75,790.5,561]. Default is entire page
-        spreadsheet (bool, optional):
-            Force PDF to be extracted using spreadsheet-style extraction
+        lattice (bool, optional):
+            Force PDF to be extracted using lattice-mode extraction
             (if there are ruling lines separating each cell, as in a PDF of an
             Excel spreadsheet)
-        nospreadsheet (bool, optional):
-            Force PDF not to be extracted using spreadsheet-style extraction
+        stream (bool, optional):
+            Force PDF to be extracted using stream-mode extraction
             (if there are no ruling lines separating each cell, unlike a
              PDF of an Excel spreadsheet)
         password (str, optional):
@@ -320,6 +316,10 @@ def build_options(kwargs=None):
     # handle options described in string for backward compatibility
     __options += shlex.split(options)
 
+    DEPRECATED_OPTIONS = ['spreadsheet', 'nospreadsheet']
+    for option in kwargs.keys() and DEPRECATED_OPTIONS:
+        deprecated_option(option)
+
     # parse options
     pages = kwargs.get('pages', 1)
     if pages:
@@ -351,13 +351,13 @@ def build_options(kwargs=None):
     if output_path:
         __options += ["--outfile", output_path]
 
-    spreadsheet = kwargs.get('spreadsheet')
-    if spreadsheet:
-        __options.append("--spreadsheet")
+    lattice = kwargs.get('lattice') or kwargs.get('spreadsheet')
+    if lattice:
+        __options.append("--lattice")
 
-    nospreadsheet = kwargs.get('nospreadsheet')
-    if nospreadsheet:
-        __options.append("--no-spreadsheet")
+    stream = kwargs.get('stream') or kwargs.get('nospreadsheet')
+    if stream:
+        __options.append("--stream")
 
     columns = kwargs.get('columns')
     if columns:

diff --git a/tests/test_read_pdf_table.py b/tests/test_read_pdf_table.py
@@ -82,7 +82,7 @@ def test_read_pdf_for_multiple_tables(self):
         self.assertEqual(len(tabula.read_pdf(pdf_path, pages=2, multiple_tables=True)), 2)
         self.assertTrue(tabula.read_pdf(pdf_path, pages=1, multiple_tables=True)[0].equals(
             pd.read_csv(expected_csv1, header=None)))
-        with self.assertRaises(pd.parser.CParserError):
+        with self.assertRaises(pd.errors.ParserError):
             tabula.read_pdf(pdf_path, pages=2)
 
     def test_convert_from(self):

diff --git a/tox.ini b/tox.ini
@@ -6,4 +6,5 @@ envlist = py27, py36
 deps =
     -U
     -r{toxinidir}/requirements.txt
+    -c{toxinidir}/constraints.txt
 commands = py.test --doctest-module -v tabula/wrapper.py tests