Skip to content

Commit

Permalink
Remove 'xls' as by-default-supported file format (#708)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielhuppmann committed Oct 4, 2022
1 parent 947d8c4 commit 60d84d8
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 50 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pytest-dependency.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.7'
python-version: '3.7.1'

- name: Install specific outdated version of dependencies
# Update the package requirements when changing minimum dependency versions
# Please also add a section "Dependency changes" to the release notes
run: pip install pandas==1.1.1 numpy==1.19.0 matplotlib==3.5.0 iam-units==2020.4.21 xlrd==2.0 pint==0.13
run: pip install pandas==1.2.0 numpy==1.19.0 matplotlib==3.5.0 iam-units==2020.4.21 xlrd==2.0 pint==0.13

- name: Install other dependencies and package
run: pip install .[tests,optional_plotting,optional_io_formats,tutorials]
Expand Down
9 changes: 9 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# Next Release

## Dependency changes

Remove **xlrd** as a dependency; please install it explicitly for reading `.xls` files.
Bump minimum version of **pandas** to v1.2.0 to support automatic engine selection.

## Individual updates

- [#708](https://github.com/IAMconsortium/pyam/pull/708) Remove 'xls' as by-default-supported file format

# Release v1.6.0

## Highlights
Expand Down
67 changes: 38 additions & 29 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,8 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
self.set_meta(meta)

# if initializing from xlsx, try to load `meta` table from file
if meta_sheet and isinstance(data, Path) and data.suffix == ".xlsx":
excel_file = pd.ExcelFile(data, engine="openpyxl")
if meta_sheet and isinstance(data, Path) and data.suffix in [".xlsx", ".xls"]:
excel_file = pd.ExcelFile(data)
if meta_sheet in excel_file.sheet_names:
self.load_meta(excel_file, sheet_name=meta_sheet, ignore_conflict=True)

Expand Down Expand Up @@ -1344,7 +1344,7 @@ def check_aggregate(
List of variables to aggregate, defaults to sub-categories of `variable`.
method : func or str, optional
Method to use for aggregation,
e.g. :any:`numpy.mean`, :aby:`numpy.sum`, 'min', 'max'.
e.g. :any:`numpy.mean`, :any:`numpy.sum`, 'min', 'max'.
exclude_on_fail : bool, optional
Flag scenarios failing validation as `exclude: True`.
multiplier : number, optional
Expand Down Expand Up @@ -2261,18 +2261,23 @@ def _to_file_format(self, iamc_index):
df = df.rename(columns={c: str(c).title() for c in df.columns})
return df

def to_csv(self, path, iamc_index=False, **kwargs):
"""Write timeseries data of this object to a csv file
def to_csv(self, path=None, iamc_index=False, **kwargs):
"""Write :meth:`IamDataFrame.timeseries` to a comma-separated values (csv) file
Parameters
----------
path : str or path object
file path or :class:`pathlib.Path`
iamc_index : bool, default False
if True, use `['model', 'scenario', 'region', 'variable', 'unit']`;
else, use all 'data' columns
path : str, path or file-like, optional
File path as string or :class:`pathlib.Path`, or file-like object.
If *None*, the result is returned as a csv-formatted string.
See :meth:`pandas.DataFrame.to_csv` for details.
iamc_index : bool, optional
If True, use `['model', 'scenario', 'region', 'variable', 'unit']`;
else, use all :attr:`dimensions`.
See :meth:`IamDataFrame.timeseries` for details.
**kwargs
Passed to :meth:`pandas.DataFrame.to_csv`.
"""
self._to_file_format(iamc_index).to_csv(path, index=False, **kwargs)
return self._to_file_format(iamc_index).to_csv(path, index=False, **kwargs)

def to_excel(
self,
Expand All @@ -2286,30 +2291,33 @@ def to_excel(
Parameters
----------
excel_writer : str, path object or ExcelWriter object
any valid string path, :class:`pathlib.Path`
or :class:`pandas.ExcelWriter`
excel_writer : path-like, file-like, or ExcelWriter object
File path as string or :class:`pathlib.Path`,
or existing :class:`pandas.ExcelWriter`.
sheet_name : string
name of sheet which will contain :meth:`timeseries` data
iamc_index : bool, default False
if True, use `['model', 'scenario', 'region', 'variable', 'unit']`;
else, use all 'data' columns
include_meta : boolean or string
if True, write 'meta' to an Excel sheet name 'meta' (default);
if this is a string, use it as sheet name
Name of sheet which will contain :meth:`IamDataFrame.timeseries` data.
iamc_index : bool, optional
If True, use `['model', 'scenario', 'region', 'variable', 'unit']`;
else, use all :attr:`dimensions`.
See :meth:`IamDataFrame.timeseries` for details.
include_meta : boolean or string, optional
If True, write :any:`IamDataFrame.meta` to a sheet 'meta' (default);
if this is a string, use it as sheet name.
**kwargs
Passed to :class:`pandas.ExcelWriter` (if *excel_writer* is path-like)
"""
# open a new ExcelWriter instance (if necessary)
close = False
if not isinstance(excel_writer, pd.ExcelWriter):
close = True
excel_writer = pd.ExcelWriter(excel_writer, engine="xlsxwriter")
excel_writer = pd.ExcelWriter(excel_writer, **kwargs)

# write data table
write_sheet(excel_writer, sheet_name, self._to_file_format(iamc_index))

# write meta table unless `include_meta=False`
if include_meta:
meta_rename = dict([(i, i.capitalize()) for i in META_IDX])
meta_rename = dict([(i, i.capitalize()) for i in self.index.names])
write_sheet(
excel_writer,
"meta" if include_meta is True else include_meta,
Expand All @@ -2320,20 +2328,21 @@ def to_excel(
if close:
excel_writer.close()

def export_meta(self, excel_writer, sheet_name="meta"):
"""Write the 'meta' indicators of this object to an Excel sheet
def export_meta(self, excel_writer, sheet_name="meta", **kwargs):
"""Write the 'meta' indicators of this object to an Excel spreadsheet
Parameters
----------
excel_writer : str, path object or ExcelWriter object
any valid string path, :class:`pathlib.Path`
or :class:`pandas.ExcelWriter`
File path, :class:`pathlib.Path`, or existing :class:`pandas.ExcelWriter`.
sheet_name : str
name of sheet which will contain dataframe of 'meta' indicators
Name of sheet which will contain 'meta'.
**kwargs
Passed to :class:`pandas.ExcelWriter` (if *excel_writer* is path-like)
"""
close = False
if not isinstance(excel_writer, pd.ExcelWriter):
excel_writer = pd.ExcelWriter(excel_writer, engine="xlsxwriter")
excel_writer = pd.ExcelWriter(excel_writer, **kwargs)
close = True
write_sheet(excel_writer, sheet_name, self.meta, index=True)
if close:
Expand Down
5 changes: 2 additions & 3 deletions pyam/iiasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ def lazy_read_iiasa(
----------
file : str or :class:`pathlib.Path`
The location to test for valid data and save the data if not up-to-date. Must be
either xls, xlsx or csv.
either xlsx or csv.
name : str
| Name of an IIASA Scenario Explorer database instance.
| See :attr:`pyam.iiasa.Connection.valid_connections`.
Expand All @@ -642,8 +642,7 @@ def lazy_read_iiasa(
assert file.suffix in [
".csv",
".xlsx",
".xls",
], "We will only read and write to csv, xls and xlsx format."
], "We will only read and write to csv and xlsx format."
if os.path.exists(file):
date_set = pd.to_datetime(os.path.getmtime(file), unit="s")
version_info = Connection(name, creds, base_url).properties()
Expand Down
12 changes: 4 additions & 8 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,19 +136,15 @@ def write_sheet(writer, name, df, index=False):

def read_pandas(path, sheet_name=["data*", "Data*"], *args, **kwargs):
"""Read a file and return a pandas.DataFrame"""

if isinstance(path, Path) and path.suffix == ".csv":
return pd.read_csv(path, *args, **kwargs)
else:
if isinstance(path, pd.ExcelFile):
xl = path
else:
xl = pd.ExcelFile(
path, engine="xlrd" if path.suffix == ".xls" else "openpyxl"
)

sheet_names = pd.Series(xl.sheet_names)
else:
xl = path if isinstance(path, pd.ExcelFile) else pd.ExcelFile(path)

# reading multiple sheets
sheet_names = pd.Series(xl.sheet_names)
if len(sheet_names) > 1:
sheets = kwargs.pop("sheet_name", sheet_name)
# apply pattern-matching for sheet names (use * as wildcard)
Expand Down
6 changes: 3 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ classifiers =
[options]
packages = pyam
include_package_data = True
python_requires = >=3.7, <3.11
python_requires = >=3.7.1, <3.11

# NOTE TO DEVS
# If you change a minimum version below, please explicitly implement the change
Expand All @@ -32,14 +32,13 @@ install_requires =
pyjwt
httpx[http2]
openpyxl
pandas >= 1.1.1
pandas >= 1.2.0
scipy
pint >= 0.13
PyYAML
matplotlib >= 3.2.0
seaborn
six
xlrd >= 2.0
setuptools >= 41
setuptools_scm
# required explicitly for Python 3.7
Expand All @@ -61,6 +60,7 @@ optional_plotting =
optional_io_formats =
datapackage
pandas-datareader
xlrd
unfccc_di_api >= 3.0.1
tutorials =
pypandoc
Expand Down
Binary file added tests/data/test_df.xls
Binary file not shown.
5 changes: 1 addition & 4 deletions tests/test_iiasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,10 +392,7 @@ def test_lazy_read(tmpdir):
df_newfilt = lazy_read_iiasa(tmp_file, TEST_API, model="model_b")
assert df_newfilt.model == ["model_b"]
assert os.path.getmtime(tmp_file) > writetime
# file can also be xls or xlsx
# file can also be xlsx
xlsx_file = tmpdir / "test_database.xlsx"
df_xlsx = lazy_read_iiasa(xlsx_file, TEST_API, model="model_b")
assert df_newfilt.equals(df_xlsx)
xls_file = tmpdir / "test_database.xls"
df_xls = lazy_read_iiasa(xls_file, TEST_API, model="model_b")
assert df_xls.equals(df_xlsx)
31 changes: 30 additions & 1 deletion tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@

from .conftest import TEST_DATA_DIR, META_DF

try:
import xlrd # noqa: F401

has_xlrd = True
except ModuleNotFoundError: # pragma: no cover
has_xlrd = False


FILTER_ARGS = dict(scenario="scen_a")


Expand Down Expand Up @@ -40,7 +48,7 @@ def test_io_list():
IamDataFrame([1, 2])


def test_io_csv(test_df, tmpdir):
def test_io_csv_to_file(test_df, tmpdir):
# write to csv
file = tmpdir / "testing_io_write_read.csv"
test_df.to_csv(file)
Expand All @@ -50,6 +58,21 @@ def test_io_csv(test_df, tmpdir):
pd.testing.assert_frame_equal(test_df.data, import_df.data)


def test_io_csv_none(test_df_year):
# parse data as csv and return as string
exp = (
"Model,Scenario,Region,Variable,Unit,2005,2010\n"
"model_a,scen_a,World,Primary Energy,EJ/yr,1.0,6.0\n"
"model_a,scen_a,World,Primary Energy|Coal,EJ/yr,0.5,3.0\n"
"model_a,scen_b,World,Primary Energy,EJ/yr,2.0,7.0\n"
)
try:
assert test_df_year.to_csv(lineterminator="\n") == exp
# special treatment for Python 3.7 and pandas < 1.5
except TypeError:
assert test_df_year.to_csv(line_terminator="\n") == exp


@pytest.mark.parametrize(
"meta_args", [[{}, {}], [dict(include_meta="foo"), dict(meta_sheet_name="foo")]]
)
Expand Down Expand Up @@ -92,6 +115,12 @@ def test_io_xlsx_multiple_data_sheets(test_df, sheets, sheetname, tmpdir):
assert_iamframe_equal(test_df, import_df)


@pytest.mark.skipif(not has_xlrd, reason="Package 'xlrd' not installed.")
def test_read_xls(test_df_year):
import_df = IamDataFrame(TEST_DATA_DIR / "test_df.xls")
assert_iamframe_equal(test_df_year, import_df)


def test_init_df_with_na_unit(test_pd_df, tmpdir):
# missing values in the unit column are replaced by an empty string
test_pd_df.loc[1, "unit"] = np.nan
Expand Down

0 comments on commit 60d84d8

Please sign in to comment.