Skip to content

Commit

Permalink
Remove 'xls' as by-default-supported file format (#708)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielhuppmann committed Oct 4, 2022
1 parent 947d8c4 commit 60d84d8
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 50 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pytest-dependency.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.7'
python-version: '3.7.1'

- name: Install specific outdated version of dependencies
# Update the package requirements when changing minimum dependency versions
# Please also add a section "Dependency changes" to the release notes
run: pip install pandas==1.1.1 numpy==1.19.0 matplotlib==3.5.0 iam-units==2020.4.21 xlrd==2.0 pint==0.13
run: pip install pandas==1.2.0 numpy==1.19.0 matplotlib==3.5.0 iam-units==2020.4.21 xlrd==2.0 pint==0.13

- name: Install other dependencies and package
run: pip install .[tests,optional_plotting,optional_io_formats,tutorials]
Expand Down
9 changes: 9 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# Next Release

## Dependency changes

Remove **xlrd** as a dependency; please install it explicitly for reading `.xls` files.
Bump minimum version of **pandas** to v1.2.0 to support automatic engine selection.

## Individual updates

- [#708](https://github.com/IAMconsortium/pyam/pull/708) Remove 'xls' as by-default-supported file format

# Release v1.6.0

## Highlights
Expand Down
67 changes: 38 additions & 29 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,8 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
self.set_meta(meta)

# if initializing from xlsx, try to load `meta` table from file
if meta_sheet and isinstance(data, Path) and data.suffix == ".xlsx":
excel_file = pd.ExcelFile(data, engine="openpyxl")
if meta_sheet and isinstance(data, Path) and data.suffix in [".xlsx", ".xls"]:
excel_file = pd.ExcelFile(data)
if meta_sheet in excel_file.sheet_names:
self.load_meta(excel_file, sheet_name=meta_sheet, ignore_conflict=True)

Expand Down Expand Up @@ -1344,7 +1344,7 @@ def check_aggregate(
List of variables to aggregate, defaults to sub-categories of `variable`.
method : func or str, optional
Method to use for aggregation,
e.g. :any:`numpy.mean`, :aby:`numpy.sum`, 'min', 'max'.
e.g. :any:`numpy.mean`, :any:`numpy.sum`, 'min', 'max'.
exclude_on_fail : bool, optional
Flag scenarios failing validation as `exclude: True`.
multiplier : number, optional
Expand Down Expand Up @@ -2261,18 +2261,23 @@ def _to_file_format(self, iamc_index):
df = df.rename(columns={c: str(c).title() for c in df.columns})
return df

def to_csv(self, path, iamc_index=False, **kwargs):
"""Write timeseries data of this object to a csv file
def to_csv(self, path=None, iamc_index=False, **kwargs):
"""Write :meth:`IamDataFrame.timeseries` to a comma-separated values (csv) file
Parameters
----------
path : str or path object
file path or :class:`pathlib.Path`
iamc_index : bool, default False
if True, use `['model', 'scenario', 'region', 'variable', 'unit']`;
else, use all 'data' columns
path : str, path or file-like, optional
File path as string or :class:`pathlib.Path`, or file-like object.
If *None*, the result is returned as a csv-formatted string.
See :meth:`pandas.DataFrame.to_csv` for details.
iamc_index : bool, optional
If True, use `['model', 'scenario', 'region', 'variable', 'unit']`;
else, use all :attr:`dimensions`.
See :meth:`IamDataFrame.timeseries` for details.
**kwargs
Passed to :meth:`pandas.DataFrame.to_csv`.
"""
self._to_file_format(iamc_index).to_csv(path, index=False, **kwargs)
return self._to_file_format(iamc_index).to_csv(path, index=False, **kwargs)

def to_excel(
self,
Expand All @@ -2286,30 +2291,33 @@ def to_excel(
Parameters
----------
excel_writer : str, path object or ExcelWriter object
any valid string path, :class:`pathlib.Path`
or :class:`pandas.ExcelWriter`
excel_writer : path-like, file-like, or ExcelWriter object
File path as string or :class:`pathlib.Path`,
or existing :class:`pandas.ExcelWriter`.
sheet_name : string
name of sheet which will contain :meth:`timeseries` data
iamc_index : bool, default False
if True, use `['model', 'scenario', 'region', 'variable', 'unit']`;
else, use all 'data' columns
include_meta : boolean or string
if True, write 'meta' to an Excel sheet name 'meta' (default);
if this is a string, use it as sheet name
Name of sheet which will contain :meth:`IamDataFrame.timeseries` data.
iamc_index : bool, optional
If True, use `['model', 'scenario', 'region', 'variable', 'unit']`;
else, use all :attr:`dimensions`.
See :meth:`IamDataFrame.timeseries` for details.
include_meta : boolean or string, optional
If True, write :any:`IamDataFrame.meta` to a sheet 'meta' (default);
if this is a string, use it as sheet name.
**kwargs
Passed to :class:`pandas.ExcelWriter` (if *excel_writer* is path-like)
"""
# open a new ExcelWriter instance (if necessary)
close = False
if not isinstance(excel_writer, pd.ExcelWriter):
close = True
excel_writer = pd.ExcelWriter(excel_writer, engine="xlsxwriter")
excel_writer = pd.ExcelWriter(excel_writer, **kwargs)

# write data table
write_sheet(excel_writer, sheet_name, self._to_file_format(iamc_index))

# write meta table unless `include_meta=False`
if include_meta:
meta_rename = dict([(i, i.capitalize()) for i in META_IDX])
meta_rename = dict([(i, i.capitalize()) for i in self.index.names])
write_sheet(
excel_writer,
"meta" if include_meta is True else include_meta,
Expand All @@ -2320,20 +2328,21 @@ def to_excel(
if close:
excel_writer.close()

def export_meta(self, excel_writer, sheet_name="meta"):
"""Write the 'meta' indicators of this object to an Excel sheet
def export_meta(self, excel_writer, sheet_name="meta", **kwargs):
"""Write the 'meta' indicators of this object to an Excel spreadsheet
Parameters
----------
excel_writer : str, path object or ExcelWriter object
any valid string path, :class:`pathlib.Path`
or :class:`pandas.ExcelWriter`
File path, :class:`pathlib.Path`, or existing :class:`pandas.ExcelWriter`.
sheet_name : str
name of sheet which will contain dataframe of 'meta' indicators
Name of sheet which will contain 'meta'.
**kwargs
Passed to :class:`pandas.ExcelWriter` (if *excel_writer* is path-like)
"""
close = False
if not isinstance(excel_writer, pd.ExcelWriter):
excel_writer = pd.ExcelWriter(excel_writer, engine="xlsxwriter")
excel_writer = pd.ExcelWriter(excel_writer, **kwargs)
close = True
write_sheet(excel_writer, sheet_name, self.meta, index=True)
if close:
Expand Down
5 changes: 2 additions & 3 deletions pyam/iiasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ def lazy_read_iiasa(
----------
file : str or :class:`pathlib.Path`
The location to test for valid data and save the data if not up-to-date. Must be
either xls, xlsx or csv.
either xlsx or csv.
name : str
| Name of an IIASA Scenario Explorer database instance.
| See :attr:`pyam.iiasa.Connection.valid_connections`.
Expand All @@ -642,8 +642,7 @@ def lazy_read_iiasa(
assert file.suffix in [
".csv",
".xlsx",
".xls",
], "We will only read and write to csv, xls and xlsx format."
], "We will only read and write to csv and xlsx format."
if os.path.exists(file):
date_set = pd.to_datetime(os.path.getmtime(file), unit="s")
version_info = Connection(name, creds, base_url).properties()
Expand Down
12 changes: 4 additions & 8 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,19 +136,15 @@ def write_sheet(writer, name, df, index=False):

def read_pandas(path, sheet_name=["data*", "Data*"], *args, **kwargs):
"""Read a file and return a pandas.DataFrame"""

if isinstance(path, Path) and path.suffix == ".csv":
return pd.read_csv(path, *args, **kwargs)
else:
if isinstance(path, pd.ExcelFile):
xl = path
else:
xl = pd.ExcelFile(
path, engine="xlrd" if path.suffix == ".xls" else "openpyxl"
)

sheet_names = pd.Series(xl.sheet_names)
else:
xl = path if isinstance(path, pd.ExcelFile) else pd.ExcelFile(path)

# reading multiple sheets
sheet_names = pd.Series(xl.sheet_names)
if len(sheet_names) > 1:
sheets = kwargs.pop("sheet_name", sheet_name)
# apply pattern-matching for sheet names (use * as wildcard)
Expand Down
6 changes: 3 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ classifiers =
[options]
packages = pyam
include_package_data = True
python_requires = >=3.7, <3.11
python_requires = >=3.7.1, <3.11

# NOTE TO DEVS
# If you change a minimum version below, please explicitly implement the change
Expand All @@ -32,14 +32,13 @@ install_requires =
pyjwt
httpx[http2]
openpyxl
pandas >= 1.1.1
pandas >= 1.2.0
scipy
pint >= 0.13
PyYAML
matplotlib >= 3.2.0
seaborn
six
xlrd >= 2.0
setuptools >= 41
setuptools_scm
# required explicitly for Python 3.7
Expand All @@ -61,6 +60,7 @@ optional_plotting =
optional_io_formats =
datapackage
pandas-datareader
xlrd
unfccc_di_api >= 3.0.1
tutorials =
pypandoc
Expand Down
Binary file added tests/data/test_df.xls
Binary file not shown.
5 changes: 1 addition & 4 deletions tests/test_iiasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,10 +392,7 @@ def test_lazy_read(tmpdir):
df_newfilt = lazy_read_iiasa(tmp_file, TEST_API, model="model_b")
assert df_newfilt.model == ["model_b"]
assert os.path.getmtime(tmp_file) > writetime
# file can also be xls or xlsx
# file can also be xlsx
xlsx_file = tmpdir / "test_database.xlsx"
df_xlsx = lazy_read_iiasa(xlsx_file, TEST_API, model="model_b")
assert df_newfilt.equals(df_xlsx)
xls_file = tmpdir / "test_database.xls"
df_xls = lazy_read_iiasa(xls_file, TEST_API, model="model_b")
assert df_xls.equals(df_xlsx)
31 changes: 30 additions & 1 deletion tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@

from .conftest import TEST_DATA_DIR, META_DF

try:
import xlrd # noqa: F401

has_xlrd = True
except ModuleNotFoundError: # pragma: no cover
has_xlrd = False


FILTER_ARGS = dict(scenario="scen_a")


Expand Down Expand Up @@ -40,7 +48,7 @@ def test_io_list():
IamDataFrame([1, 2])


def test_io_csv(test_df, tmpdir):
def test_io_csv_to_file(test_df, tmpdir):
# write to csv
file = tmpdir / "testing_io_write_read.csv"
test_df.to_csv(file)
Expand All @@ -50,6 +58,21 @@ def test_io_csv(test_df, tmpdir):
pd.testing.assert_frame_equal(test_df.data, import_df.data)


def test_io_csv_none(test_df_year):
# parse data as csv and return as string
exp = (
"Model,Scenario,Region,Variable,Unit,2005,2010\n"
"model_a,scen_a,World,Primary Energy,EJ/yr,1.0,6.0\n"
"model_a,scen_a,World,Primary Energy|Coal,EJ/yr,0.5,3.0\n"
"model_a,scen_b,World,Primary Energy,EJ/yr,2.0,7.0\n"
)
try:
assert test_df_year.to_csv(lineterminator="\n") == exp
# special treatment for Python 3.7 and pandas < 1.5
except TypeError:
assert test_df_year.to_csv(line_terminator="\n") == exp


@pytest.mark.parametrize(
"meta_args", [[{}, {}], [dict(include_meta="foo"), dict(meta_sheet_name="foo")]]
)
Expand Down Expand Up @@ -92,6 +115,12 @@ def test_io_xlsx_multiple_data_sheets(test_df, sheets, sheetname, tmpdir):
assert_iamframe_equal(test_df, import_df)


@pytest.mark.skipif(not has_xlrd, reason="Package 'xlrd' not installed.")
def test_read_xls(test_df_year):
import_df = IamDataFrame(TEST_DATA_DIR / "test_df.xls")
assert_iamframe_equal(test_df_year, import_df)


def test_init_df_with_na_unit(test_pd_df, tmpdir):
# missing values in the unit column are replaced by an empty string
test_pd_df.loc[1, "unit"] = np.nan
Expand Down

0 comments on commit 60d84d8

Please sign in to comment.