diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index f49c5c051..9da1d7259 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -41,16 +41,16 @@ jobs:
       run: pip install .[tests,optional_plotting,optional_io_formats,tutorials]
 
     - name: Test with pytest
-      if: ${{ matrix.python-version != '3.9' }}
+      if: ${{ matrix.python-version != '3.8' }}
       run: pytest tests
 
-    # only execute Matplotlib tests on latest Python version
+    # only execute Matplotlib tests on a known stable Python + deps version
    - name: Test with pytest including Matplotlib & Codecov
-      if: ${{ matrix.python-version == '3.9' }}
+      if: ${{ matrix.python-version == '3.8' }}
       run: pytest tests --mpl --cov=./ --cov-report=xml
 
    - name: Upload coverage report to Codecov
-      if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.9' }}
+      if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.8' }}
      uses: codecov/codecov-action@v1
      with:
        file: ./coverage.xml
diff --git a/profile/profile_init.py b/profile/profile_init.py
new file mode 100644
index 000000000..5741a4ecf
--- /dev/null
+++ b/profile/profile_init.py
@@ -0,0 +1,110 @@
+import string
+import numpy as np
+import pandas as pd
+from functools import wraps
+from pathlib import Path
+import time
+
+import pyam
+
+YEARS = range(2010, 2101, 10)
+
+
+def timeit(func):
+    @wraps(func)
+    def timeit_wrapper(*args, **kwargs):
+        start_time = time.perf_counter()
+        result = func(*args, **kwargs)
+        end_time = time.perf_counter()
+        total_time = end_time - start_time
+        return total_time, result
+
+    return timeit_wrapper
+
+
+def join(a):
+    return "".join(a)
+
+
+def gen_str(N, k=1):
+    return np.random.choice(
+        list(string.ascii_lowercase), size=(k, N, len(pyam.IAMC_IDX))
+    )
+
+
+def gen_str_iamc(N, k=1):
+    return np.apply_along_axis(join, 0, gen_str(N, k))
+
+
+def gen_float(N, years=YEARS):
+    return np.random.choice(
+        range(10),
+        size=(
+            N,
+            len(years),
+        ),
+    )
+
+
+@timeit
+def gen_frame(data, fast):
+    if fast:
+        data = data.set_index(pyam.IAMC_IDX)
+    return pyam.IamDataFrame(data, fast=fast)
+
+
+def profile(max=5):
+    data = {"N": [], "time": [], "type": [], "label": []}
+    for N in [int(10**n) for n in np.arange(1, max, step=0.5)]:
+        print(N)
+        for type in ["slow", "fast"]:
+            try:
+                strdata = pd.DataFrame(gen_str_iamc(N, k=5), columns=pyam.IAMC_IDX)
+                fdata = pd.DataFrame(gen_float(N), columns=YEARS)
+                _data = pd.concat([strdata, fdata], axis=1)
+                fast = type == "fast"
+                time, df = gen_frame(_data, fast=fast)
+                print(N, type, time)
+                data["N"].append(N)
+                data["type"].append(type)
+                data["time"].append(time)
+                data["label"].append("autogenerated")
+            except Exception:
+                continue
+    return pd.DataFrame.from_dict(data)
+
+
+@timeit
+def gen_frame_from_file(file, fast):
+    return pyam.IamDataFrame(file, fast=fast)
+
+
+def profile_file(fname):
+    data = {"N": [], "time": [], "type": [], "label": []}
+    for type in ["slow", "fast"]:
+        time, df = gen_frame_from_file(fname, fast=type == "fast")
+        data["N"].append(len(df))
+        data["type"].append(type)
+        data["time"].append(time)
+        data["label"].append("from file")
+    return pd.DataFrame.from_dict(data)
+
+
+def main():
+    # requires downloading AR6 dataset and placing it in the data folder
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+
+    dfp = profile(max=7)
+    df6 = profile_file(fname=Path("./data/AR6_Scenarios_Database_World_v1.0.csv"))
+    df = pd.concat([dfp, df6]).reset_index()
+    df.to_csv("profile_init.csv")
+    print(df)
+    fig, ax = plt.subplots()
+    sns.lineplot(data=df, x="N", y="time", hue="type", ax=ax)
+    ax.set(xscale="log")
+    fig.savefig("profile_init.png")
+
+
+if __name__ == "__main__":
+    main()
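The string generation above is terse: gen_str draws single random characters into an array of shape (k, N, len(pyam.IAMC_IDX)), and gen_str_iamc joins along the first axis to produce one random k-character token per IAMC column. A minimal sketch of the expected shapes, reusing gen_str_iamc from the script (the values themselves are random, and the import assumes the script's folder is on the path):

    import pyam
    from profile_init import gen_str_iamc  # assumes profile/ is the working directory

    arr = gen_str_iamc(3, k=5)
    assert arr.shape == (3, len(pyam.IAMC_IDX))  # one string per IAMC column
    assert all(len(s) == 5 for s in arr.ravel())  # each string has k=5 characters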
y="time", hue="type", ax=ax) + ax.set(xscale="log") + fig.savefig("profile_init.png") + + +if __name__ == "__main__": + main() diff --git a/pyam/core.py b/pyam/core.py index 5bd6d0a25..a016627de 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -28,6 +28,7 @@ write_sheet, read_file, read_pandas, + fast_format_data, format_data, merge_meta, find_depth, @@ -120,7 +121,7 @@ class IamDataFrame(object): for those who are not used to the pandas/Python universe. """ - def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): + def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs): """Initialize an instance of an IamDataFrame""" if isinstance(data, IamDataFrame): if kwargs: @@ -133,9 +134,9 @@ def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): for attr, value in data.__dict__.items(): setattr(self, attr, value) else: - self._init(data, meta, index=index, **kwargs) + self._init(data, meta, index=index, fast=fast, **kwargs) - def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): + def _init(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs): """Process data and set attributes for new instance""" # pop kwarg for meta_sheet_name (prior to reading data from file) @@ -159,11 +160,14 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): if not data.is_file(): raise FileNotFoundError(f"No such file: '{data}'") logger.info(f"Reading file {data}") - _data = read_file(data, index=index, **kwargs) + _data = read_file(data, index=index, fast=fast, **kwargs) # cast data from pandas - elif isinstance(data, pd.DataFrame) or isinstance(data, pd.Series): - _data = format_data(data.copy(), index=index, **kwargs) + elif isinstance(data, (pd.DataFrame, pd.Series)): + if fast: + _data = fast_format_data(data, index=index, **kwargs) + else: + _data = format_data(data.copy(), index=index, **kwargs) # unsupported `data` args elif islistable(data): diff --git a/pyam/utils.py b/pyam/utils.py index d1f9de068..b47c42fcd 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -169,20 +169,125 @@ def is_empty(name, s): return False empty_cols = [c for c in df.columns if is_empty(c, df[c])] - return df.drop(columns=empty_cols).dropna(axis=0, how="all") + df.drop(columns=empty_cols, inplace=True) + df.dropna(axis=0, how="all", inplace=True) + return df -def read_file(path, *args, **kwargs): +def read_file(path, fast=False, *args, **kwargs): """Read data from a file""" # extract kwargs that are intended for `format_data` format_kwargs = dict(index=kwargs.pop("index")) for c in [i for i in IAMC_IDX + ["year", "time", "value"] if i in kwargs]: format_kwargs[c] = kwargs.pop(c) - return format_data(read_pandas(path, *args, **kwargs), **format_kwargs) + data = read_pandas(path, *args, **kwargs) + if fast: + # determine non-data columns + extra_cols, time_col, data_cols = intuit_column_groups(data) + # format columns for fast reading + data = data.rename(columns={c: str(c).lower() for c in extra_cols}) + extra_cols = [str(c).lower() for c in extra_cols] + for c in format_kwargs["index"]: + extra_cols.remove(c) + # support databases + if "notes" in data.columns: + data = format_from_database(data) + extra_cols.remove("notes") + # force integer year columns + if time_col == "year": + data = data.rename(columns={c: int(c) for c in data_cols}) + # support file data in long format + if "value" in extra_cols: + extra_cols.remove("value") + idx = IAMC_IDX + list(set(format_kwargs["index"] + extra_cols) - set(IAMC_IDX)) + return 
diff --git a/pyam/utils.py b/pyam/utils.py
index d1f9de068..b47c42fcd 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -169,20 +169,125 @@ def is_empty(name, s):
         return False
 
     empty_cols = [c for c in df.columns if is_empty(c, df[c])]
-    return df.drop(columns=empty_cols).dropna(axis=0, how="all")
+    df.drop(columns=empty_cols, inplace=True)
+    df.dropna(axis=0, how="all", inplace=True)
+    return df
 
 
-def read_file(path, *args, **kwargs):
+def read_file(path, fast=False, *args, **kwargs):
     """Read data from a file"""
     # extract kwargs that are intended for `format_data`
     format_kwargs = dict(index=kwargs.pop("index"))
     for c in [i for i in IAMC_IDX + ["year", "time", "value"] if i in kwargs]:
         format_kwargs[c] = kwargs.pop(c)
-    return format_data(read_pandas(path, *args, **kwargs), **format_kwargs)
+    data = read_pandas(path, *args, **kwargs)
+    if fast:
+        # determine non-data columns
+        extra_cols, time_col, data_cols = intuit_column_groups(data)
+        # format columns for fast reading
+        data = data.rename(columns={c: str(c).lower() for c in extra_cols})
+        extra_cols = [str(c).lower() for c in extra_cols]
+        for c in format_kwargs["index"]:
+            extra_cols.remove(c)
+        # support databases
+        if "notes" in data.columns:
+            data = format_from_database(data)
+            extra_cols.remove("notes")
+        # force integer year columns
+        if time_col == "year":
+            data = data.rename(columns={c: int(c) for c in data_cols})
+        # support file data in long format
+        if "value" in extra_cols:
+            extra_cols.remove("value")
+        idx = IAMC_IDX + list(set(format_kwargs["index"] + extra_cols) - set(IAMC_IDX))
+        return fast_format_data(data.set_index(idx), **format_kwargs)
+    else:
+        return format_data(data, **format_kwargs)
+
+
+def intuit_column_groups(df, index=[]):
+    cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
+    year_cols, time_cols, extra_cols = [], [], []
+    for i in cols:
+        # if the column name can be cast to integer, assume it's a year column
+        try:
+            int(i)
+            year_cols.append(i)
+
+        # otherwise, try casting to datetime
+        except (ValueError, TypeError):
+            try:
+                dateutil.parser.parse(str(i))
+                time_cols.append(i)
+
+            # neither year nor datetime, so it is an extra column
+            except ValueError:
+                extra_cols.append(i)
+    if year_cols and not time_cols:
+        time_col = "year"
+        melt_cols = sorted(year_cols)
+    else:
+        time_col = "time"
+        melt_cols = sorted(year_cols) + sorted(time_cols)
+    if not melt_cols:
+        raise ValueError("Missing time domain")
+    return extra_cols, time_col, melt_cols
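intuit_column_groups classifies columns by probing each name: castable to int means a year column, parseable by dateutil means a datetime column, and anything else becomes an extra column. A quick sketch of the expected result on a hypothetical wide frame (assuming pyam's REQUIRED_COLS covers region/variable/unit):

    import pandas as pd

    df = pd.DataFrame(
        columns=["model", "scenario", "region", "variable", "unit", "subannual", 2005, 2010]
    )
    extra_cols, time_col, melt_cols = intuit_column_groups(df, index=["model", "scenario"])
    # expected: extra_cols == ["subannual"], time_col == "year", melt_cols == [2005, 2010]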
s.split("-")[0].strip()) - df.loc[:, "scenario"] = scen.apply( - lambda s: "-".join(s.split("-")[1:]).strip() - ) + df = format_from_database(df) # reset the index if meaningful entries are included there if not list(df.index.names) == [None]: @@ -274,6 +369,9 @@ def convert_r_columns(c): if missing_required_col: raise ValueError(f"Missing required columns: {missing_required_col}") + # replace missing units by an empty string for user-friendly filtering + df.loc[df.unit.isnull(), "unit"] = "" + # check whether data in wide format (IAMC) or long format (`value` column) if "value" in df.columns: # check if time column is given as `year` (int) or `time` (datetime) @@ -288,6 +386,7 @@ def convert_r_columns(c): for c in df.columns if c not in index + REQUIRED_COLS + [time_col, "value"] ] + wide = False else: # if in wide format, check if columns are years (int) or datetime cols = [c for c in df.columns if c not in index + REQUIRED_COLS] @@ -316,19 +415,38 @@ def convert_r_columns(c): melt_cols = sorted(year_cols) + sorted(time_cols) if not melt_cols: raise ValueError("Missing time domain") + wide = True + + # verify that there are no nan's left (in columns), and transform data + idx = index + REQUIRED_COLS + extra_cols + null_rows = df[idx].isnull().T.any() + if null_rows.any(): + _df = df[idx] + cols = ", ".join(_df.columns[_df.isnull().any().values]) + raise_data_error( + f"Empty cells in `data` (columns: '{cols}')", _df.loc[null_rows] + ) + del null_rows - # melt the dataframe - df = pd.melt( - df, - id_vars=index + REQUIRED_COLS + extra_cols, - var_name=time_col, - value_vars=melt_cols, - value_name="value", + if wide: + df = ( + df + .set_index(idx) + [melt_cols] + .rename_axis(columns=time_col) + .stack() + ) + df.name = "value" + else: + df = ( + df + .set_index(idx + [time_col]) + ['value'] ) # cast value column to numeric and drop nan try: - df["value"] = pd.to_numeric(df["value"]) + df = pd.to_numeric(df) except ValueError as e: # get the row number where the error happened row_nr_regex = re.compile(r"(?<=at position )\d+") @@ -337,23 +455,7 @@ def convert_r_columns(c): short_error = short_error_regex.search(str(e)).group() raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]]) - df.dropna(inplace=True, subset=["value"]) - - # replace missing units by an empty string for user-friendly filtering - df.loc[df.unit.isnull(), "unit"] = "" - - # verify that there are no nan's left (in columns) - null_rows = df.isnull().T.any() - if null_rows.any(): - cols = ", ".join(df.columns[df.isnull().any().values]) - raise_data_error( - f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] - ) - del null_rows - - # cast to pd.Series, check for duplicates - idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols - df = df.set_index(idx_cols).value + df = df.dropna() # format the time-column _time = [to_time(i) for i in get_index_levels(df.index, time_col)] diff --git a/tests/test_core.py b/tests/test_core.py index aa0064cc5..62e9ade97 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -73,6 +73,12 @@ def test_init_from_iamdf(test_df_year): assert test_df_year.scenario == ["scen_b", "scen_foo"] +def test_init_fast(test_df_year): + obs = IamDataFrame(test_df_year, fast=True) + exp = IamDataFrame(test_df_year) + assert_iamframe_equal(obs, exp) + + def test_init_from_iamdf_raises(test_df_year): # casting an IamDataFrame instance again with extra args fails match = "Invalid arguments for initializing from IamDataFrame: {'model': 'foo'}" diff --git a/tests/test_io.py 
diff --git a/tests/test_core.py b/tests/test_core.py
index aa0064cc5..62e9ade97 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -73,6 +73,12 @@ def test_init_from_iamdf(test_df_year):
     assert test_df_year.scenario == ["scen_b", "scen_foo"]
 
 
+def test_init_fast(test_df_year):
+    obs = IamDataFrame(test_df_year, fast=True)
+    exp = IamDataFrame(test_df_year)
+    assert_iamframe_equal(obs, exp)
+
+
 def test_init_from_iamdf_raises(test_df_year):
     # casting an IamDataFrame instance again with extra args fails
     match = "Invalid arguments for initializing from IamDataFrame: {'model': 'foo'}"
diff --git a/tests/test_io.py b/tests/test_io.py
index 56add9217..94a384a10 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -226,14 +226,16 @@ def test_load_meta_empty(test_pd_df):
     assert_iamframe_equal(obs, exp)
 
 
-def test_load_ssp_database_downloaded_file(test_pd_df):
+@pytest.mark.parametrize("fast", [True, False])
+def test_load_ssp_database_downloaded_file(test_pd_df, fast):
     exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
     file = TEST_DATA_DIR / "test_SSP_database_raw_download.xlsx"
-    obs_df = IamDataFrame(file)
+    obs_df = IamDataFrame(file, fast=fast)
     pd.testing.assert_frame_equal(obs_df.as_pandas(), exp)
 
 
 def test_load_rcp_database_downloaded_file(test_pd_df):
+    # RCP data is not yet tested with fast=True because it requires additional processing
     exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
     file = TEST_DATA_DIR / "test_RCP_database_raw_download.xlsx"
     obs_df = IamDataFrame(file)
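Regarding the RCP caveat in the last test: in raw RCP database downloads, model and scenario arrive jammed together in a single scenario field, and format_from_database splits them on the first hyphen. A small sketch of that split (the sample string is illustrative of the RCP naming pattern, not taken from the test data):

    s = "IMAGE - RCP3-PD (2.6)"
    model = s.split("-")[0].strip()                # "IMAGE"
    scenario = "-".join(s.split("-")[1:]).strip()  # "RCP3-PD (2.6)"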