From 64a41f66bb19db30a254bb6af376e8e0a1f9c8e0 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Thu, 16 Feb 2023 16:33:22 +0100 Subject: [PATCH 01/12] initial attempt at a fast init --- pyam/core.py | 9 +++--- pyam/utils.py | 58 ++++++++++++++++++++++++++++++++++++++- tests/profile_init.py | 64 +++++++++++++++++++++++++++++++++++++++++++ tests/test_core.py | 4 +++ 4 files changed, 130 insertions(+), 5 deletions(-) create mode 100644 tests/profile_init.py diff --git a/pyam/core.py b/pyam/core.py index 5bd6d0a25..d46ae89ee 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -120,7 +120,7 @@ class IamDataFrame(object): for those who are not used to the pandas/Python universe. """ - def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): + def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs): """Initialize an instance of an IamDataFrame""" if isinstance(data, IamDataFrame): if kwargs: @@ -133,9 +133,9 @@ def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): for attr, value in data.__dict__.items(): setattr(self, attr, value) else: - self._init(data, meta, index=index, **kwargs) + self._init(data, meta, index=index, fast=fast, **kwargs) - def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): + def _init(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs): """Process data and set attributes for new instance""" # pop kwarg for meta_sheet_name (prior to reading data from file) @@ -163,7 +163,8 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs): # cast data from pandas elif isinstance(data, pd.DataFrame) or isinstance(data, pd.Series): - _data = format_data(data.copy(), index=index, **kwargs) + _data = data if fast else data.copy() + _data = format_data(_data, index=index, fast=fast, **kwargs) # unsupported `data` args elif islistable(data): diff --git a/pyam/utils.py b/pyam/utils.py index d1f9de068..0f4ce57db 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -181,8 +181,64 @@ def read_file(path, *args, **kwargs): return format_data(read_pandas(path, *args, **kwargs), **format_kwargs) -def format_data(df, index, **kwargs): +def intuit_column_groups(df, index): + cols = [c for c in df.columns if c not in index + REQUIRED_COLS] + year_cols, time_cols, extra_cols = [], [], [] + for i in cols: + # if the column name can be cast to integer, assume it's a year column + try: + int(i) + year_cols.append(i) + + # otherwise, try casting to datetime + except (ValueError, TypeError): + try: + dateutil.parser.parse(str(i)) + time_cols.append(i) + + # neither year nor datetime, so it is an extra-column + except ValueError: + extra_cols.append(i) + if year_cols and not time_cols: + time_col = "year" + melt_cols = sorted(year_cols) + else: + time_col = "time" + melt_cols = sorted(year_cols) + sorted(time_cols) + if not melt_cols: + raise ValueError("Missing time domain") + return extra_cols, time_col, melt_cols + +def fast_format_data(df, index=DEFAULT_META_INDEX): + if not isinstance(df, pd.DataFrame): + raise TypeError('Fast format only works if provided a pd.DataFrame') + col_diff = set(IAMC_IDX) - set(df.columns) + if col_diff: + raise ValueError(f'Missing required columns: {col_diff}') + + if "value" not in df.columns: + extra_cols, time_col, melt_cols = intuit_column_groups(df, index) + df = pd.melt( + df, + id_vars=index + REQUIRED_COLS + extra_cols, + var_name=time_col, + value_vars=melt_cols, + value_name="value", + ) + + # cast to pd.Series, check for duplicates + idx_cols = index + 
REQUIRED_COLS + [time_col] + extra_cols + df.set_index(idx_cols, inplace=True) + df = df.value + + df.sort_index(inplace=True) + return df, index, time_col, extra_cols + +def format_data(df, index, fast=False, **kwargs): """Convert a pandas.Dataframe or pandas.Series to the required format""" + if fast: + return fast_format_data(df, index) + if isinstance(df, pd.Series): df.name = df.name or "value" df = df.to_frame() diff --git a/tests/profile_init.py b/tests/profile_init.py new file mode 100644 index 000000000..34ad3973c --- /dev/null +++ b/tests/profile_init.py @@ -0,0 +1,64 @@ +import string +import numpy as np +import pandas as pd +from functools import wraps +import time + +import pyam + +YEARS = range(2010, 2101, 10) + + + +def timeit(func): + @wraps(func) + def timeit_wrapper(*args, **kwargs): + start_time = time.perf_counter() + result = func(*args, **kwargs) + end_time = time.perf_counter() + total_time = end_time - start_time + return total_time, result + return timeit_wrapper + +def join(a): + return ''.join(a) + +def gen_str(N, k=1): + return np.random.choice(list(string.ascii_lowercase), size=(k, N,len(pyam.IAMC_IDX))) + +def gen_str_iamc(N, k=1): + return np.apply_along_axis(join, 0, gen_str(N, k)) + +def gen_float(N, years=YEARS): + return np.random.choice(range(10), size=(N,len(years), )) + +@timeit +def gen_frame(strdata, fdata, fast): + return pyam.IamDataFrame(pd.concat([strdata, fdata], axis=1), fast=fast) + +def profile(max=5): + data = {'N': [], 'time': [], 'type': []} + for N in [int(10**n) for n in np.arange(1, 8, step=0.5)]: + print(N) + for type in ['slow', 'fast']: + try: + strdata = pd.DataFrame(gen_str_iamc(N, k=5), columns=pyam.IAMC_IDX) + fdata = pd.DataFrame(gen_float(N), columns=YEARS) + time, df = gen_frame(strdata, fdata, fast=type == 'fast') + print(N, type, time) + data['N'].append(N) + data['type'].append(type) + data['time'].append(time) + except: + continue + return pd.DataFrame.from_dict(data) + +if __name__ == '__main__': + import matplotlib.pyplot as plt + import seaborn as sns + df = profile(max=8) + fig, ax = plt.subplots() + sns.lineplot(data=df, x='N', y='time', hue='type', ax=ax) + ax.set(xscale='log') + fig.savefig('profile_init.png') + df.to_csv('profile_init.csv') \ No newline at end of file diff --git a/tests/test_core.py b/tests/test_core.py index aa0064cc5..1a3eec4ce 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -72,6 +72,10 @@ def test_init_from_iamdf(test_df_year): assert df.scenario == ["scen_b", "scen_bar"] assert test_df_year.scenario == ["scen_b", "scen_foo"] +def test_init_fast(test_df_year): + obs = IamDataFrame(test_df_year, fast=True) + exp = IamDataFrame(test_df_year) + assert_iamframe_equal(obs, exp) def test_init_from_iamdf_raises(test_df_year): # casting an IamDataFrame instance again with extra args fails From 2dfc141a409937714b06b21dddc7ccc0b76f8c03 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Fri, 17 Feb 2023 10:03:57 +0100 Subject: [PATCH 02/12] added fast to filereading with tests and profiling --- pyam/core.py | 2 +- pyam/utils.py | 42 +++++++++++++++++++++++++++++------------- tests/profile_init.py | 36 +++++++++++++++++++++++++++++------- tests/test_io.py | 7 ++++--- 4 files changed, 63 insertions(+), 24 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index d46ae89ee..231166854 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -159,7 +159,7 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs) if not data.is_file(): raise FileNotFoundError(f"No such file: 
'{data}'") logger.info(f"Reading file {data}") - _data = read_file(data, index=index, **kwargs) + _data = read_file(data, index=index, fast=fast, **kwargs) # cast data from pandas elif isinstance(data, pd.DataFrame) or isinstance(data, pd.Series): diff --git a/pyam/utils.py b/pyam/utils.py index 0f4ce57db..e1894c861 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -169,16 +169,18 @@ def is_empty(name, s): return False empty_cols = [c for c in df.columns if is_empty(c, df[c])] - return df.drop(columns=empty_cols).dropna(axis=0, how="all") + df.drop(columns=empty_cols, inplace=True) + df.dropna(axis=0, how="all", inplace=True) + return df -def read_file(path, *args, **kwargs): +def read_file(path, fast=False, *args, **kwargs): """Read data from a file""" # extract kwargs that are intended for `format_data` format_kwargs = dict(index=kwargs.pop("index")) for c in [i for i in IAMC_IDX + ["year", "time", "value"] if i in kwargs]: format_kwargs[c] = kwargs.pop(c) - return format_data(read_pandas(path, *args, **kwargs), **format_kwargs) + return format_data(read_pandas(path, *args, **kwargs), fast=fast, **format_kwargs) def intuit_column_groups(df, index): @@ -212,27 +214,41 @@ def intuit_column_groups(df, index): def fast_format_data(df, index=DEFAULT_META_INDEX): if not isinstance(df, pd.DataFrame): raise TypeError('Fast format only works if provided a pd.DataFrame') + + # all lower case + str_cols = [c for c in df.columns if isstr(c)] + df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True) + + if "notes" in df.columns: # this came from the database + logger.info("Ignoring notes column in dataframe") + df.drop(columns="notes", inplace=True) + col = df.columns[0] # first column has database copyright notice + df = df[~df[col].str.contains("database", case=False)] + col_diff = set(IAMC_IDX) - set(df.columns) if col_diff: raise ValueError(f'Missing required columns: {col_diff}') - - if "value" not in df.columns: - extra_cols, time_col, melt_cols = intuit_column_groups(df, index) + + extra_cols, time_col, melt_cols = intuit_column_groups(df, index) + # build idx in expected order with IAMC_IDX first + idx = IAMC_IDX + list(set(index + extra_cols)- set(IAMC_IDX)) + if "value" not in df.columns: df = pd.melt( df, - id_vars=index + REQUIRED_COLS + extra_cols, + id_vars=idx, var_name=time_col, value_vars=melt_cols, value_name="value", ) - # cast to pd.Series, check for duplicates - idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols - df.set_index(idx_cols, inplace=True) - df = df.value + df.dropna(inplace=True, subset=["value"]) + df.loc[df.unit.isnull(), "unit"] = "" - df.sort_index(inplace=True) - return df, index, time_col, extra_cols + # cast to pd.Series and return + idx_cols = idx + [time_col] + df.set_index(idx_cols, inplace=True) + df.sort_index(inplace=True) # TODO: not sure this is needed + return df.value, index, time_col, extra_cols def format_data(df, index, fast=False, **kwargs): """Convert a pandas.Dataframe or pandas.Series to the required format""" diff --git a/tests/profile_init.py b/tests/profile_init.py index 34ad3973c..dc5d15482 100644 --- a/tests/profile_init.py +++ b/tests/profile_init.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd from functools import wraps +from pathlib import Path import time import pyam @@ -33,18 +34,19 @@ def gen_float(N, years=YEARS): return np.random.choice(range(10), size=(N,len(years), )) @timeit -def gen_frame(strdata, fdata, fast): - return pyam.IamDataFrame(pd.concat([strdata, fdata], axis=1), fast=fast) +def 
gen_frame(data, fast): + return pyam.IamDataFrame(data, fast=fast) def profile(max=5): data = {'N': [], 'time': [], 'type': []} - for N in [int(10**n) for n in np.arange(1, 8, step=0.5)]: + for N in [int(10**n) for n in np.arange(1, max, step=0.5)]: print(N) for type in ['slow', 'fast']: try: strdata = pd.DataFrame(gen_str_iamc(N, k=5), columns=pyam.IAMC_IDX) fdata = pd.DataFrame(gen_float(N), columns=YEARS) - time, df = gen_frame(strdata, fdata, fast=type == 'fast') + _data = pd.concat([strdata, fdata], axis=1) + time, df = gen_frame(_data, fast=type == 'fast') print(N, type, time) data['N'].append(N) data['type'].append(type) @@ -53,12 +55,32 @@ def profile(max=5): continue return pd.DataFrame.from_dict(data) -if __name__ == '__main__': +@timeit +def gen_frame_from_file(file, fast): + return pyam.IamDataFrame(file, fast=fast) + +def profile_file(fname): + data = {'N': [], 'time': [], 'type': []} + for type in ['slow', 'fast']: + time, df = gen_frame_from_file(fname, fast=type == 'fast') + data['N'].append(len(df)) + data['type'].append(type) + data['time'].append(time) + return pd.DataFrame.from_dict(data) + +def main(): import matplotlib.pyplot as plt import seaborn as sns - df = profile(max=8) + dfp = profile(max=6) + df6 = profile_file(fname=Path('./AR6_Scenarios_Database_World_v1.0.csv')) + df = pd.concat([dfp, df6]).reset_index() + df.to_csv('profile_init.csv') + print(df) fig, ax = plt.subplots() sns.lineplot(data=df, x='N', y='time', hue='type', ax=ax) ax.set(xscale='log') fig.savefig('profile_init.png') - df.to_csv('profile_init.csv') \ No newline at end of file + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tests/test_io.py b/tests/test_io.py index 56add9217..28174d7cc 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -225,15 +225,16 @@ def test_load_meta_empty(test_pd_df): exp = IamDataFrame(test_pd_df) assert_iamframe_equal(obs, exp) - -def test_load_ssp_database_downloaded_file(test_pd_df): +@pytest.mark.parametrize("fast", [True, False]) +def test_load_ssp_database_downloaded_file(test_pd_df, fast): exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas() file = TEST_DATA_DIR / "test_SSP_database_raw_download.xlsx" - obs_df = IamDataFrame(file) + obs_df = IamDataFrame(file, fast=fast) pd.testing.assert_frame_equal(obs_df.as_pandas(), exp) def test_load_rcp_database_downloaded_file(test_pd_df): + # RCP data not tested for fast at present because it requires additional processing exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas() file = TEST_DATA_DIR / "test_RCP_database_raw_download.xlsx" obs_df = IamDataFrame(file) From 6a58ab2ee6f14c92497c3cc89a8fad43ce95a361 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Fri, 17 Feb 2023 10:13:42 +0100 Subject: [PATCH 03/12] put ar6 data location with other files --- tests/profile_init.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/profile_init.py b/tests/profile_init.py index dc5d15482..4c299c523 100644 --- a/tests/profile_init.py +++ b/tests/profile_init.py @@ -69,10 +69,11 @@ def profile_file(fname): return pd.DataFrame.from_dict(data) def main(): + # requires downloading AR6 dataset and placing it in the data folder import matplotlib.pyplot as plt import seaborn as sns dfp = profile(max=6) - df6 = profile_file(fname=Path('./AR6_Scenarios_Database_World_v1.0.csv')) + df6 = profile_file(fname=Path('./data/AR6_Scenarios_Database_World_v1.0.csv')) df = pd.concat([dfp, df6]).reset_index() df.to_csv('profile_init.csv') print(df) From 
7d5a5968302d44e274eef556dd778544d3db2f20 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Fri, 17 Feb 2023 10:21:45 +0100 Subject: [PATCH 04/12] blacked files --- pyam/utils.py | 21 ++++++++------ tests/profile_init.py | 66 +++++++++++++++++++++++++++---------------- tests/test_core.py | 2 ++ tests/test_io.py | 1 + 4 files changed, 57 insertions(+), 33 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index e1894c861..89732970d 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -211,28 +211,29 @@ def intuit_column_groups(df, index): raise ValueError("Missing time domain") return extra_cols, time_col, melt_cols + def fast_format_data(df, index=DEFAULT_META_INDEX): if not isinstance(df, pd.DataFrame): - raise TypeError('Fast format only works if provided a pd.DataFrame') + raise TypeError("Fast format only works if provided a pd.DataFrame") # all lower case str_cols = [c for c in df.columns if isstr(c)] df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True) - + if "notes" in df.columns: # this came from the database logger.info("Ignoring notes column in dataframe") df.drop(columns="notes", inplace=True) col = df.columns[0] # first column has database copyright notice df = df[~df[col].str.contains("database", case=False)] - + col_diff = set(IAMC_IDX) - set(df.columns) if col_diff: - raise ValueError(f'Missing required columns: {col_diff}') - + raise ValueError(f"Missing required columns: {col_diff}") + extra_cols, time_col, melt_cols = intuit_column_groups(df, index) # build idx in expected order with IAMC_IDX first - idx = IAMC_IDX + list(set(index + extra_cols)- set(IAMC_IDX)) - if "value" not in df.columns: + idx = IAMC_IDX + list(set(index + extra_cols) - set(IAMC_IDX)) + if "value" not in df.columns: df = pd.melt( df, id_vars=idx, @@ -247,9 +248,10 @@ def fast_format_data(df, index=DEFAULT_META_INDEX): # cast to pd.Series and return idx_cols = idx + [time_col] df.set_index(idx_cols, inplace=True) - df.sort_index(inplace=True) # TODO: not sure this is needed + # df.sort_index(inplace=True) # TODO: not sure this is needed return df.value, index, time_col, extra_cols + def format_data(df, index, fast=False, **kwargs): """Convert a pandas.Dataframe or pandas.Series to the required format""" if fast: @@ -440,7 +442,8 @@ def convert_r_columns(c): if df.empty: logger.warning("Formatted data is empty!") - return df.sort_index(), index, time_col, extra_cols + # return df.sort_index(), index, time_col, extra_cols + return df, index, time_col, extra_cols def sort_data(data, cols): diff --git a/tests/profile_init.py b/tests/profile_init.py index 4c299c523..183a8fc61 100644 --- a/tests/profile_init.py +++ b/tests/profile_init.py @@ -5,12 +5,11 @@ from pathlib import Path import time -import pyam +import pyam YEARS = range(2010, 2101, 10) - def timeit(func): @wraps(func) def timeit_wrapper(*args, **kwargs): @@ -19,69 +18,88 @@ def timeit_wrapper(*args, **kwargs): end_time = time.perf_counter() total_time = end_time - start_time return total_time, result + return timeit_wrapper + def join(a): - return ''.join(a) + return "".join(a) + def gen_str(N, k=1): - return np.random.choice(list(string.ascii_lowercase), size=(k, N,len(pyam.IAMC_IDX))) + return np.random.choice( + list(string.ascii_lowercase), size=(k, N, len(pyam.IAMC_IDX)) + ) + def gen_str_iamc(N, k=1): return np.apply_along_axis(join, 0, gen_str(N, k)) + def gen_float(N, years=YEARS): - return np.random.choice(range(10), size=(N,len(years), )) + return np.random.choice( + range(10), + size=( + N, + len(years), + ), + ) 
+ @timeit def gen_frame(data, fast): return pyam.IamDataFrame(data, fast=fast) + def profile(max=5): - data = {'N': [], 'time': [], 'type': []} + data = {"N": [], "time": [], "type": []} for N in [int(10**n) for n in np.arange(1, max, step=0.5)]: print(N) - for type in ['slow', 'fast']: + for type in ["slow", "fast"]: try: strdata = pd.DataFrame(gen_str_iamc(N, k=5), columns=pyam.IAMC_IDX) fdata = pd.DataFrame(gen_float(N), columns=YEARS) _data = pd.concat([strdata, fdata], axis=1) - time, df = gen_frame(_data, fast=type == 'fast') + time, df = gen_frame(_data, fast=type == "fast") print(N, type, time) - data['N'].append(N) - data['type'].append(type) - data['time'].append(time) + data["N"].append(N) + data["type"].append(type) + data["time"].append(time) except: continue return pd.DataFrame.from_dict(data) + @timeit def gen_frame_from_file(file, fast): return pyam.IamDataFrame(file, fast=fast) + def profile_file(fname): - data = {'N': [], 'time': [], 'type': []} - for type in ['slow', 'fast']: - time, df = gen_frame_from_file(fname, fast=type == 'fast') - data['N'].append(len(df)) - data['type'].append(type) - data['time'].append(time) + data = {"N": [], "time": [], "type": []} + for type in ["slow", "fast"]: + time, df = gen_frame_from_file(fname, fast=type == "fast") + data["N"].append(len(df)) + data["type"].append(type) + data["time"].append(time) return pd.DataFrame.from_dict(data) + def main(): # requires downloading AR6 dataset and placing it in the data folder import matplotlib.pyplot as plt import seaborn as sns + dfp = profile(max=6) - df6 = profile_file(fname=Path('./data/AR6_Scenarios_Database_World_v1.0.csv')) + df6 = profile_file(fname=Path("./data/AR6_Scenarios_Database_World_v1.0.csv")) df = pd.concat([dfp, df6]).reset_index() - df.to_csv('profile_init.csv') + df.to_csv("profile_init.csv") print(df) fig, ax = plt.subplots() - sns.lineplot(data=df, x='N', y='time', hue='type', ax=ax) - ax.set(xscale='log') - fig.savefig('profile_init.png') + sns.lineplot(data=df, x="N", y="time", hue="type", ax=ax) + ax.set(xscale="log") + fig.savefig("profile_init.png") -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/tests/test_core.py b/tests/test_core.py index 1a3eec4ce..62e9ade97 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -72,11 +72,13 @@ def test_init_from_iamdf(test_df_year): assert df.scenario == ["scen_b", "scen_bar"] assert test_df_year.scenario == ["scen_b", "scen_foo"] + def test_init_fast(test_df_year): obs = IamDataFrame(test_df_year, fast=True) exp = IamDataFrame(test_df_year) assert_iamframe_equal(obs, exp) + def test_init_from_iamdf_raises(test_df_year): # casting an IamDataFrame instance again with extra args fails match = "Invalid arguments for initializing from IamDataFrame: {'model': 'foo'}" diff --git a/tests/test_io.py b/tests/test_io.py index 28174d7cc..94a384a10 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -225,6 +225,7 @@ def test_load_meta_empty(test_pd_df): exp = IamDataFrame(test_pd_df) assert_iamframe_equal(obs, exp) + @pytest.mark.parametrize("fast", [True, False]) def test_load_ssp_database_downloaded_file(test_pd_df, fast): exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas() From 45f31613d28e1649d26ba84fbabb92d1c18a980e Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Fri, 17 Feb 2023 10:27:02 +0100 Subject: [PATCH 05/12] uncomment sorts --- pyam/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyam/utils.py 
b/pyam/utils.py index 89732970d..1db1853c1 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -248,7 +248,7 @@ def fast_format_data(df, index=DEFAULT_META_INDEX): # cast to pd.Series and return idx_cols = idx + [time_col] df.set_index(idx_cols, inplace=True) - # df.sort_index(inplace=True) # TODO: not sure this is needed + df.sort_index(inplace=True) # TODO: not sure this is needed return df.value, index, time_col, extra_cols @@ -442,8 +442,7 @@ def convert_r_columns(c): if df.empty: logger.warning("Formatted data is empty!") - # return df.sort_index(), index, time_col, extra_cols - return df, index, time_col, extra_cols + return df.sort_index(), index, time_col, extra_cols def sort_data(data, cols): From 7046b86392deabfde6aa52fe4c7791448824db27 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Fri, 17 Feb 2023 10:37:19 +0100 Subject: [PATCH 06/12] moved profile_init to profile module --- {tests => profile}/profile_init.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {tests => profile}/profile_init.py (100%) diff --git a/tests/profile_init.py b/profile/profile_init.py similarity index 100% rename from tests/profile_init.py rename to profile/profile_init.py From 873ee463889c84a88fd44a316cef3f73de7996a1 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Sat, 18 Feb 2023 12:22:47 +0100 Subject: [PATCH 07/12] refactor fast format to support series and basic dataframes. also supports file reading --- pyam/core.py | 9 ++-- pyam/utils.py | 121 +++++++++++++++++++++++++++++--------------------- 2 files changed, 76 insertions(+), 54 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 231166854..a016627de 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -28,6 +28,7 @@ write_sheet, read_file, read_pandas, + fast_format_data, format_data, merge_meta, find_depth, @@ -162,9 +163,11 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs) _data = read_file(data, index=index, fast=fast, **kwargs) # cast data from pandas - elif isinstance(data, pd.DataFrame) or isinstance(data, pd.Series): - _data = data if fast else data.copy() - _data = format_data(_data, index=index, fast=fast, **kwargs) + elif isinstance(data, (pd.DataFrame, pd.Series)): + if fast: + _data = fast_format_data(data, index=index, **kwargs) + else: + _data = format_data(data.copy(), index=index, **kwargs) # unsupported `data` args elif islistable(data): diff --git a/pyam/utils.py b/pyam/utils.py index 1db1853c1..9b48e499a 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -180,10 +180,32 @@ def read_file(path, fast=False, *args, **kwargs): format_kwargs = dict(index=kwargs.pop("index")) for c in [i for i in IAMC_IDX + ["year", "time", "value"] if i in kwargs]: format_kwargs[c] = kwargs.pop(c) - return format_data(read_pandas(path, *args, **kwargs), fast=fast, **format_kwargs) + data = read_pandas(path, *args, **kwargs) + if fast: + # determine non-data columns + extra_cols, time_col, data_cols = intuit_column_groups(data) + # format columns for fast reading + data = data.rename(columns={c: str(c).lower() for c in extra_cols}) + extra_cols = [str(c).lower() for c in extra_cols] + for c in format_kwargs['index']: + extra_cols.remove(c) + # support databases + if 'notes' in data.columns: + data = format_from_database(data) + extra_cols.remove('notes') + # force integer year columns + if time_col == 'year': + data = data.rename(columns={c: int(c) for c in data_cols}) + # support file data in long format + if 'value' in extra_cols: + extra_cols.remove('value') + idx = IAMC_IDX + 
list(set(format_kwargs['index'] + extra_cols) - set(IAMC_IDX)) + return fast_format_data(data.set_index(idx), **format_kwargs) + else: + return format_data(data, **format_kwargs) -def intuit_column_groups(df, index): +def intuit_column_groups(df, index=[]): cols = [c for c in df.columns if c not in index + REQUIRED_COLS] year_cols, time_cols, extra_cols = [], [], [] for i in cols: @@ -213,49 +235,56 @@ def intuit_column_groups(df, index): def fast_format_data(df, index=DEFAULT_META_INDEX): - if not isinstance(df, pd.DataFrame): - raise TypeError("Fast format only works if provided a pd.DataFrame") + """A faster formatting funciton with more stringent dataframe requirements - # all lower case - str_cols = [c for c in df.columns if isstr(c)] - df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True) - - if "notes" in df.columns: # this came from the database - logger.info("Ignoring notes column in dataframe") - df.drop(columns="notes", inplace=True) - col = df.columns[0] # first column has database copyright notice - df = df[~df[col].str.contains("database", case=False)] - - col_diff = set(IAMC_IDX) - set(df.columns) - if col_diff: - raise ValueError(f"Missing required columns: {col_diff}") - - extra_cols, time_col, melt_cols = intuit_column_groups(df, index) - # build idx in expected order with IAMC_IDX first + Requirements: + 1. either a pd.Series or pd.DataFrame with a pyam-compatible MultiIndex + 2. if a pd.DataFrame, all columns as either integer year or datetime + 3. no null values + """ + if not isinstance(df, (pd.DataFrame, pd.Series)): + raise TypeError("Fast format only works if provided a pd.DataFrame or pd.Series") + if set(IAMC_IDX) - set(df.index.names): + raise ValueError( + f"Missing required index levels: {set(IAMC_IDX) - set(df.index.names)}" + ) + + # index in expected order + extra_cols = list(set(df.index.names).difference((set(IAMC_IDX) | set(index)))) + if len(set(['time', 'year']) - set(extra_cols)) == 0: + raise ValueError('Can not have time and year as indicies') idx = IAMC_IDX + list(set(index + extra_cols) - set(IAMC_IDX)) - if "value" not in df.columns: - df = pd.melt( - df, - id_vars=idx, - var_name=time_col, - value_vars=melt_cols, - value_name="value", - ) - - df.dropna(inplace=True, subset=["value"]) - df.loc[df.unit.isnull(), "unit"] = "" - - # cast to pd.Series and return - idx_cols = idx + [time_col] - df.set_index(idx_cols, inplace=True) - df.sort_index(inplace=True) # TODO: not sure this is needed - return df.value, index, time_col, extra_cols + df = df.reorder_levels(idx) + # migrate dataframe to series + if isinstance(df, pd.DataFrame): + _, time_col, _ = intuit_column_groups(df, index=index) + df = df.rename_axis(columns=time_col) + df = df.stack() + else: + time_col = list(set(['time', 'year']) & set(extra_cols))[0] + extra_cols = list(set(extra_cols) - set(['time', 'year'])) + + df.name = 'value' + + return df, index, time_col, extra_cols + +def format_from_database(df): + logger.info("Ignoring notes column in dataframe") + df.drop(columns="notes", inplace=True) + col = df.columns[0] # first column has database copyright notice + df = df[~df[col].str.contains("database", case=False)] + if "scenario" in df.columns and "model" not in df.columns: + # model and scenario are jammed together in RCP data + scen = df["scenario"] + df.loc[:, "model"] = scen.apply(lambda s: s.split("-")[0].strip()) + df.loc[:, "scenario"] = scen.apply( + lambda s: "-".join(s.split("-")[1:]).strip() + ) + return df -def format_data(df, index, fast=False, 
**kwargs): +def format_data(df, index, **kwargs): """Convert a pandas.Dataframe or pandas.Series to the required format""" - if fast: - return fast_format_data(df, index) if isinstance(df, pd.Series): df.name = df.name or "value" @@ -315,17 +344,7 @@ def convert_r_columns(c): df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True) if "notes" in df.columns: # this came from the database - logger.info("Ignoring notes column in dataframe") - df.drop(columns="notes", inplace=True) - col = df.columns[0] # first column has database copyright notice - df = df[~df[col].str.contains("database", case=False)] - if "scenario" in df.columns and "model" not in df.columns: - # model and scenario are jammed together in RCP data - scen = df["scenario"] - df.loc[:, "model"] = scen.apply(lambda s: s.split("-")[0].strip()) - df.loc[:, "scenario"] = scen.apply( - lambda s: "-".join(s.split("-")[1:]).strip() - ) + df = format_from_database(df) # reset the index if meaningful entries are included there if not list(df.index.names) == [None]: From fd081ee577541104dda0ca08375aeedd02c90233 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Sun, 19 Feb 2023 13:30:37 +0100 Subject: [PATCH 08/12] blacked --- pyam/utils.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index 9b48e499a..17a0dfe0b 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -187,19 +187,19 @@ def read_file(path, fast=False, *args, **kwargs): # format columns for fast reading data = data.rename(columns={c: str(c).lower() for c in extra_cols}) extra_cols = [str(c).lower() for c in extra_cols] - for c in format_kwargs['index']: + for c in format_kwargs["index"]: extra_cols.remove(c) # support databases - if 'notes' in data.columns: + if "notes" in data.columns: data = format_from_database(data) - extra_cols.remove('notes') + extra_cols.remove("notes") # force integer year columns - if time_col == 'year': - data = data.rename(columns={c: int(c) for c in data_cols}) + if time_col == "year": + data = data.rename(columns={c: int(c) for c in data_cols}) # support file data in long format - if 'value' in extra_cols: - extra_cols.remove('value') - idx = IAMC_IDX + list(set(format_kwargs['index'] + extra_cols) - set(IAMC_IDX)) + if "value" in extra_cols: + extra_cols.remove("value") + idx = IAMC_IDX + list(set(format_kwargs["index"] + extra_cols) - set(IAMC_IDX)) return fast_format_data(data.set_index(idx), **format_kwargs) else: return format_data(data, **format_kwargs) @@ -239,20 +239,22 @@ def fast_format_data(df, index=DEFAULT_META_INDEX): Requirements: 1. either a pd.Series or pd.DataFrame with a pyam-compatible MultiIndex - 2. if a pd.DataFrame, all columns as either integer year or datetime + 2. if a pd.DataFrame, all columns as either integer year or datetime 3. 
no null values """ if not isinstance(df, (pd.DataFrame, pd.Series)): - raise TypeError("Fast format only works if provided a pd.DataFrame or pd.Series") + raise TypeError( + "Fast format only works if provided a pd.DataFrame or pd.Series" + ) if set(IAMC_IDX) - set(df.index.names): raise ValueError( f"Missing required index levels: {set(IAMC_IDX) - set(df.index.names)}" - ) - + ) + # index in expected order extra_cols = list(set(df.index.names).difference((set(IAMC_IDX) | set(index)))) - if len(set(['time', 'year']) - set(extra_cols)) == 0: - raise ValueError('Can not have time and year as indicies') + if len(set(["time", "year"]) - set(extra_cols)) == 0: + raise ValueError("Can not have time and year as indicies") idx = IAMC_IDX + list(set(index + extra_cols) - set(IAMC_IDX)) df = df.reorder_levels(idx) @@ -262,13 +264,14 @@ def fast_format_data(df, index=DEFAULT_META_INDEX): df = df.rename_axis(columns=time_col) df = df.stack() else: - time_col = list(set(['time', 'year']) & set(extra_cols))[0] - extra_cols = list(set(extra_cols) - set(['time', 'year'])) + time_col = list(set(["time", "year"]) & set(extra_cols))[0] + extra_cols = list(set(extra_cols) - set(["time", "year"])) - df.name = 'value' + df.name = "value" return df, index, time_col, extra_cols + def format_from_database(df): logger.info("Ignoring notes column in dataframe") df.drop(columns="notes", inplace=True) @@ -278,11 +281,10 @@ def format_from_database(df): # model and scenario are jammed together in RCP data scen = df["scenario"] df.loc[:, "model"] = scen.apply(lambda s: s.split("-")[0].strip()) - df.loc[:, "scenario"] = scen.apply( - lambda s: "-".join(s.split("-")[1:]).strip() - ) + df.loc[:, "scenario"] = scen.apply(lambda s: "-".join(s.split("-")[1:]).strip()) return df + def format_data(df, index, **kwargs): """Convert a pandas.Dataframe or pandas.Series to the required format""" From 2b06af78162f1c8dbd660bf98f5ec505a3a7b722 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Sun, 19 Feb 2023 14:38:52 +0100 Subject: [PATCH 09/12] update profiling for new structure --- profile/profile_init.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/profile/profile_init.py b/profile/profile_init.py index 183a8fc61..9cd6df864 100644 --- a/profile/profile_init.py +++ b/profile/profile_init.py @@ -52,7 +52,7 @@ def gen_frame(data, fast): def profile(max=5): - data = {"N": [], "time": [], "type": []} + data = {"N": [], "time": [], "type": [], "label": []} for N in [int(10**n) for n in np.arange(1, max, step=0.5)]: print(N) for type in ["slow", "fast"]: @@ -60,11 +60,15 @@ def profile(max=5): strdata = pd.DataFrame(gen_str_iamc(N, k=5), columns=pyam.IAMC_IDX) fdata = pd.DataFrame(gen_float(N), columns=YEARS) _data = pd.concat([strdata, fdata], axis=1) - time, df = gen_frame(_data, fast=type == "fast") + fast = type == "fast" + if fast: + _data = _data.set_index(pyam.IAMC_IDX) + time, df = gen_frame(_data, fast=fast) print(N, type, time) data["N"].append(N) data["type"].append(type) data["time"].append(time) + data["label"].append("autogenerated") except: continue return pd.DataFrame.from_dict(data) @@ -76,12 +80,13 @@ def gen_frame_from_file(file, fast): def profile_file(fname): - data = {"N": [], "time": [], "type": []} + data = {"N": [], "time": [], "type": [], "label": []} for type in ["slow", "fast"]: time, df = gen_frame_from_file(fname, fast=type == "fast") data["N"].append(len(df)) data["type"].append(type) data["time"].append(time) + data["label"].append("from file") return 
pd.DataFrame.from_dict(data) @@ -90,7 +95,7 @@ def main(): import matplotlib.pyplot as plt import seaborn as sns - dfp = profile(max=6) + dfp = profile(max=7) df6 = profile_file(fname=Path("./data/AR6_Scenarios_Database_World_v1.0.csv")) df = pd.concat([dfp, df6]).reset_index() df.to_csv("profile_init.csv") From 58c9822e8e4c2aa71b42e14cc091fe28973a019b Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Mon, 20 Feb 2023 08:38:56 +0100 Subject: [PATCH 10/12] refactor for using stack instead of melt --- profile/profile_init.py | 4 +-- pyam/utils.py | 59 ++++++++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/profile/profile_init.py b/profile/profile_init.py index 9cd6df864..5741a4ecf 100644 --- a/profile/profile_init.py +++ b/profile/profile_init.py @@ -48,6 +48,8 @@ def gen_float(N, years=YEARS): @timeit def gen_frame(data, fast): + if fast: + data = data.set_index(pyam.IAMC_IDX) return pyam.IamDataFrame(data, fast=fast) @@ -61,8 +63,6 @@ def profile(max=5): fdata = pd.DataFrame(gen_float(N), columns=YEARS) _data = pd.concat([strdata, fdata], axis=1) fast = type == "fast" - if fast: - _data = _data.set_index(pyam.IAMC_IDX) time, df = gen_frame(_data, fast=fast) print(N, type, time) data["N"].append(N) diff --git a/pyam/utils.py b/pyam/utils.py index 17a0dfe0b..6ed6b74a1 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -369,6 +369,9 @@ def convert_r_columns(c): if missing_required_col: raise ValueError(f"Missing required columns: {missing_required_col}") + # replace missing units by an empty string for user-friendly filtering + df.loc[df.unit.isnull(), "unit"] = "" + # check whether data in wide format (IAMC) or long format (`value` column) if "value" in df.columns: # check if time column is given as `year` (int) or `time` (datetime) @@ -383,6 +386,7 @@ def convert_r_columns(c): for c in df.columns if c not in index + REQUIRED_COLS + [time_col, "value"] ] + wide = False else: # if in wide format, check if columns are years (int) or datetime cols = [c for c in df.columns if c not in index + REQUIRED_COLS] @@ -411,19 +415,40 @@ def convert_r_columns(c): melt_cols = sorted(year_cols) + sorted(time_cols) if not melt_cols: raise ValueError("Missing time domain") + wide = True + + # verify that there are no nan's left (in columns), and transform data + idx = index + REQUIRED_COLS + extra_cols + null_rows = df[idx].isnull().T.any() + if null_rows.any(): + _df = df[idx] + cols = ", ".join(_df.columns[_df.isnull().any().values]) + raise_data_error( + f"Empty cells in `data` (columns: '{cols}')", _df.loc[null_rows] + ) + del null_rows - # melt the dataframe - df = pd.melt( - df, - id_vars=index + REQUIRED_COLS + extra_cols, - var_name=time_col, - value_vars=melt_cols, - value_name="value", + if wide: + df = ( + df + .set_index(idx) + [melt_cols] + .rename_axis(columns=time_col) + .stack() + ) + df.name = "value" + else: + df = ( + df + .set_index(idx + [time_col]) + ['value'] ) # cast value column to numeric and drop nan + print('foo', type(df)) + print(df) try: - df["value"] = pd.to_numeric(df["value"]) + df = pd.to_numeric(df) except ValueError as e: # get the row number where the error happened row_nr_regex = re.compile(r"(?<=at position )\d+") @@ -432,23 +457,7 @@ def convert_r_columns(c): short_error = short_error_regex.search(str(e)).group() raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]]) - df.dropna(inplace=True, subset=["value"]) - - # replace missing units by an empty string for user-friendly filtering - 
df.loc[df.unit.isnull(), "unit"] = "" - - # verify that there are no nan's left (in columns) - null_rows = df.isnull().T.any() - if null_rows.any(): - cols = ", ".join(df.columns[df.isnull().any().values]) - raise_data_error( - f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] - ) - del null_rows - - # cast to pd.Series, check for duplicates - idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols - df = df.set_index(idx_cols).value + df = df.dropna() # format the time-column _time = [to_time(i) for i in get_index_levels(df.index, time_col)] From 22a6fa5eaba32d3f0a76245f0ca43c191313921c Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Mon, 20 Feb 2023 09:06:29 +0100 Subject: [PATCH 11/12] bump workflow to do mpl tests on 3.8 since something upstream broke defaults --- .github/workflows/pytest.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index f49c5c051..9da1d7259 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -41,16 +41,16 @@ jobs: run: pip install .[tests,optional_plotting,optional_io_formats,tutorials] - name: Test with pytest - if: ${{ matrix.python-version != '3.9' }} + if: ${{ matrix.python-version != '3.8' }} run: pytest tests - # only execute Matplotlib tests on latest Python version + # only execute Matplotlib tests on a known stable Python + deps version - name: Test with pytest including Matplotlib & Codecov - if: ${{ matrix.python-version == '3.9' }} + if: ${{ matrix.python-version == '3.8' }} run: pytest tests --mpl --cov=./ --cov-report=xml - name: Upload coverage report to Codecov - if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.9' }} + if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.8' }} uses: codecov/codecov-action@v1 with: file: ./coverage.xml From 80cf8c585ca6eabd5bd8e55be036fc86070531bb Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Mon, 20 Feb 2023 14:58:36 +0100 Subject: [PATCH 12/12] errant prints --- pyam/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index 6ed6b74a1..b47c42fcd 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -445,8 +445,6 @@ def convert_r_columns(c): ) # cast value column to numeric and drop nan - print('foo', type(df)) - print(df) try: df = pd.to_numeric(df) except ValueError as e:
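
A minimal usage sketch of the fast=True path introduced in this series, assuming pyam with these patches applied. The example data, the set_index call, and the equality check via pyam.testing.assert_iamframe_equal are illustrative assumptions based on the requirements documented in fast_format_data (a pyam-compatible MultiIndex, year/datetime columns only, no null values); they are not part of the patches themselves.

    import pandas as pd
    import pyam
    from pyam.testing import assert_iamframe_equal

    # wide-format data: IAMC id columns plus integer-year columns, no NaNs
    data = pd.DataFrame(
        [
            ["model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 1.0, 6.0],
            ["model_a", "scen_a", "World", "Primary Energy|Coal", "EJ/yr", 0.5, 3.0],
        ],
        columns=pyam.IAMC_IDX + [2005, 2010],
    )

    # default path accepts plain IAMC columns ...
    df_slow = pyam.IamDataFrame(data)

    # ... while the fast path expects the id columns as a MultiIndex,
    # leaving only year (or datetime) columns as data columns
    df_fast = pyam.IamDataFrame(data.set_index(pyam.IAMC_IDX), fast=True)

    # both constructors should yield equivalent frames
    assert_iamframe_equal(df_slow, df_fast)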