From 150e90dfd240dd4f34bd421cbb628427093ed88c Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Wed, 22 Feb 2023 01:32:12 +0100 Subject: [PATCH 1/6] Add fast-path to format data Co-authored-by: Matthew Gidden --- pyam/utils.py | 79 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index fd7574664..42a26cb6a 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -252,8 +252,14 @@ def _format_from_legacy_database(df): def _intuit_column_groups(df, index): """Check and categorise columns in dataframe""" + existing_cols = pd.Index(df.index.names).dropna() # skip unnamed columns + if isinstance(df, pd.Series): + existing_cols = existing_cols.union(["value"]) + elif isinstance(df, pd.DataFrame): + existing_cols = existing_cols.union(df.columns) + # check that there is no column in the timeseries data with reserved names - conflict_cols = [i for i in df.columns if i in ILLEGAL_COLS] + conflict_cols = [i for i in existing_cols if i in ILLEGAL_COLS] if conflict_cols: msg = f"Column name {conflict_cols} is illegal for timeseries data.\n" _args = ", ".join([f"{i}_1='{i}'" for i in conflict_cols]) @@ -261,32 +267,32 @@ def _intuit_column_groups(df, index): raise ValueError(msg) # check that index and required columns exist - missing_index = [c for c in index if c not in df.columns] + missing_index = [c for c in index if c not in existing_cols] if missing_index: raise ValueError(f"Missing index columns: {missing_index}") - missing_required_col = [c for c in REQUIRED_COLS if c not in df.columns] + missing_required_col = [c for c in REQUIRED_COLS if c not in existing_cols] if missing_required_col: raise ValueError(f"Missing required columns: {missing_required_col}") # check whether data in wide format (standard IAMC) or long format (`value` column) - if "value" in df.columns: + if "value" in existing_cols: # check if time column is given as `year` (int) or `time` (datetime) - if "year" in df.columns and "time" not in df.columns: + if "year" in existing_cols and "time" not in existing_cols: time_col = "year" - elif "time" in df.columns and "year" not in df.columns: + elif "time" in existing_cols and "year" not in existing_cols: time_col = "time" else: raise ValueError("Invalid time domain, must have either `year` or `time`!") extra_cols = [ c - for c in df.columns + for c in existing_cols if c not in index + REQUIRED_COLS + [time_col, "value"] ] data_cols = [] else: # if in wide format, check if columns are years (int) or datetime - cols = [c for c in df.columns if c not in index + REQUIRED_COLS] + cols = [c for c in existing_cols if c not in index + REQUIRED_COLS] year_cols, time_cols, extra_cols = [], [], [] for i in cols: # if the column name can be cast to integer, assume it's a year column @@ -344,29 +350,52 @@ def _format_data_to_series(df, index): def format_data(df, index, **kwargs): """Convert a pandas.Dataframe or pandas.Series to the required format""" - if isinstance(df, pd.Series): - if not df.name: - df = df.rename("value") - df = df.reset_index() - elif not list(df.index.names) == [None]: - # reset the index if meaningful entries are included there - df = df.reset_index() + if set(df.index.names) >= set(index) | set(REQUIRED_COLS) and not kwargs: + # Let's try to cut corners here, it's our fast-path + time_col, extra_cols, data_cols = _intuit_column_groups(df, index=index) + + if isinstance(df, pd.DataFrame): + extra_cols_not_in_index = [c for c in extra_cols if c in df.columns] + if extra_cols_not_in_index: + df = df.set_index(extra_cols_not_in_index, append=True) - df = _convert_r_columns(df) + if data_cols: + df = df[data_cols] + df = df.rename_axis(columns=time_col) + df = df.stack() + df = df.rename("value") + else: + df = df["value"] - if kwargs: - df = _knead_data(df, **kwargs) + df = df.reorder_levels(index + REQUIRED_COLS + extra_cols + [time_col]) - # cast all columns names to lower case - df.rename(columns={c: str(c).lower() for c in df.columns if isstr(c)}, inplace=True) + else: + + if isinstance(df, pd.Series): + if not df.name: + df = df.rename("value") + df = df.reset_index() + elif not list(df.index.names) == [None]: + # reset the index if meaningful entries are included there + df = df.reset_index() + + df = _convert_r_columns(df) + + if kwargs: + df = _knead_data(df, **kwargs) + + # all lower case + df.rename( + columns={c: str(c).lower() for c in df.columns if isstr(c)}, inplace=True + ) - if "notes" in df.columns: # this came from a legacy database (SSP or earlier) - df = _format_from_legacy_database(df) + if "notes" in df.columns: # this came from a legacy database (SSP or earlier) + df = _format_from_legacy_database(df) - # replace missing units by an empty string for user-friendly filtering - df = df.assign(unit=df["unit"].fillna("")) + # replace missing units by an empty string for user-friendly filtering + df = df.assign(unit=df["unit"].fillna("")) - df, time_col, extra_cols = _format_data_to_series(df, index) + df, time_col, extra_cols = _format_data_to_series(df, index) # cast value column to numeric try: From a011b9756e75591e0d32e51bf7d17a1299319a69 Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Wed, 22 Feb 2023 12:46:00 +0100 Subject: [PATCH 2/6] Add missing dropna and fix column order --- pyam/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index 42a26cb6a..f639cc3b1 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -367,10 +367,9 @@ def format_data(df, index, **kwargs): else: df = df["value"] - df = df.reorder_levels(index + REQUIRED_COLS + extra_cols + [time_col]) + df = df.reorder_levels(index + REQUIRED_COLS + [time_col] + extra_cols).dropna() else: - if isinstance(df, pd.Series): if not df.name: df = df.rename("value") From 3c1988660e16d01caae05802825ef718d5bedbf6 Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Mon, 27 Feb 2023 08:40:59 +0100 Subject: [PATCH 3/6] Style suggestion Black puts it into one line. No haggling. Co-authored-by: Matthew Gidden --- pyam/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index f639cc3b1..041ad7bb0 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -360,10 +360,7 @@ def format_data(df, index, **kwargs): df = df.set_index(extra_cols_not_in_index, append=True) if data_cols: - df = df[data_cols] - df = df.rename_axis(columns=time_col) - df = df.stack() - df = df.rename("value") + df = df[data_cols].rename_axis(columns=time_col).stack().rename("value") else: df = df["value"] From 3effa08be667ceab0529c01d968c5d5f98d36ea8 Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Mon, 27 Feb 2023 15:24:06 +0100 Subject: [PATCH 4/6] Apply suggestions Co-authored-by: Daniel Huppmann --- pyam/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyam/utils.py b/pyam/utils.py index 041ad7bb0..4f51311b4 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -350,8 +350,8 @@ def _format_data_to_series(df, index): def format_data(df, index, **kwargs): """Convert a pandas.Dataframe or pandas.Series to the required format""" + # Fast-pass if `df` has the index and required columns as a pd.MultiIndex if set(df.index.names) >= set(index) | set(REQUIRED_COLS) and not kwargs: - # Let's try to cut corners here, it's our fast-path time_col, extra_cols, data_cols = _intuit_column_groups(df, index=index) if isinstance(df, pd.DataFrame): From 4bc9185379bd752e8ad4aacab81594630a2e2468 Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Mon, 27 Feb 2023 15:39:05 +0100 Subject: [PATCH 5/6] Make choice to examine index levels explicit --- pyam/utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index 4f51311b4..721b4ccc4 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -249,10 +249,13 @@ def _format_from_legacy_database(df): return df -def _intuit_column_groups(df, index): +def _intuit_column_groups(df, index, include_index=False): """Check and categorise columns in dataframe""" - existing_cols = pd.Index(df.index.names).dropna() # skip unnamed columns + if include_index: + existing_cols = pd.Index(df.index.names) + else: + existing_cols = pd.Index([]) if isinstance(df, pd.Series): existing_cols = existing_cols.union(["value"]) elif isinstance(df, pd.DataFrame): @@ -352,7 +355,9 @@ def format_data(df, index, **kwargs): # Fast-pass if `df` has the index and required columns as a pd.MultiIndex if set(df.index.names) >= set(index) | set(REQUIRED_COLS) and not kwargs: - time_col, extra_cols, data_cols = _intuit_column_groups(df, index=index) + time_col, extra_cols, data_cols = _intuit_column_groups( + df, index=index, include_index=True + ) if isinstance(df, pd.DataFrame): extra_cols_not_in_index = [c for c in extra_cols if c in df.columns] From ab6b32e3cd0cc037f79ecb993ab1d20784179c26 Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Mon, 27 Feb 2023 20:26:30 +0100 Subject: [PATCH 6/6] Add entry to release notes --- RELEASE_NOTES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 21e2bb4d5..d997366ca 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,6 @@ # Next Release +- [#731](https://github.com/IAMconsortium/pyam/pull/731) Add fast-path to initialization for sufficient multiindex - [#732](https://github.com/IAMconsortium/pyam/pull/732) Fix a few typos in tutorials - [#730](https://github.com/IAMconsortium/pyam/pull/730) Refactor initialization code - [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization