From ac80d408def037617688afa59216bed17d02e995 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 20 Feb 2023 09:51:36 +0100 Subject: [PATCH 1/9] Refactor `format_data()` to use `stack` --- pyam/utils.py | 64 +++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index d1f9de068..eefef73df 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -274,8 +274,9 @@ def convert_r_columns(c): if missing_required_col: raise ValueError(f"Missing required columns: {missing_required_col}") - # check whether data in wide format (IAMC) or long format (`value` column) + # check whether data in wide format (standard IAMC) or long format (`value` column) if "value" in df.columns: + # check if time column is given as `year` (int) or `time` (datetime) if "year" in df.columns and "time" not in df.columns: time_col = "year" @@ -288,7 +289,13 @@ def convert_r_columns(c): for c in df.columns if c not in index + REQUIRED_COLS + [time_col, "value"] ] + + # cast to pd.Series + idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols + df = df.set_index(idx_cols).value + else: + # if in wide format, check if columns are years (int) or datetime cols = [c for c in df.columns if c not in index + REQUIRED_COLS] year_cols, time_cols, extra_cols = [], [], [] @@ -317,43 +324,34 @@ def convert_r_columns(c): if not melt_cols: raise ValueError("Missing time domain") - # melt the dataframe - df = pd.melt( - df, - id_vars=index + REQUIRED_COLS + extra_cols, - var_name=time_col, - value_vars=melt_cols, - value_name="value", - ) + # replace missing units by an empty string for user-friendly filtering + df.loc[df.unit.isnull(), "unit"] = "" + + # cast to long format, set + df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True) + df = df.stack(dropna=True) + df.name = "value" + df.index.names = df.index.names[:-1] + [time_col] # cast value column to numeric and drop nan - try: - df["value"] = pd.to_numeric(df["value"]) - except ValueError as e: +# try: +# df["value"] = pd.to_numeric(df["value"]) +# except ValueError as e: # get the row number where the error happened - row_nr_regex = re.compile(r"(?<=at position )\d+") - row_nr = int(row_nr_regex.search(str(e)).group()) - short_error_regex = re.compile(r".*(?= at position \d*)") - short_error = short_error_regex.search(str(e)).group() - raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]]) - - df.dropna(inplace=True, subset=["value"]) - - # replace missing units by an empty string for user-friendly filtering - df.loc[df.unit.isnull(), "unit"] = "" +# row_nr_regex = re.compile(r"(?<=at position )\d+") +# row_nr = int(row_nr_regex.search(str(e)).group()) +# short_error_regex = re.compile(r".*(?= at position \d*)") +# short_error = short_error_regex.search(str(e)).group() +# raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]]) # verify that there are no nan's left (in columns) - null_rows = df.isnull().T.any() - if null_rows.any(): - cols = ", ".join(df.columns[df.isnull().any().values]) - raise_data_error( - f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] - ) - del null_rows - - # cast to pd.Series, check for duplicates - idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols - df = df.set_index(idx_cols).value +# null_rows = df.isnull().T.any() +# if null_rows.any(): +# cols = ", ".join(df.columns[df.isnull().any().values]) +# raise_data_error( +# f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] +# ) +# del null_rows # format the time-column _time = [to_time(i) for i in get_index_levels(df.index, time_col)] From 25de43cbca22cb0ec8585aa1382ea60b2a95e9c4 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 21 Feb 2023 23:54:37 +0100 Subject: [PATCH 2/9] Fix validation steps --- pyam/utils.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index eefef73df..728b3f4cb 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -293,6 +293,7 @@ def convert_r_columns(c): # cast to pd.Series idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols df = df.set_index(idx_cols).value + df.dropna(inplace=True) else: @@ -334,24 +335,31 @@ def convert_r_columns(c): df.index.names = df.index.names[:-1] + [time_col] # cast value column to numeric and drop nan -# try: -# df["value"] = pd.to_numeric(df["value"]) -# except ValueError as e: + try: + df = pd.to_numeric(df) + except ValueError as e: # get the row number where the error happened -# row_nr_regex = re.compile(r"(?<=at position )\d+") -# row_nr = int(row_nr_regex.search(str(e)).group()) -# short_error_regex = re.compile(r".*(?= at position \d*)") -# short_error = short_error_regex.search(str(e)).group() -# raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]]) - - # verify that there are no nan's left (in columns) -# null_rows = df.isnull().T.any() -# if null_rows.any(): -# cols = ", ".join(df.columns[df.isnull().any().values]) -# raise_data_error( -# f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] -# ) -# del null_rows + row_nr_regex = re.compile(r"(?<=at position )\d+") + row_nr = int(row_nr_regex.search(str(e)).group()) + short_error_regex = re.compile(r".*(?= at position \d*)") + short_error = short_error_regex.search(str(e)).group() + raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]]) + + # verify that there are no nan's in the index + null_rows = np.zeros(len(df), dtype=bool) + null_cols = [] + for _name, _codes in zip(df.index.names, df.index.codes): + _null_fields = [i == -1 for i in _codes] + if any(_null_fields): + null_rows = np.logical_or(null_rows, _null_fields) + null_cols.append(_name) + + if null_cols: + cols = ", ".join(null_cols) + raise_data_error( + f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] + ) + del null_rows # format the time-column _time = [to_time(i) for i in get_index_levels(df.index, time_col)] From f371e076c5118e2f6a63bc71550ee0ab3066dbb3 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 21 Feb 2023 23:54:58 +0100 Subject: [PATCH 3/9] Add match to testing for nan in data index --- tests/test_core.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index aa0064cc5..f98ff52f0 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -102,7 +102,6 @@ def test_init_df_with_illegal_values_raises(test_pd_df, illegal_value): f'.*string "{illegal_value}" in `data`:' r"(\n.*){2}model_a.*scen_a.*World.*Primary Energy.*EJ/yr.*2005" ) - with pytest.raises(ValueError, match=msg): IamDataFrame(test_pd_df) @@ -110,7 +109,13 @@ def test_init_df_with_illegal_values_raises(test_pd_df, illegal_value): def test_init_df_with_na_scenario(test_pd_df): # missing values in an index dimension raises an error test_pd_df.loc[1, "scenario"] = np.nan - pytest.raises(ValueError, IamDataFrame, data=test_pd_df) + msg = ( + "Empty cells in `data` \(columns: 'scenario'\):" + r"(\n.*){2}model_a.*NaN.*World.*Primary Energy|Coal.*EJ/yr.*2005.*" + ) + with pytest.raises(ValueError, match=msg): + IamDataFrame(test_pd_df) + def test_init_df_with_float_cols(test_pd_df): From 9283e84a890eb85d6be28f1abdb3f9340748791c Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Wed, 22 Feb 2023 06:31:50 +0100 Subject: [PATCH 4/9] Use pandas to check for complete index --- pyam/utils.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index 728b3f4cb..9b4913ea1 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -290,6 +290,11 @@ def convert_r_columns(c): if c not in index + REQUIRED_COLS + [time_col, "value"] ] + # replace missing units by an empty string for user-friendly filtering + df.loc[df.unit.isnull(), "unit"] = "" + + _validate_complete_index(df[index + REQUIRED_COLS + extra_cols]) + # cast to pd.Series idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols df = df.set_index(idx_cols).value @@ -328,6 +333,8 @@ def convert_r_columns(c): # replace missing units by an empty string for user-friendly filtering df.loc[df.unit.isnull(), "unit"] = "" + _validate_complete_index(df[index + REQUIRED_COLS + extra_cols]) + # cast to long format, set df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True) df = df.stack(dropna=True) @@ -345,22 +352,6 @@ def convert_r_columns(c): short_error = short_error_regex.search(str(e)).group() raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]]) - # verify that there are no nan's in the index - null_rows = np.zeros(len(df), dtype=bool) - null_cols = [] - for _name, _codes in zip(df.index.names, df.index.codes): - _null_fields = [i == -1 for i in _codes] - if any(_null_fields): - null_rows = np.logical_or(null_rows, _null_fields) - null_cols.append(_name) - - if null_cols: - cols = ", ".join(null_cols) - raise_data_error( - f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] - ) - del null_rows - # format the time-column _time = [to_time(i) for i in get_index_levels(df.index, time_col)] df.index = replace_index_labels(df.index, time_col, _time) @@ -377,6 +368,18 @@ def convert_r_columns(c): return df.sort_index(), index, time_col, extra_cols +def _validate_complete_index(df): + """Validate that there are no nan's in the (index) columns""" + null_cells = df.isnull() + null_rows = null_cells.T.any() + if null_rows.any(): + null_cols = null_cells.any() + cols = ", ".join(null_cols[null_cols].index) + raise_data_error( + f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] + ) + del null_rows + def sort_data(data, cols): """Sort data rows and order columns by cols""" return data.sort_values(cols)[cols + ["value"]].reset_index(drop=True) From 5eb1764ad6f7f8cbd3f38ad7b52983ddb3a6ba76 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Wed, 22 Feb 2023 06:38:40 +0100 Subject: [PATCH 5/9] Make black --- pyam/utils.py | 1 + tests/test_core.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pyam/utils.py b/pyam/utils.py index 9b4913ea1..65b65e343 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -380,6 +380,7 @@ def _validate_complete_index(df): ) del null_rows + def sort_data(data, cols): """Sort data rows and order columns by cols""" return data.sort_values(cols)[cols + ["value"]].reset_index(drop=True) diff --git a/tests/test_core.py b/tests/test_core.py index f98ff52f0..939bb0888 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -117,7 +117,6 @@ def test_init_df_with_na_scenario(test_pd_df): IamDataFrame(test_pd_df) - def test_init_df_with_float_cols(test_pd_df): _test_df = test_pd_df.rename(columns={2005: 2005.0, 2010: 2010.0}) obs = IamDataFrame(_test_df).timeseries().reset_index() From 8602ef783d4dd2658907958537e432b656679deb Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Wed, 22 Feb 2023 06:49:03 +0100 Subject: [PATCH 6/9] Add to release notes --- RELEASE_NOTES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index e444d07cc..65dfcd86f 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,6 @@ # Next Release +- [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization - [#723](https://github.com/IAMconsortium/pyam/pull/723) Ensure correct order of `time` attribute # Release v1.7.0 From 4b6207cac506eb679fa08a140c1eb7bb08585f2d Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Wed, 22 Feb 2023 09:38:32 +0100 Subject: [PATCH 7/9] Implement suggestions by @coroa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jonas Hörsch --- pyam/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index 65b65e343..cecf818e2 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -371,10 +371,10 @@ def convert_r_columns(c): def _validate_complete_index(df): """Validate that there are no nan's in the (index) columns""" null_cells = df.isnull() - null_rows = null_cells.T.any() + null_rows = null_cells.any(axis=1) if null_rows.any(): null_cols = null_cells.any() - cols = ", ".join(null_cols[null_cols].index) + cols = ", ".join(null_cols.index[null_cols]) raise_data_error( f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] ) From 5877b308ed1188e15a96379d7de3e707bb09fa75 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Wed, 22 Feb 2023 09:41:37 +0100 Subject: [PATCH 8/9] Remove superfluous deletion --- pyam/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyam/utils.py b/pyam/utils.py index cecf818e2..1df4e9b0e 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -378,7 +378,6 @@ def _validate_complete_index(df): raise_data_error( f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows] ) - del null_rows def sort_data(data, cols): From 3fb9bd622c4fad468abd3d6a5cb63c1fb4fabbc8 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Wed, 22 Feb 2023 09:44:49 +0100 Subject: [PATCH 9/9] Update a comment --- pyam/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyam/utils.py b/pyam/utils.py index 1df4e9b0e..468aa2ac9 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -341,7 +341,7 @@ def convert_r_columns(c): df.name = "value" df.index.names = df.index.names[:-1] + [time_col] - # cast value column to numeric and drop nan + # cast value column to numeric try: df = pd.to_numeric(df) except ValueError as e: