diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index e444d07cc..65dfcd86f 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,5 +1,6 @@
 # Next Release
 
+- [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization
 - [#723](https://github.com/IAMconsortium/pyam/pull/723) Ensure correct order of `time` attribute
 
 # Release v1.7.0
diff --git a/pyam/utils.py b/pyam/utils.py
index d1f9de068..468aa2ac9 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -274,8 +274,9 @@ def convert_r_columns(c):
     if missing_required_col:
         raise ValueError(f"Missing required columns: {missing_required_col}")
 
-    # check whether data in wide format (IAMC) or long format (`value` column)
+    # check whether data in wide format (standard IAMC) or long format (`value` column)
     if "value" in df.columns:
+
         # check if time column is given as `year` (int) or `time` (datetime)
         if "year" in df.columns and "time" not in df.columns:
             time_col = "year"
@@ -288,7 +289,19 @@ def convert_r_columns(c):
             for c in df.columns
             if c not in index + REQUIRED_COLS + [time_col, "value"]
         ]
+
+        # replace missing units by an empty string for user-friendly filtering
+        df.loc[df.unit.isnull(), "unit"] = ""
+
+        _validate_complete_index(df[index + REQUIRED_COLS + extra_cols])
+
+        # cast to pd.Series
+        idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
+        df = df.set_index(idx_cols).value
+        df.dropna(inplace=True)
+
     else:
+
         # if in wide format, check if columns are years (int) or datetime
         cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
         year_cols, time_cols, extra_cols = [], [], []
@@ -317,18 +330,20 @@ def convert_r_columns(c):
         if not melt_cols:
             raise ValueError("Missing time domain")
 
-        # melt the dataframe
-        df = pd.melt(
-            df,
-            id_vars=index + REQUIRED_COLS + extra_cols,
-            var_name=time_col,
-            value_vars=melt_cols,
-            value_name="value",
-        )
+        # replace missing units by an empty string for user-friendly filtering
+        df.loc[df.unit.isnull(), "unit"] = ""
+
+        _validate_complete_index(df[index + REQUIRED_COLS + extra_cols])
+
+        # cast to long format (series) with the time domain as last index level
+        df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True)
+        df = df.stack(dropna=True)
+        df.name = "value"
+        df.index.names = df.index.names[:-1] + [time_col]
 
-    # cast value column to numeric and drop nan
+    # cast value column to numeric
     try:
-        df["value"] = pd.to_numeric(df["value"])
+        df = pd.to_numeric(df)
     except ValueError as e:
         # get the row number where the error happened
         row_nr_regex = re.compile(r"(?<=at position )\d+")
@@ -337,24 +352,6 @@ def convert_r_columns(c):
         short_error = short_error_regex.search(str(e)).group()
         raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])
 
-    df.dropna(inplace=True, subset=["value"])
-
-    # replace missing units by an empty string for user-friendly filtering
-    df.loc[df.unit.isnull(), "unit"] = ""
-
-    # verify that there are no nan's left (in columns)
-    null_rows = df.isnull().T.any()
-    if null_rows.any():
-        cols = ", ".join(df.columns[df.isnull().any().values])
-        raise_data_error(
-            f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
-        )
-    del null_rows
-
-    # cast to pd.Series, check for duplicates
-    idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
-    df = df.set_index(idx_cols).value
-
     # format the time-column
     _time = [to_time(i) for i in get_index_levels(df.index, time_col)]
     df.index = replace_index_labels(df.index, time_col, _time)
@@ -371,6 +368,18 @@ def convert_r_columns(c):
     return df.sort_index(), index, time_col, extra_cols
 
 
+def _validate_complete_index(df):
+    """Validate that there are no nan's in the (index) columns"""
+    null_cells = df.isnull()
+    null_rows = null_cells.any(axis=1)
+    if null_rows.any():
+        null_cols = null_cells.any()
+        cols = ", ".join(null_cols.index[null_cols])
+        raise_data_error(
+            f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
+        )
+
+
 def sort_data(data, cols):
     """Sort data rows and order columns by cols"""
     return data.sort_values(cols)[cols + ["value"]].reset_index(drop=True)
diff --git a/tests/test_core.py b/tests/test_core.py
index aa0064cc5..939bb0888 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -102,7 +102,6 @@ def test_init_df_with_illegal_values_raises(test_pd_df, illegal_value):
         f'.*string "{illegal_value}" in `data`:'
         r"(\n.*){2}model_a.*scen_a.*World.*Primary Energy.*EJ/yr.*2005"
     )
-
     with pytest.raises(ValueError, match=msg):
         IamDataFrame(test_pd_df)
 
@@ -110,7 +109,12 @@ def test_init_df_with_illegal_values_raises(test_pd_df, illegal_value):
 def test_init_df_with_na_scenario(test_pd_df):
     # missing values in an index dimension raises an error
     test_pd_df.loc[1, "scenario"] = np.nan
-    pytest.raises(ValueError, IamDataFrame, data=test_pd_df)
+    msg = (
+        r"Empty cells in `data` \(columns: 'scenario'\):"
+        r"(\n.*){2}model_a.*NaN.*World.*Primary Energy|Coal.*EJ/yr.*2005.*"
+    )
+    with pytest.raises(ValueError, match=msg):
+        IamDataFrame(test_pd_df)
 
 
 def test_init_df_with_float_cols(test_pd_df):