Skip to content

Commit

Permalink
Improve performance of format_data() (#729)
Browse files Browse the repository at this point in the history
Co-authored-by: Jonas Hörsch <[email protected]>
  • Loading branch information
danielhuppmann and coroa authored Feb 22, 2023
1 parent 8c56dc3 commit 7a97516
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 31 deletions.
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Next Release

- [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization
- [#723](https://github.com/IAMconsortium/pyam/pull/723) Ensure correct order of `time` attribute

# Release v1.7.0
Expand Down
67 changes: 38 additions & 29 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,9 @@ def convert_r_columns(c):
if missing_required_col:
raise ValueError(f"Missing required columns: {missing_required_col}")

# check whether data in wide format (IAMC) or long format (`value` column)
# check whether data in wide format (standard IAMC) or long format (`value` column)
if "value" in df.columns:

# check if time column is given as `year` (int) or `time` (datetime)
if "year" in df.columns and "time" not in df.columns:
time_col = "year"
Expand All @@ -288,7 +289,19 @@ def convert_r_columns(c):
for c in df.columns
if c not in index + REQUIRED_COLS + [time_col, "value"]
]

# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])

# cast to pd.Series
idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
df = df.set_index(idx_cols).value
df.dropna(inplace=True)

else:

# if in wide format, check if columns are years (int) or datetime
cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
year_cols, time_cols, extra_cols = [], [], []
Expand Down Expand Up @@ -317,18 +330,20 @@ def convert_r_columns(c):
if not melt_cols:
raise ValueError("Missing time domain")

# melt the dataframe
df = pd.melt(
df,
id_vars=index + REQUIRED_COLS + extra_cols,
var_name=time_col,
value_vars=melt_cols,
value_name="value",
)
# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])

# cast to long format, set
df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True)
df = df.stack(dropna=True)
df.name = "value"
df.index.names = df.index.names[:-1] + [time_col]

# cast value column to numeric and drop nan
# cast value column to numeric
try:
df["value"] = pd.to_numeric(df["value"])
df = pd.to_numeric(df)
except ValueError as e:
# get the row number where the error happened
row_nr_regex = re.compile(r"(?<=at position )\d+")
Expand All @@ -337,24 +352,6 @@ def convert_r_columns(c):
short_error = short_error_regex.search(str(e)).group()
raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])

df.dropna(inplace=True, subset=["value"])

# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""

# verify that there are no nan's left (in columns)
null_rows = df.isnull().T.any()
if null_rows.any():
cols = ", ".join(df.columns[df.isnull().any().values])
raise_data_error(
f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
)
del null_rows

# cast to pd.Series, check for duplicates
idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
df = df.set_index(idx_cols).value

# format the time-column
_time = [to_time(i) for i in get_index_levels(df.index, time_col)]
df.index = replace_index_labels(df.index, time_col, _time)
Expand All @@ -371,6 +368,18 @@ def convert_r_columns(c):
return df.sort_index(), index, time_col, extra_cols


def _validate_complete_index(df):
"""Validate that there are no nan's in the (index) columns"""
null_cells = df.isnull()
null_rows = null_cells.any(axis=1)
if null_rows.any():
null_cols = null_cells.any()
cols = ", ".join(null_cols.index[null_cols])
raise_data_error(
f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
)


def sort_data(data, cols):
    """Return `data` sorted by `cols`, restricted to `cols` plus the value column"""
    ordered = data.sort_values(cols)
    ordered = ordered[cols + ["value"]]
    return ordered.reset_index(drop=True)
Expand Down
8 changes: 6 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,19 @@ def test_init_df_with_illegal_values_raises(test_pd_df, illegal_value):
f'.*string "{illegal_value}" in `data`:'
r"(\n.*){2}model_a.*scen_a.*World.*Primary Energy.*EJ/yr.*2005"
)

with pytest.raises(ValueError, match=msg):
IamDataFrame(test_pd_df)


def test_init_df_with_na_scenario(test_pd_df):
    """Initializing with a nan in an index dimension raises a formatted error"""
    # missing values in an index dimension must raise, naming the affected column
    test_pd_df.loc[1, "scenario"] = np.nan
    # raw strings: `\(` is an invalid escape in a plain string literal, and the
    # pipe in "Primary Energy|Coal" must be escaped to match literally rather
    # than act as regex alternation
    msg = (
        r"Empty cells in `data` \(columns: 'scenario'\):"
        r"(\n.*){2}model_a.*NaN.*World.*Primary Energy\|Coal.*EJ/yr.*2005.*"
    )
    with pytest.raises(ValueError, match=msg):
        IamDataFrame(test_pd_df)


def test_init_df_with_float_cols(test_pd_df):
Expand Down

0 comments on commit 7a97516

Please sign in to comment.