From ac80d408def037617688afa59216bed17d02e995 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Mon, 20 Feb 2023 09:51:36 +0100
Subject: [PATCH 1/9] Refactor `format_data()` to use `stack`

---
 pyam/utils.py | 64 +++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index d1f9de068..eefef73df 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -274,8 +274,9 @@ def convert_r_columns(c):
     if missing_required_col:
         raise ValueError(f"Missing required columns: {missing_required_col}")
 
-    # check whether data in wide format (IAMC) or long format (`value` column)
+    # check whether data in wide format (standard IAMC) or long format (`value` column)
     if "value" in df.columns:
+
         # check if time column is given as `year` (int) or `time` (datetime)
         if "year" in df.columns and "time" not in df.columns:
             time_col = "year"
@@ -288,7 +289,13 @@ def convert_r_columns(c):
             for c in df.columns
             if c not in index + REQUIRED_COLS + [time_col, "value"]
         ]
+
+        # cast to pd.Series
+        idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
+        df = df.set_index(idx_cols).value
+
     else:
+
         # if in wide format, check if columns are years (int) or datetime
         cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
         year_cols, time_cols, extra_cols = [], [], []
@@ -317,43 +324,34 @@ def convert_r_columns(c):
         if not melt_cols:
             raise ValueError("Missing time domain")
 
-        # melt the dataframe
-        df = pd.melt(
-            df,
-            id_vars=index + REQUIRED_COLS + extra_cols,
-            var_name=time_col,
-            value_vars=melt_cols,
-            value_name="value",
-        )
+        # replace missing units by an empty string for user-friendly filtering
+        df.loc[df.unit.isnull(), "unit"] = ""
+
+        # cast to long format, set
+        df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True)
+        df = df.stack(dropna=True)
+        df.name = "value"
+        df.index.names = df.index.names[:-1] + [time_col]
 
     # cast value column to numeric and drop nan
-    try:
-        df["value"] = pd.to_numeric(df["value"])
-    except ValueError as e:
+#    try:
+#        df["value"] = pd.to_numeric(df["value"])
+#    except ValueError as e:
         # get the row number where the error happened
-        row_nr_regex = re.compile(r"(?<=at position )\d+")
-        row_nr = int(row_nr_regex.search(str(e)).group())
-        short_error_regex = re.compile(r".*(?= at position \d*)")
-        short_error = short_error_regex.search(str(e)).group()
-        raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])
-
-    df.dropna(inplace=True, subset=["value"])
-
-    # replace missing units by an empty string for user-friendly filtering
-    df.loc[df.unit.isnull(), "unit"] = ""
+#        row_nr_regex = re.compile(r"(?<=at position )\d+")
+#        row_nr = int(row_nr_regex.search(str(e)).group())
+#        short_error_regex = re.compile(r".*(?= at position \d*)")
+#        short_error = short_error_regex.search(str(e)).group()
+#        raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])
 
     # verify that there are no nan's left (in columns)
-    null_rows = df.isnull().T.any()
-    if null_rows.any():
-        cols = ", ".join(df.columns[df.isnull().any().values])
-        raise_data_error(
-            f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
-        )
-    del null_rows
-
-    # cast to pd.Series, check for duplicates
-    idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
-    df = df.set_index(idx_cols).value
+#    null_rows = df.isnull().T.any()
+#    if null_rows.any():
+#        cols = ", ".join(df.columns[df.isnull().any().values])
+#        raise_data_error(
+#            f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
+#        )
+#    del null_rows
 
     # format the time-column
     _time = [to_time(i) for i in get_index_levels(df.index, time_col)]

From 25de43cbca22cb0ec8585aa1382ea60b2a95e9c4 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Tue, 21 Feb 2023 23:54:37 +0100
Subject: [PATCH 2/9] Fix validation steps

---
 pyam/utils.py | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index eefef73df..728b3f4cb 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -293,6 +293,7 @@ def convert_r_columns(c):
         # cast to pd.Series
         idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
         df = df.set_index(idx_cols).value
+        df.dropna(inplace=True)
 
     else:
 
@@ -334,24 +335,31 @@ def convert_r_columns(c):
         df.index.names = df.index.names[:-1] + [time_col]
 
     # cast value column to numeric and drop nan
-#    try:
-#        df["value"] = pd.to_numeric(df["value"])
-#    except ValueError as e:
+    try:
+        df = pd.to_numeric(df)
+    except ValueError as e:
         # get the row number where the error happened
-#        row_nr_regex = re.compile(r"(?<=at position )\d+")
-#        row_nr = int(row_nr_regex.search(str(e)).group())
-#        short_error_regex = re.compile(r".*(?= at position \d*)")
-#        short_error = short_error_regex.search(str(e)).group()
-#        raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])
-
-    # verify that there are no nan's left (in columns)
-#    null_rows = df.isnull().T.any()
-#    if null_rows.any():
-#        cols = ", ".join(df.columns[df.isnull().any().values])
-#        raise_data_error(
-#            f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
-#        )
-#    del null_rows
+        row_nr_regex = re.compile(r"(?<=at position )\d+")
+        row_nr = int(row_nr_regex.search(str(e)).group())
+        short_error_regex = re.compile(r".*(?= at position \d*)")
+        short_error = short_error_regex.search(str(e)).group()
+        raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])
+
+    # verify that there are no nan's in the index
+    null_rows = np.zeros(len(df), dtype=bool)
+    null_cols = []
+    for _name, _codes in zip(df.index.names, df.index.codes):
+        _null_fields = [i == -1 for i in _codes]
+        if any(_null_fields):
+            null_rows = np.logical_or(null_rows, _null_fields)
+            null_cols.append(_name)
+
+    if null_cols:
+        cols = ", ".join(null_cols)
+        raise_data_error(
+            f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
+        )
+    del null_rows
 
     # format the time-column
     _time = [to_time(i) for i in get_index_levels(df.index, time_col)]

From f371e076c5118e2f6a63bc71550ee0ab3066dbb3 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Tue, 21 Feb 2023 23:54:58 +0100
Subject: [PATCH 3/9] Add match to testing for nan in data index

---
 tests/test_core.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/test_core.py b/tests/test_core.py
index aa0064cc5..f98ff52f0 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -102,7 +102,6 @@ def test_init_df_with_illegal_values_raises(test_pd_df, illegal_value):
         f'.*string "{illegal_value}" in `data`:'
         r"(\n.*){2}model_a.*scen_a.*World.*Primary Energy.*EJ/yr.*2005"
     )
-
     with pytest.raises(ValueError, match=msg):
         IamDataFrame(test_pd_df)
 
@@ -110,7 +109,13 @@ def test_init_df_with_illegal_values_raises(test_pd_df, illegal_value):
 def test_init_df_with_na_scenario(test_pd_df):
     # missing values in an index dimension raises an error
     test_pd_df.loc[1, "scenario"] = np.nan
-    pytest.raises(ValueError, IamDataFrame, data=test_pd_df)
+    msg = (
+        r"Empty cells in `data` \(columns: 'scenario'\):"
+        r"(\n.*){2}model_a.*NaN.*World.*Primary Energy\|Coal.*EJ/yr"  # literal "|"
+    )
+    with pytest.raises(ValueError, match=msg):
+        IamDataFrame(test_pd_df)
+
 
 
 def test_init_df_with_float_cols(test_pd_df):

From 9283e84a890eb85d6be28f1abdb3f9340748791c Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Wed, 22 Feb 2023 06:31:50 +0100
Subject: [PATCH 4/9] Use pandas to check for complete index

---
 pyam/utils.py | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index 728b3f4cb..9b4913ea1 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -290,6 +290,11 @@ def convert_r_columns(c):
             if c not in index + REQUIRED_COLS + [time_col, "value"]
         ]
 
+        # replace missing units by an empty string for user-friendly filtering
+        df.loc[df.unit.isnull(), "unit"] = ""
+
+        _validate_complete_index(df[index + REQUIRED_COLS + extra_cols])
+
         # cast to pd.Series
         idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
         df = df.set_index(idx_cols).value
@@ -328,6 +333,8 @@ def convert_r_columns(c):
         # replace missing units by an empty string for user-friendly filtering
         df.loc[df.unit.isnull(), "unit"] = ""
 
+        _validate_complete_index(df[index + REQUIRED_COLS + extra_cols])
+
         # cast to long format, set
         df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True)
         df = df.stack(dropna=True)
@@ -345,22 +352,6 @@ def convert_r_columns(c):
         short_error = short_error_regex.search(str(e)).group()
         raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])
 
-    # verify that there are no nan's in the index
-    null_rows = np.zeros(len(df), dtype=bool)
-    null_cols = []
-    for _name, _codes in zip(df.index.names, df.index.codes):
-        _null_fields = [i == -1 for i in _codes]
-        if any(_null_fields):
-            null_rows = np.logical_or(null_rows, _null_fields)
-            null_cols.append(_name)
-
-    if null_cols:
-        cols = ", ".join(null_cols)
-        raise_data_error(
-            f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
-        )
-    del null_rows
-
     # format the time-column
     _time = [to_time(i) for i in get_index_levels(df.index, time_col)]
     df.index = replace_index_labels(df.index, time_col, _time)
@@ -377,6 +368,18 @@ def convert_r_columns(c):
     return df.sort_index(), index, time_col, extra_cols
 
 
+def _validate_complete_index(df):
+    """Validate that there are no nan's in the (index) columns"""
+    null_cells = df.isnull()
+    null_rows = null_cells.T.any()
+    if null_rows.any():
+        null_cols = null_cells.any()
+        cols = ", ".join(null_cols[null_cols].index)
+        raise_data_error(
+            f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
+        )
+    del null_rows
+
 def sort_data(data, cols):
     """Sort data rows and order columns by cols"""
     return data.sort_values(cols)[cols + ["value"]].reset_index(drop=True)

From 5eb1764ad6f7f8cbd3f38ad7b52983ddb3a6ba76 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Wed, 22 Feb 2023 06:38:40 +0100
Subject: [PATCH 5/9] Make black

---
 pyam/utils.py      | 1 +
 tests/test_core.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index 9b4913ea1..65b65e343 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -380,6 +380,7 @@ def _validate_complete_index(df):
         )
     del null_rows
 
+
 def sort_data(data, cols):
     """Sort data rows and order columns by cols"""
     return data.sort_values(cols)[cols + ["value"]].reset_index(drop=True)
diff --git a/tests/test_core.py b/tests/test_core.py
index f98ff52f0..939bb0888 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -117,7 +117,6 @@ def test_init_df_with_na_scenario(test_pd_df):
         IamDataFrame(test_pd_df)
 
 
-
 def test_init_df_with_float_cols(test_pd_df):
     _test_df = test_pd_df.rename(columns={2005: 2005.0, 2010: 2010.0})
     obs = IamDataFrame(_test_df).timeseries().reset_index()

From 8602ef783d4dd2658907958537e432b656679deb Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Wed, 22 Feb 2023 06:49:03 +0100
Subject: [PATCH 6/9] Add to release notes

---
 RELEASE_NOTES.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index e444d07cc..65dfcd86f 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,5 +1,6 @@
 # Next Release
 
+- [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization
 - [#723](https://github.com/IAMconsortium/pyam/pull/723) Ensure correct order of `time` attribute
 
 # Release v1.7.0

From 4b6207cac506eb679fa08a140c1eb7bb08585f2d Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Wed, 22 Feb 2023 09:38:32 +0100
Subject: [PATCH 7/9] Implement suggestions by @coroa
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jonas Hörsch <coroa@posteo.de>
---
 pyam/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index 65b65e343..cecf818e2 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -371,10 +371,10 @@ def convert_r_columns(c):
 def _validate_complete_index(df):
     """Validate that there are no nan's in the (index) columns"""
     null_cells = df.isnull()
-    null_rows = null_cells.T.any()
+    null_rows = null_cells.any(axis=1)
     if null_rows.any():
         null_cols = null_cells.any()
-        cols = ", ".join(null_cols[null_cols].index)
+        cols = ", ".join(null_cols.index[null_cols])
         raise_data_error(
             f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
         )

From 5877b308ed1188e15a96379d7de3e707bb09fa75 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Wed, 22 Feb 2023 09:41:37 +0100
Subject: [PATCH 8/9] Remove superfluous deletion

---
 pyam/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index cecf818e2..1df4e9b0e 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -378,7 +378,6 @@ def _validate_complete_index(df):
         raise_data_error(
             f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
         )
-    del null_rows
 
 
 def sort_data(data, cols):

From 3fb9bd622c4fad468abd3d6a5cb63c1fb4fabbc8 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Wed, 22 Feb 2023 09:44:49 +0100
Subject: [PATCH 9/9] Update a comment

---
 pyam/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index 1df4e9b0e..468aa2ac9 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -341,7 +341,7 @@ def convert_r_columns(c):
         df.name = "value"
         df.index.names = df.index.names[:-1] + [time_col]
 
-    # cast value column to numeric and drop nan
+    # cast value column to numeric
     try:
         df = pd.to_numeric(df)
     except ValueError as e: