From 150e90dfd240dd4f34bd421cbb628427093ed88c Mon Sep 17 00:00:00 2001
From: Jonas Hoersch <jonas.hoersch@climateanalytics.org>
Date: Wed, 22 Feb 2023 01:32:12 +0100
Subject: [PATCH 1/6] Add fast-path to format data

Co-authored-by: Matthew Gidden <matthew.gidden@gmail.com>
---
 pyam/utils.py | 79 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 25 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index fd7574664..42a26cb6a 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -252,8 +252,14 @@ def _format_from_legacy_database(df):
 def _intuit_column_groups(df, index):
     """Check and categorise columns in dataframe"""
 
+    existing_cols = pd.Index(df.index.names).dropna()  # skip unnamed columns
+    if isinstance(df, pd.Series):
+        existing_cols = existing_cols.union(["value"])
+    elif isinstance(df, pd.DataFrame):
+        existing_cols = existing_cols.union(df.columns)
+
     # check that there is no column in the timeseries data with reserved names
-    conflict_cols = [i for i in df.columns if i in ILLEGAL_COLS]
+    conflict_cols = [i for i in existing_cols if i in ILLEGAL_COLS]
     if conflict_cols:
         msg = f"Column name {conflict_cols} is illegal for timeseries data.\n"
         _args = ", ".join([f"{i}_1='{i}'" for i in conflict_cols])
@@ -261,32 +267,32 @@ def _intuit_column_groups(df, index):
         raise ValueError(msg)
 
     # check that index and required columns exist
-    missing_index = [c for c in index if c not in df.columns]
+    missing_index = [c for c in index if c not in existing_cols]
     if missing_index:
         raise ValueError(f"Missing index columns: {missing_index}")
 
-    missing_required_col = [c for c in REQUIRED_COLS if c not in df.columns]
+    missing_required_col = [c for c in REQUIRED_COLS if c not in existing_cols]
     if missing_required_col:
         raise ValueError(f"Missing required columns: {missing_required_col}")
 
     # check whether data in wide format (standard IAMC) or long format (`value` column)
-    if "value" in df.columns:
+    if "value" in existing_cols:
         # check if time column is given as `year` (int) or `time` (datetime)
-        if "year" in df.columns and "time" not in df.columns:
+        if "year" in existing_cols and "time" not in existing_cols:
             time_col = "year"
-        elif "time" in df.columns and "year" not in df.columns:
+        elif "time" in existing_cols and "year" not in existing_cols:
             time_col = "time"
         else:
             raise ValueError("Invalid time domain, must have either `year` or `time`!")
         extra_cols = [
             c
-            for c in df.columns
+            for c in existing_cols
             if c not in index + REQUIRED_COLS + [time_col, "value"]
         ]
         data_cols = []
     else:
         # if in wide format, check if columns are years (int) or datetime
-        cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
+        cols = [c for c in existing_cols if c not in index + REQUIRED_COLS]
         year_cols, time_cols, extra_cols = [], [], []
         for i in cols:
             # if the column name can be cast to integer, assume it's a year column
@@ -344,29 +350,52 @@ def _format_data_to_series(df, index):
 def format_data(df, index, **kwargs):
     """Convert a pandas.Dataframe or pandas.Series to the required format"""
 
-    if isinstance(df, pd.Series):
-        if not df.name:
-            df = df.rename("value")
-        df = df.reset_index()
-    elif not list(df.index.names) == [None]:
-        # reset the index if meaningful entries are included there
-        df = df.reset_index()
+    if set(df.index.names) >= set(index) | set(REQUIRED_COLS) and not kwargs:
+        # Let's try to cut corners here, it's our fast-path
+        time_col, extra_cols, data_cols = _intuit_column_groups(df, index=index)
+
+        if isinstance(df, pd.DataFrame):
+            extra_cols_not_in_index = [c for c in extra_cols if c in df.columns]
+            if extra_cols_not_in_index:
+                df = df.set_index(extra_cols_not_in_index, append=True)
 
-    df = _convert_r_columns(df)
+            if data_cols:
+                df = df[data_cols]
+                df = df.rename_axis(columns=time_col)
+                df = df.stack()
+                df = df.rename("value")
+            else:
+                df = df["value"]
 
-    if kwargs:
-        df = _knead_data(df, **kwargs)
+        df = df.reorder_levels(index + REQUIRED_COLS + extra_cols + [time_col])
 
-    # cast all columns names to lower case
-    df.rename(columns={c: str(c).lower() for c in df.columns if isstr(c)}, inplace=True)
+    else:
+    
+        if isinstance(df, pd.Series):
+            if not df.name:
+                df = df.rename("value")
+            df = df.reset_index()
+        elif not list(df.index.names) == [None]:
+            # reset the index if meaningful entries are included there
+            df = df.reset_index()
+
+        df = _convert_r_columns(df)
+
+        if kwargs:
+            df = _knead_data(df, **kwargs)
+
+        # cast all column names to lower case
+        df.rename(
+            columns={c: str(c).lower() for c in df.columns if isstr(c)}, inplace=True
+        )
 
-    if "notes" in df.columns:  # this came from a legacy database (SSP or earlier)
-        df = _format_from_legacy_database(df)
+        if "notes" in df.columns:  # this came from a legacy database (SSP or earlier)
+            df = _format_from_legacy_database(df)
 
-    # replace missing units by an empty string for user-friendly filtering
-    df = df.assign(unit=df["unit"].fillna(""))
+        # replace missing units by an empty string for user-friendly filtering
+        df = df.assign(unit=df["unit"].fillna(""))
 
-    df, time_col, extra_cols = _format_data_to_series(df, index)
+        df, time_col, extra_cols = _format_data_to_series(df, index)
 
     # cast value column to numeric
     try:

From a011b9756e75591e0d32e51bf7d17a1299319a69 Mon Sep 17 00:00:00 2001
From: Jonas Hoersch <jonas.hoersch@climateanalytics.org>
Date: Wed, 22 Feb 2023 12:46:00 +0100
Subject: [PATCH 2/6] Add missing dropna and fix column order

---
 pyam/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index 42a26cb6a..f639cc3b1 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -367,10 +367,9 @@ def format_data(df, index, **kwargs):
             else:
                 df = df["value"]
 
-        df = df.reorder_levels(index + REQUIRED_COLS + extra_cols + [time_col])
+        df = df.reorder_levels(index + REQUIRED_COLS + [time_col] + extra_cols).dropna()
 
     else:
-    
         if isinstance(df, pd.Series):
             if not df.name:
                 df = df.rename("value")

From 3c1988660e16d01caae05802825ef718d5bedbf6 Mon Sep 17 00:00:00 2001
From: Jonas Hoersch <jonas.hoersch@climateanalytics.org>
Date: Mon, 27 Feb 2023 08:40:59 +0100
Subject: [PATCH 3/6] Style suggestion

Black puts it into one line. No haggling.

Co-authored-by: Matthew Gidden <matthew.gidden@gmail.com>
---
 pyam/utils.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index f639cc3b1..041ad7bb0 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -360,10 +360,7 @@ def format_data(df, index, **kwargs):
                 df = df.set_index(extra_cols_not_in_index, append=True)
 
             if data_cols:
-                df = df[data_cols]
-                df = df.rename_axis(columns=time_col)
-                df = df.stack()
-                df = df.rename("value")
+                df = df[data_cols].rename_axis(columns=time_col).stack().rename("value")
             else:
                 df = df["value"]
 

From 3effa08be667ceab0529c01d968c5d5f98d36ea8 Mon Sep 17 00:00:00 2001
From: Jonas Hoersch <jonas.hoersch@climateanalytics.org>
Date: Mon, 27 Feb 2023 15:24:06 +0100
Subject: [PATCH 4/6] Apply review suggestions on fast-path comment

Co-authored-by: Daniel Huppmann <huppmann@iiasa.ac.at>
---
 pyam/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index 041ad7bb0..4f51311b4 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -350,8 +350,8 @@ def _format_data_to_series(df, index):
 def format_data(df, index, **kwargs):
     """Convert a pandas.Dataframe or pandas.Series to the required format"""
 
+    # Fast-pass if `df` has the index and required columns as a pd.MultiIndex
     if set(df.index.names) >= set(index) | set(REQUIRED_COLS) and not kwargs:
-        # Let's try to cut corners here, it's our fast-path
         time_col, extra_cols, data_cols = _intuit_column_groups(df, index=index)
 
         if isinstance(df, pd.DataFrame):

From 4bc9185379bd752e8ad4aacab81594630a2e2468 Mon Sep 17 00:00:00 2001
From: Jonas Hoersch <jonas.hoersch@climateanalytics.org>
Date: Mon, 27 Feb 2023 15:39:05 +0100
Subject: [PATCH 5/6] Make choice to examine index levels explicit

---
 pyam/utils.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index 4f51311b4..721b4ccc4 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -249,10 +249,13 @@ def _format_from_legacy_database(df):
     return df
 
 
-def _intuit_column_groups(df, index):
+def _intuit_column_groups(df, index, include_index=False):
     """Check and categorise columns in dataframe"""
 
-    existing_cols = pd.Index(df.index.names).dropna()  # skip unnamed columns
+    if include_index:
+        existing_cols = pd.Index(df.index.names)
+    else:
+        existing_cols = pd.Index([])
     if isinstance(df, pd.Series):
         existing_cols = existing_cols.union(["value"])
     elif isinstance(df, pd.DataFrame):
@@ -352,7 +355,9 @@ def format_data(df, index, **kwargs):
 
     # Fast-pass if `df` has the index and required columns as a pd.MultiIndex
     if set(df.index.names) >= set(index) | set(REQUIRED_COLS) and not kwargs:
-        time_col, extra_cols, data_cols = _intuit_column_groups(df, index=index)
+        time_col, extra_cols, data_cols = _intuit_column_groups(
+            df, index=index, include_index=True
+        )
 
         if isinstance(df, pd.DataFrame):
             extra_cols_not_in_index = [c for c in extra_cols if c in df.columns]

From ab6b32e3cd0cc037f79ecb993ab1d20784179c26 Mon Sep 17 00:00:00 2001
From: Jonas Hoersch <jonas.hoersch@climateanalytics.org>
Date: Mon, 27 Feb 2023 20:26:30 +0100
Subject: [PATCH 6/6] Add entry to release notes

---
 RELEASE_NOTES.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 21e2bb4d5..d997366ca 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,5 +1,6 @@
 # Next Release
 
+- [#731](https://github.com/IAMconsortium/pyam/pull/731) Add fast-path to initialization for data whose MultiIndex already contains the index and required columns
 - [#732](https://github.com/IAMconsortium/pyam/pull/732) Fix a few typos in tutorials
 - [#730](https://github.com/IAMconsortium/pyam/pull/730) Refactor initialization code
 - [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization