IAMconsortium · gidden · Feb 16, 2023 · Feb 17, 2023 · Feb 17, 2023 · Feb 17, 2023
diff --git a/pyam/core.py b/pyam/core.py
@@ -120,7 +120,7 @@ class IamDataFrame(object):
     for those who are not used to the pandas/Python universe.
     """
 
-    def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
+    def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs):
         """Initialize an instance of an IamDataFrame"""
         if isinstance(data, IamDataFrame):
             if kwargs:
@@ -133,9 +133,9 @@ def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
             for attr, value in data.__dict__.items():
                 setattr(self, attr, value)
         else:
-            self._init(data, meta, index=index, **kwargs)
+            self._init(data, meta, index=index, fast=fast, **kwargs)
 
-    def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
+    def _init(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs):
         """Process data and set attributes for new instance"""
 
         # pop kwarg for meta_sheet_name (prior to reading data from file)
@@ -159,11 +159,12 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
             if not data.is_file():
                 raise FileNotFoundError(f"No such file: '{data}'")
             logger.info(f"Reading file {data}")
-            _data = read_file(data, index=index, **kwargs)
+            _data = read_file(data, index=index, fast=fast, **kwargs)
 
         # cast data from pandas
         elif isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
-            _data = format_data(data.copy(), index=index, **kwargs)
+            _data = data if fast else data.copy()
+            _data = format_data(_data, index=index, fast=fast, **kwargs)
 
         # unsupported `data` args
         elif islistable(data):

diff --git a/pyam/utils.py b/pyam/utils.py
@@ -169,20 +169,94 @@ def is_empty(name, s):
             return False
 
         empty_cols = [c for c in df.columns if is_empty(c, df[c])]
-        return df.drop(columns=empty_cols).dropna(axis=0, how="all")
+        df.drop(columns=empty_cols, inplace=True)
+        df.dropna(axis=0, how="all", inplace=True)
+        return df
 
 
-def read_file(path, *args, **kwargs):
+def read_file(path, *args, fast=False, **kwargs):  # `fast` is keyword-only
     """Read data from a file"""
     # extract kwargs that are intended for `format_data`
     format_kwargs = dict(index=kwargs.pop("index"))
     for c in [i for i in IAMC_IDX + ["year", "time", "value"] if i in kwargs]:
         format_kwargs[c] = kwargs.pop(c)
-    return format_data(read_pandas(path, *args, **kwargs), **format_kwargs)
+    return format_data(read_pandas(path, *args, **kwargs), fast=fast, **format_kwargs)
 
 
-def format_data(df, index, **kwargs):
+def intuit_column_groups(df, index):
+    """Split the columns not in `index` or `REQUIRED_COLS` by their kind
+
+    Returns the extra columns, the time-domain name ("year" or "time") and the
+    columns to melt (sorted); raises ValueError if no column is time-like.
+    """
+    cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
+    year_cols, time_cols, extra_cols = [], [], []
+    for col in cols:
+        try:  # a column name that can be cast to integer is a year column
+            int(col)
+        except (ValueError, TypeError):
+            try:  # otherwise, try casting to datetime
+                dateutil.parser.parse(str(col))
+            except ValueError:  # neither year nor datetime, so an extra-column
+                extra_cols.append(col)
+            else:
+                time_cols.append(col)
+        else:
+            year_cols.append(col)
+    # the time domain is "year" only if there are years but no datetime columns
+    time_col = "year" if year_cols and not time_cols else "time"
+    # `time_cols` is empty in the "year" case, so one expression covers both
+    melt_cols = sorted(year_cols) + sorted(time_cols)
+    if not melt_cols:
+        raise ValueError("Missing time domain")
+    return extra_cols, time_col, melt_cols
+
+
+def fast_format_data(df, index=DEFAULT_META_INDEX):
+    """Convert a pandas.DataFrame to the required format with minimal overhead
+
+    Note that `df` may be modified in place and the result is not sorted.
+    Returns the `value` column indexed by the IAMC, other index/extra and time
+    columns, then `index`, the time-column name and the list of extra columns.
+    """
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("Fast format only works if provided a pd.DataFrame")
+
+    # lower-case all string column names
+    str_cols = [c for c in df.columns if isstr(c)]
+    df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True)
+
+    if "notes" in df.columns:  # this came from the database
+        logger.info("Ignoring notes column in dataframe")
+        df.drop(columns="notes", inplace=True)
+        col = df.columns[0]  # first column has database copyright notice
+        df = df[~df[col].str.contains("database", case=False)]
+
+    col_diff = set(IAMC_IDX) - set(df.columns)
+    if col_diff:
+        raise ValueError(f"Missing required columns: {col_diff}")
+
+    extra_cols, time_col, melt_cols = intuit_column_groups(df, index)
+    # IAMC_IDX first, then the rest in input order (a set would shuffle levels)
+    idx = IAMC_IDX + [c for c in dict.fromkeys(index + extra_cols) if c not in IAMC_IDX]
+    if "value" not in df.columns:
+        df = pd.melt(
+            df, id_vars=idx, var_name=time_col, value_vars=melt_cols, value_name="value"
+        )
+
+    df.dropna(inplace=True, subset=["value"])
+    df.loc[df.unit.isnull(), "unit"] = ""
+
+    # cast to pd.Series and return
+    df.set_index(idx + [time_col], inplace=True)
+    return df.value, index, time_col, extra_cols
+
+
+def format_data(df, index, fast=False, **kwargs):
     """Convert a pandas.Dataframe or pandas.Series to the required format"""
+    if fast:
+        return fast_format_data(df, index)
+
     if isinstance(df, pd.Series):
         df.name = df.name or "value"
         df = df.to_frame()
@@ -368,7 +442,8 @@ def convert_r_columns(c):
     if df.empty:
         logger.warning("Formatted data is empty!")
 
-    return df.sort_index(), index, time_col, extra_cols
+    #    return df.sort_index(), index, time_col, extra_cols
+    return df, index, time_col, extra_cols
 
 
 def sort_data(data, cols):

diff --git a/tests/profile_init.py b/tests/profile_init.py
@@ -0,0 +1,105 @@
+import string
+import numpy as np
+import pandas as pd
+from functools import wraps
+from pathlib import Path
+import time
+
+import pyam
+
+YEARS = range(2010, 2101, 10)  # 2010, 2020, ..., 2100: the year columns of the test data
+
+
+def timeit(func):
+    """Decorate `func` so that calls return a tuple `(elapsed seconds, result)`"""
+
+    @wraps(func)
+    def timeit_wrapper(*args, **kwargs):
+        began = time.perf_counter()
+        outcome = func(*args, **kwargs)
+        return time.perf_counter() - began, outcome
+
+    return timeit_wrapper
+
+
+def join(a):  # concatenate an iterable of strings without a separator
+    return str.join("", a)
+
+
+def gen_str(N, k=1):
+    """Draw a (k, N, len(pyam.IAMC_IDX)) array of random lowercase letters"""
+    shape = (k, N, len(pyam.IAMC_IDX))
+    return np.random.choice(list(string.ascii_lowercase), size=shape)
+
+
+def gen_str_iamc(N, k=1):  # joins the k letters along axis 0 into one string per cell
+    return np.apply_along_axis(join, axis=0, arr=gen_str(N, k))
+
+
+def gen_float(N, years=YEARS):
+    """Draw an (N, len(years)) array of random integers from 0 to 9
+
+    Despite the name, the values are integers sampled from `range(10)`.
+    """
+    # one row per timeseries and one column per year
+    shape = (N, len(years))
+    return np.random.choice(range(10), size=shape)
+
+
+@timeit  # calls return `(elapsed seconds, IamDataFrame)`
+def gen_frame(data, fast):
+    return pyam.IamDataFrame(data=data, fast=fast)
+
+
+def profile(max=5):
+    """Time slow and fast IamDataFrame creation on random data of growing size"""
+    rows = []
+    for N in [int(10**n) for n in np.arange(1, max, step=0.5)]:
+        print(N)
+        for mode in ["slow", "fast"]:
+            try:
+                strdata = pd.DataFrame(gen_str_iamc(N, k=5), columns=pyam.IAMC_IDX)
+                fdata = pd.DataFrame(gen_float(N), columns=YEARS)
+                _data = pd.concat([strdata, fdata], axis=1)
+                elapsed, _ = gen_frame(_data, fast=mode == "fast")
+            except Exception as exc:  # best effort: report the failure and skip
+                print(N, mode, f"failed: {exc!r}")
+                continue
+            print(N, mode, elapsed)
+            rows.append({"N": N, "time": elapsed, "type": mode})
+    return pd.DataFrame(rows, columns=["N", "time", "type"])
+
+
+@timeit  # calls return `(elapsed seconds, IamDataFrame)`
+def gen_frame_from_file(file, fast):
+    return pyam.IamDataFrame(data=file, fast=fast)
+
+
+def profile_file(fname):
+    """Time slow and fast IamDataFrame creation from the file `fname`"""
+    data = {"N": [], "time": [], "type": []}
+    for mode in ("slow", "fast"):
+        elapsed, frame = gen_frame_from_file(fname, fast=(mode == "fast"))
+        for key, value in (("N", len(frame)), ("type", mode), ("time", elapsed)):
+            data[key].append(value)
+    return pd.DataFrame.from_dict(data)
+
+
+def main():
+    """Profile random data and the AR6 dataset; write the timings as csv and png"""
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+
+    # requires downloading AR6 dataset and placing it in the data folder
+    ar6_file = Path("./data/AR6_Scenarios_Database_World_v1.0.csv")
+    timings = pd.concat([profile(max=6), profile_file(fname=ar6_file)]).reset_index()
+    timings.to_csv("profile_init.csv")
+    print(timings)
+    fig, ax = plt.subplots()
+    sns.lineplot(data=timings, x="N", y="time", hue="type", ax=ax)
+    ax.set_xscale("log")
+    fig.savefig("profile_init.png")
+
+
+if __name__ == "__main__":  # run the profiling only when executed as a script
+    main()
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -73,6 +73,12 @@ def test_init_from_iamdf(test_df_year):
     assert test_df_year.scenario == ["scen_b", "scen_foo"]
 
 
+def test_init_fast(test_pd_df):  # an IamDataFrame input would bypass `fast`
+    obs = IamDataFrame(test_pd_df.copy(), fast=True)
+    exp = IamDataFrame(test_pd_df)
+    assert_iamframe_equal(obs, exp)
+
+
 def test_init_from_iamdf_raises(test_df_year):
     # casting an IamDataFrame instance again with extra args fails
     match = "Invalid arguments for initializing from IamDataFrame: {'model': 'foo'}"

diff --git a/tests/test_io.py b/tests/test_io.py
@@ -226,14 +226,16 @@ def test_load_meta_empty(test_pd_df):
     assert_iamframe_equal(obs, exp)
 
 
-def test_load_ssp_database_downloaded_file(test_pd_df):
+@pytest.mark.parametrize("fast", [True, False])  # cover both parsing paths
+def test_load_ssp_database_downloaded_file(test_pd_df, fast):
     exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
     file = TEST_DATA_DIR / "test_SSP_database_raw_download.xlsx"
-    obs_df = IamDataFrame(file)
+    obs_df = IamDataFrame(file, fast=fast)
     pd.testing.assert_frame_equal(obs_df.as_pandas(), exp)
 
 
 def test_load_rcp_database_downloaded_file(test_pd_df):
+    # RCP data not tested for fast at present because it requires additional processing
     exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
     file = TEST_DATA_DIR / "test_RCP_database_raw_download.xlsx"
     obs_df = IamDataFrame(file)