Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

initial attempt at a fast init #726

Closed
wants to merge 12 commits into from
11 changes: 6 additions & 5 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ class IamDataFrame(object):
for those who are not used to the pandas/Python universe.
"""

def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs):
"""Initialize an instance of an IamDataFrame"""
if isinstance(data, IamDataFrame):
if kwargs:
Expand All @@ -133,9 +133,9 @@ def __init__(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
for attr, value in data.__dict__.items():
setattr(self, attr, value)
else:
self._init(data, meta, index=index, **kwargs)
self._init(data, meta, index=index, fast=fast, **kwargs)

def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
def _init(self, data, meta=None, index=DEFAULT_META_INDEX, fast=False, **kwargs):
"""Process data and set attributes for new instance"""

# pop kwarg for meta_sheet_name (prior to reading data from file)
Expand All @@ -159,11 +159,12 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
if not data.is_file():
raise FileNotFoundError(f"No such file: '{data}'")
logger.info(f"Reading file {data}")
_data = read_file(data, index=index, **kwargs)
_data = read_file(data, index=index, fast=fast, **kwargs)

# cast data from pandas
elif isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
_data = format_data(data.copy(), index=index, **kwargs)
_data = data if fast else data.copy()
_data = format_data(_data, index=index, fast=fast, **kwargs)
coroa marked this conversation as resolved.
Show resolved Hide resolved

# unsupported `data` args
elif islistable(data):
Expand Down
85 changes: 80 additions & 5 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,20 +169,94 @@ def is_empty(name, s):
return False

empty_cols = [c for c in df.columns if is_empty(c, df[c])]
return df.drop(columns=empty_cols).dropna(axis=0, how="all")
df.drop(columns=empty_cols, inplace=True)
df.dropna(axis=0, how="all", inplace=True)
return df
Comment on lines +172 to +174
Copy link
Collaborator

@coroa coroa Feb 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

inplace=True seldom gives you a speed-up with pandas.

https://stackoverflow.com/a/60020384/2873952 .

reset_index, fillna, clip are exceptions, but there is not a good list.
In general, `inplace=True` can give a sizable speed-up when the size and the data type of the data stay the same and the only alternative would be to copy the data in order not to operate in place. In other situations it most likely does not change anything (other than forcing your users to be careful).

`drop` along columns can be done on a view, so `inplace=True` doesn't matter there. `dropna` might actually be faster in place, but you probably have to copy the data in both cases.



def read_file(path, *args, **kwargs):
def read_file(path, fast=False, *args, **kwargs):
"""Read data from a file"""
# extract kwargs that are intended for `format_data`
format_kwargs = dict(index=kwargs.pop("index"))
for c in [i for i in IAMC_IDX + ["year", "time", "value"] if i in kwargs]:
format_kwargs[c] = kwargs.pop(c)
return format_data(read_pandas(path, *args, **kwargs), **format_kwargs)
return format_data(read_pandas(path, *args, **kwargs), fast=fast, **format_kwargs)


def format_data(df, index, **kwargs):
def intuit_column_groups(df, index):
    """Classify the non-index columns of `df` as year, datetime or extra columns

    Every column that is neither in `index` nor in `REQUIRED_COLS` is inspected:
    a name that can be cast to an integer is taken as a year, a name that
    `dateutil` can parse is taken as a datetime, anything else is an extra column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose column names are inspected (the values are not read).
    index : list
        Names of the index columns, excluded from the classification.

    Returns
    -------
    extra_cols : list
        Columns that are neither year-like nor datetime-like.
    time_col : str
        "year" if there are year-like and no datetime-like columns, else "time".
    melt_cols : list
        Sorted year-like columns, followed by sorted datetime-like columns
        when the time domain is "time".

    Raises
    ------
    ValueError
        If no column is year-like or datetime-like ("Missing time domain").
    """
    # build the lookup once instead of concatenating lists for every column
    known_cols = set(index) | set(REQUIRED_COLS)
    year_cols, time_cols, extra_cols = [], [], []
    for col in df.columns:
        if col in known_cols:
            continue

        # if the column name can be cast to integer, assume it's a year column
        try:
            int(col)
        except (ValueError, TypeError):
            pass
        else:
            year_cols.append(col)
            continue

        # otherwise, try casting to datetime
        try:
            dateutil.parser.parse(str(col))
        except (ValueError, OverflowError):
            # neither year nor datetime, so it is an extra-column; dateutil
            # raises OverflowError (not ValueError) for out-of-range numbers
            extra_cols.append(col)
        else:
            time_cols.append(col)

    if year_cols and not time_cols:
        time_col = "year"
        melt_cols = sorted(year_cols)
    else:
        time_col = "time"
        melt_cols = sorted(year_cols) + sorted(time_cols)
    if not melt_cols:
        raise ValueError("Missing time domain")
    return extra_cols, time_col, melt_cols


def fast_format_data(df, index=DEFAULT_META_INDEX):
    """Convert a wide-format pandas.DataFrame to the internal data format

    NOTE: this function is mostly copied from `format_data()` with a few
    optimisations. The copied parts could in principle be broken out into
    separate helpers that are called by both.

    The column labels of `df` are lower-cased in place and a "notes" column is
    dropped in place, i.e., the object passed by the caller is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the `IAMC_IDX` columns plus year- or datetime-like columns.
    index : list, optional
        Names of the index columns.

    Returns
    -------
    data : pandas.Series
        The "value" column, indexed by the `IAMC_IDX` columns, any further
        index/extra columns and the time column.
    index : list
        The `index` argument, passed through unchanged.
    time_col : str
        Name of the time domain, as returned by `intuit_column_groups`.
    extra_cols : list
        Extra columns, as returned by `intuit_column_groups`.

    Raises
    ------
    TypeError
        If `df` is not a pandas.DataFrame.
    ValueError
        If any of the `IAMC_IDX` columns is missing.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Fast format only works if provided a pd.DataFrame")

    # all lower case
    str_cols = [c for c in df.columns if isstr(c)]
    df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True)

    if "notes" in df.columns:  # this came from the database
        logger.info("Ignoring notes column in dataframe")
        df.drop(columns="notes", inplace=True)
        col = df.columns[0]  # first column has database copyright notice
        # `na=False`: an empty cell must yield False rather than a missing
        # value, otherwise the mask cannot be inverted with `~`
        is_notice = df[col].str.contains("database", case=False, na=False)
        df = df[~is_notice]

    col_diff = set(IAMC_IDX) - set(df.columns)
    if col_diff:
        raise ValueError(f"Missing required columns: {col_diff}")

    extra_cols, time_col, melt_cols = intuit_column_groups(df, index)
    # build idx in expected order with IAMC_IDX first; the remaining columns are
    # de-duplicated in order of appearance so the level order is reproducible
    # (iterating a set of strings gives a different order from run to run)
    other_cols = [c for c in dict.fromkeys(index + extra_cols) if c not in IAMC_IDX]
    idx = IAMC_IDX + other_cols
    if "value" not in df.columns:
        df = pd.melt(
            df,
            id_vars=idx,
            var_name=time_col,
            value_vars=melt_cols,
            value_name="value",
        )

    df.dropna(inplace=True, subset=["value"])
    df.loc[df.unit.isnull(), "unit"] = ""

    # cast to pd.Series and return
    idx_cols = idx + [time_col]
    df.set_index(idx_cols, inplace=True)
    # TODO: not sure whether the result needs to be sorted by index here
    return df.value, index, time_col, extra_cols


def format_data(df, index, fast=False, **kwargs):
"""Convert a pandas.Dataframe or pandas.Series to the required format"""
if fast:
return fast_format_data(df, index)

if isinstance(df, pd.Series):
df.name = df.name or "value"
df = df.to_frame()
Expand Down Expand Up @@ -368,7 +442,8 @@ def convert_r_columns(c):
if df.empty:
logger.warning("Formatted data is empty!")

return df.sort_index(), index, time_col, extra_cols
# return df.sort_index(), index, time_col, extra_cols
return df, index, time_col, extra_cols


def sort_data(data, cols):
Expand Down
105 changes: 105 additions & 0 deletions tests/profile_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import string
gidden marked this conversation as resolved.
Show resolved Hide resolved
import numpy as np
import pandas as pd
from functools import wraps
from pathlib import Path
import time

import pyam

# decadal steps 2010, 2020, ..., 2100: column labels of the synthetic timeseries
YEARS = range(2010, 2101, 10)


def timeit(func):
    """Decorate `func` so that each call returns ``(elapsed_seconds, result)``

    The elapsed time is measured with `time.perf_counter` around the call.
    """

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        return elapsed, outcome

    return timeit_wrapper


def join(a):
    """Concatenate the strings in `a` without a separator"""
    return str.join("", a)


def gen_str(N, k=1):
    """Draw random lowercase letters, shaped ``(k, N, len(pyam.IAMC_IDX))``"""
    letters = list(string.ascii_lowercase)
    shape = (k, N, len(pyam.IAMC_IDX))
    return np.random.choice(letters, size=shape)


def gen_str_iamc(N, k=1):
    """Generate random `k`-letter strings, one per row and IAMC index column

    The letters drawn by `gen_str` are joined along the first axis, so every
    cell of the result is the concatenation of `k` random letters.
    """
    letters = gen_str(N, k)
    return np.apply_along_axis(join, axis=0, arr=letters)


def gen_float(N, years=YEARS):
    """Draw random values, one row per timeseries and one column per year

    Despite the name, the values are integers drawn from ``range(10)``.
    """
    shape = (N, len(years))
    return np.random.choice(range(10), size=shape)


@timeit
def gen_frame(data, fast):
    """Initialize an IamDataFrame from `data`; returns ``(seconds, frame)``"""
    frame = pyam.IamDataFrame(data, fast=fast)
    return frame


def profile(max=5):
    """Time slow and fast initialization on random frames of growing size

    Parameters
    ----------
    max : number, optional
        Exclusive upper bound of the base-10 exponent of the row count; the
        sizes are ``int(10**n)`` for ``n`` in ``np.arange(1, max, step=0.5)``.

    Returns
    -------
    pandas.DataFrame
        One row per successful run with the columns "N" (number of rows),
        "time" (seconds) and "type" ("slow" or "fast").
    """
    records = {"N": [], "time": [], "type": []}
    for N in [int(10**n) for n in np.arange(1, max, step=0.5)]:
        print(N)
        for mode in ["slow", "fast"]:
            try:
                strdata = pd.DataFrame(gen_str_iamc(N, k=5), columns=pyam.IAMC_IDX)
                fdata = pd.DataFrame(gen_float(N), columns=YEARS)
                _data = pd.concat([strdata, fdata], axis=1)
                elapsed, _ = gen_frame(_data, fast=mode == "fast")
            except Exception as err:
                # a failing size is skipped, but reported instead of silently
                # dropped; KeyboardInterrupt still aborts a long profiling run
                print(f"{N} {mode} failed: {err!r}")
                continue
            print(N, mode, elapsed)
            records["N"].append(N)
            records["type"].append(mode)
            records["time"].append(elapsed)
    return pd.DataFrame.from_dict(records)


@timeit
def gen_frame_from_file(file, fast):
    """Initialize an IamDataFrame from `file`; returns ``(seconds, frame)``"""
    frame = pyam.IamDataFrame(file, fast=fast)
    return frame


def profile_file(fname):
    """Time slow and fast initialization of an IamDataFrame from the file `fname`

    Returns a pandas.DataFrame with one row per mode and the columns "N"
    (length of the resulting frame), "time" (seconds) and "type".
    """
    records = {"N": [], "time": [], "type": []}
    for mode in ("slow", "fast"):
        use_fast = mode == "fast"
        elapsed, frame = gen_frame_from_file(fname, fast=use_fast)
        records["N"].append(len(frame))
        records["type"].append(mode)
        records["time"].append(elapsed)
    return pd.DataFrame.from_dict(records)


def main():
    """Run both profiles, then write the timings to a csv file and a plot"""
    # requires downloading AR6 dataset and placing it in the data folder
    import matplotlib.pyplot as plt
    import seaborn as sns

    ar6_file = Path("./data/AR6_Scenarios_Database_World_v1.0.csv")

    synthetic = profile(max=6)
    from_file = profile_file(fname=ar6_file)
    timings = pd.concat([synthetic, from_file]).reset_index()
    timings.to_csv("profile_init.csv")
    print(timings)

    # one line per init type, number of rows on a logarithmic x-axis
    fig, ax = plt.subplots()
    sns.lineplot(data=timings, x="N", y="time", hue="type", ax=ax)
    ax.set(xscale="log")
    fig.savefig("profile_init.png")


if __name__ == "__main__":
main()
6 changes: 6 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ def test_init_from_iamdf(test_df_year):
assert test_df_year.scenario == ["scen_b", "scen_foo"]


def test_init_fast(test_df_year):
    # initializing from an IamDataFrame only copies its attributes and never
    # forwards `fast`, so the comparison has to start from pandas data to
    # exercise the fast code path at all
    data = test_df_year.timeseries().reset_index()

    # the fast path may modify its input in place, so each init gets a copy
    obs = IamDataFrame(data.copy(), fast=True)
    exp = IamDataFrame(data.copy())
    assert_iamframe_equal(obs, exp)


def test_init_from_iamdf_raises(test_df_year):
# casting an IamDataFrame instance again with extra args fails
match = "Invalid arguments for initializing from IamDataFrame: {'model': 'foo'}"
Expand Down
6 changes: 4 additions & 2 deletions tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,14 +226,16 @@ def test_load_meta_empty(test_pd_df):
assert_iamframe_equal(obs, exp)


def test_load_ssp_database_downloaded_file(test_pd_df):
@pytest.mark.parametrize("fast", [True, False])
def test_load_ssp_database_downloaded_file(test_pd_df, fast):
exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
file = TEST_DATA_DIR / "test_SSP_database_raw_download.xlsx"
obs_df = IamDataFrame(file)
obs_df = IamDataFrame(file, fast=fast)
pd.testing.assert_frame_equal(obs_df.as_pandas(), exp)


def test_load_rcp_database_downloaded_file(test_pd_df):
# RCP data not tested for fast at present because it requires additional processing
exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
file = TEST_DATA_DIR / "test_RCP_database_raw_download.xlsx"
obs_df = IamDataFrame(file)
Expand Down