Skip to content

Commit

Permalink
Improve performance of format_data() (#729)
Browse files Browse the repository at this point in the history
Co-authored-by: Jonas Hörsch <[email protected]>
  • Loading branch information
danielhuppmann and coroa authored Feb 22, 2023
1 parent 8c56dc3 commit 7a97516
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 31 deletions.
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Next Release

- [#729](https://github.com/IAMconsortium/pyam/pull/729) Improve performance at initialization
- [#723](https://github.com/IAMconsortium/pyam/pull/723) Ensure correct order of `time` attribute

# Release v1.7.0
Expand Down
67 changes: 38 additions & 29 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,9 @@ def convert_r_columns(c):
if missing_required_col:
raise ValueError(f"Missing required columns: {missing_required_col}")

# check whether data in wide format (IAMC) or long format (`value` column)
# check whether data in wide format (standard IAMC) or long format (`value` column)
if "value" in df.columns:

# check if time column is given as `year` (int) or `time` (datetime)
if "year" in df.columns and "time" not in df.columns:
time_col = "year"
Expand All @@ -288,7 +289,19 @@ def convert_r_columns(c):
for c in df.columns
if c not in index + REQUIRED_COLS + [time_col, "value"]
]

# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])

# cast to pd.Series
idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
df = df.set_index(idx_cols).value
df.dropna(inplace=True)

else:

# if in wide format, check if columns are years (int) or datetime
cols = [c for c in df.columns if c not in index + REQUIRED_COLS]
year_cols, time_cols, extra_cols = [], [], []
Expand Down Expand Up @@ -317,18 +330,20 @@ def convert_r_columns(c):
if not melt_cols:
raise ValueError("Missing time domain")

# melt the dataframe
df = pd.melt(
df,
id_vars=index + REQUIRED_COLS + extra_cols,
var_name=time_col,
value_vars=melt_cols,
value_name="value",
)
# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""

_validate_complete_index(df[index + REQUIRED_COLS + extra_cols])

# cast to long format, set
df.set_index(index + REQUIRED_COLS + extra_cols, inplace=True)
df = df.stack(dropna=True)
df.name = "value"
df.index.names = df.index.names[:-1] + [time_col]

# cast value column to numeric and drop nan
# cast value column to numeric
try:
df["value"] = pd.to_numeric(df["value"])
df = pd.to_numeric(df)
except ValueError as e:
# get the row number where the error happened
row_nr_regex = re.compile(r"(?<=at position )\d+")
Expand All @@ -337,24 +352,6 @@ def convert_r_columns(c):
short_error = short_error_regex.search(str(e)).group()
raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])

df.dropna(inplace=True, subset=["value"])

# replace missing units by an empty string for user-friendly filtering
df.loc[df.unit.isnull(), "unit"] = ""

# verify that there are no nan's left (in columns)
null_rows = df.isnull().T.any()
if null_rows.any():
cols = ", ".join(df.columns[df.isnull().any().values])
raise_data_error(
f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
)
del null_rows

# cast to pd.Series, check for duplicates
idx_cols = index + REQUIRED_COLS + [time_col] + extra_cols
df = df.set_index(idx_cols).value

# format the time-column
_time = [to_time(i) for i in get_index_levels(df.index, time_col)]
df.index = replace_index_labels(df.index, time_col, _time)
Expand All @@ -371,6 +368,18 @@ def convert_r_columns(c):
return df.sort_index(), index, time_col, extra_cols


def _validate_complete_index(df):
"""Validate that there are no nan's in the (index) columns"""
null_cells = df.isnull()
null_rows = null_cells.any(axis=1)
if null_rows.any():
null_cols = null_cells.any()
cols = ", ".join(null_cols.index[null_cols])
raise_data_error(
f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]
)


def sort_data(data, cols):
    """Return `data` sorted by `cols`, restricted to `cols` plus the value column"""
    ordered = data.sort_values(cols)
    ordered = ordered[cols + ["value"]]
    return ordered.reset_index(drop=True)
Expand Down
8 changes: 6 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,19 @@ def test_init_df_with_illegal_values_raises(test_pd_df, illegal_value):
f'.*string "{illegal_value}" in `data`:'
r"(\n.*){2}model_a.*scen_a.*World.*Primary Energy.*EJ/yr.*2005"
)

with pytest.raises(ValueError, match=msg):
IamDataFrame(test_pd_df)


def test_init_df_with_na_scenario(test_pd_df):
    """Initializing with a nan in an index dimension raises a formatted error"""
    # missing values in an index dimension must raise, naming the affected column
    test_pd_df.loc[1, "scenario"] = np.nan
    # raw strings: `\(` is an invalid escape in a plain string literal, and the
    # pipe in "Primary Energy|Coal" must be escaped to match literally rather
    # than act as regex alternation
    msg = (
        r"Empty cells in `data` \(columns: 'scenario'\):"
        r"(\n.*){2}model_a.*NaN.*World.*Primary Energy\|Coal.*EJ/yr.*2005.*"
    )
    with pytest.raises(ValueError, match=msg):
        IamDataFrame(test_pd_df)


def test_init_df_with_float_cols(test_pd_df):
Expand Down

0 comments on commit 7a97516

Please sign in to comment.