Optimize TSDataset.describe and TSDataset.info by vectorization #1344

Conversation
Script for testing:

import json
import time

import numpy as np
import pandas as pd
from loguru import logger

from etna.datasets import TSDataset, generate_ar_df
from etna.metrics import MAE
from etna.models import NaiveModel
from etna.pipeline import Pipeline

HORIZON = 14


def make_df(num_segments: int, num_features: int, num_periods: int, random_state: int = 0) -> pd.DataFrame:
    rng = np.random.default_rng(random_state)
    df = generate_ar_df(periods=num_periods, start_time="2020-01-01", n_segments=num_segments)
    for i in range(num_features):
        # add int column
        df[f"new_int_{i}"] = rng.integers(low=-100, high=100, size=df.shape[0])
    return df


def check_time(num_segments: int, num_features: int, num_periods: int = 365):
    df = make_df(num_segments=num_segments, num_features=num_features, num_periods=num_periods)
    df_wide = TSDataset.to_dataset(df)
    ts = TSDataset(df=df_wide, freq="D")

    start_time = time.perf_counter()
    _ = ts.describe()
    elapsed_time = time.perf_counter() - start_time

    return elapsed_time


def main():
    num_segments = [10, 100, 1000, 10_000, 100_000]
    num_features = [0, 3, 10]

    results = []
    for cur_num_segments in num_segments:
        for cur_num_features in num_features:
            time_result = check_time(num_segments=cur_num_segments, num_features=cur_num_features)
            record = {"num_segments": cur_num_segments, "num_features": cur_num_features, "time": time_result}
            results.append(record)
            logger.info(json.dumps(record))

    with open("records.json", "w") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    main()

Results before optimization:

[
{
"num_segments": 10,
"num_features": 0,
"time": 0.007862442000000414
},
{
"num_segments": 10,
"num_features": 3,
"time": 0.00775050900000096
},
{
"num_segments": 10,
"num_features": 10,
"time": 0.00862645999999856
},
{
"num_segments": 100,
"num_features": 0,
"time": 0.06804819900000147
},
{
"num_segments": 100,
"num_features": 3,
"time": 0.05528060099999976
},
{
"num_segments": 100,
"num_features": 10,
"time": 0.05490351599999954
},
{
"num_segments": 1000,
"num_features": 0,
"time": 0.511956906
},
{
"num_segments": 1000,
"num_features": 3,
"time": 0.5077033259999997
},
{
"num_segments": 1000,
"num_features": 10,
"time": 0.49680727800000035
},
{
"num_segments": 10000,
"num_features": 0,
"time": 5.198245515000002
},
{
"num_segments": 10000,
"num_features": 3,
"time": 5.023976880999999
},
{
"num_segments": 10000,
"num_features": 10,
"time": 5.116792693999997
},
{
"num_segments": 100000,
"num_features": 0,
"time": 50.777624478999996
},
{
"num_segments": 100000,
"num_features": 3,
"time": 51.87359783100001
},
{
"num_segments": 100000,
"num_features": 10,
"time": 62.32446584499996
}
]

Results after optimization:

[
{
"num_segments": 10,
"num_features": 0,
"time": 0.006445242999999934
},
{
"num_segments": 10,
"num_features": 3,
"time": 0.005044411000000082
},
{
"num_segments": 10,
"num_features": 10,
"time": 0.00412322800000009
},
{
"num_segments": 100,
"num_features": 0,
"time": 0.007066482000000818
},
{
"num_segments": 100,
"num_features": 3,
"time": 0.006966671999999008
},
{
"num_segments": 100,
"num_features": 10,
"time": 0.006875658000000229
},
{
"num_segments": 1000,
"num_features": 0,
"time": 0.015869262000000717
},
{
"num_segments": 1000,
"num_features": 3,
"time": 0.018922749999999766
},
{
"num_segments": 1000,
"num_features": 10,
"time": 0.019158535000000754
},
{
"num_segments": 10000,
"num_features": 0,
"time": 0.05820048300000025
},
{
"num_segments": 10000,
"num_features": 3,
"time": 0.07253477299999922
},
{
"num_segments": 10000,
"num_features": 10,
"time": 0.0792398090000006
},
{
"num_segments": 100000,
"num_features": 0,
"time": 0.47934153599999973
},
{
"num_segments": 100000,
"num_features": 3,
"time": 0.6095439070000026
},
{
"num_segments": 100000,
"num_features": 10,
"time": 0.9615795119999859
}
]
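For a quick comparison of the two runs, here is a minimal sketch; it assumes each run of the script above was saved under its own (hypothetical) name, records_before.json and records_after.json, since the script always writes records.json:

import json

# Hypothetical file names: the benchmark script writes "records.json",
# so each run would need to be saved separately before comparing.
with open("records_before.json") as f:
    before = json.load(f)
with open("records_after.json") as f:
    after = json.load(f)

# Records are produced in the same (num_segments, num_features) order,
# so a positional zip is enough to compute per-case speedups.
for b, a in zip(before, after):
    speedup = b["time"] / a["time"]
    print(f"segments={b['num_segments']:>6}, features={b['num_features']:>2}: {speedup:.1f}x")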
Results for the script from #1338:

[
{
"num_segments": 10,
"num_features": 0,
"time": 0.18644669199999964
},
{
"num_segments": 10,
"num_features": 3,
"time": 0.44299768199999967
},
{
"num_segments": 10,
"num_features": 10,
"time": 0.3184416309999998
},
{
"num_segments": 100,
"num_features": 0,
"time": 0.40837533100000023
},
{
"num_segments": 100,
"num_features": 3,
"time": 0.49695419899999926
},
{
"num_segments": 100,
"num_features": 10,
"time": 0.6302544880000003
},
{
"num_segments": 1000,
"num_features": 0,
"time": 2.3735116309999995
},
{
"num_segments": 1000,
"num_features": 3,
"time": 2.3557946890000014
},
{
"num_segments": 1000,
"num_features": 10,
"time": 3.484642255999999
},
{
"num_segments": 10000,
"num_features": 0,
"time": 18.414344812
},
{
"num_segments": 10000,
"num_features": 3,
"time": 23.947836302000006
},
{
"num_segments": 10000,
"num_features": 10,
"time": 37.481222474999996
}
]
🚀 Deployed on https://deploy-preview-1344--etna-docs.netlify.app
Nice speed-up!
What is the bottleneck in the backtest metrics computation now?
etna/datasets/tsdataset.py (Outdated)

segments_dict["start_timestamp"] = df.index[min_idx].to_series(index=segments)
segments_dict["end_timestamp"] = df.index[max_idx].to_series(index=segments)
size_borders = min_idx + (size - max_idx - 1)
segments_dict["length"] = pd.Series(size - size_borders, dtype="Int64", index=segments)
Maybe better:

series_len = max_idx - min_idx + 1
num_missing = series_len - np.sum(not_na, axis=0)  # max possible num of not-NaN values minus actual num of not-NaN values
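For context, a minimal illustrative sketch of the vectorized per-segment statistics being discussed, assuming df is the wide frame of target values (timestamps by segments); the names not_na, min_idx, and max_idx mirror the snippet above, but the code here is a stand-in, not the actual TSDataset implementation:

import numpy as np
import pandas as pd

# Illustrative stand-in for the wide target frame: index = timestamps, columns = segments.
df = pd.DataFrame(
    {"segment_0": [np.nan, 1.0, 2.0, 3.0, np.nan], "segment_1": [1.0, np.nan, 2.0, 3.0, 4.0]},
    index=pd.date_range("2020-01-01", periods=5, freq="D"),
)

not_na = ~np.isnan(df.values)
num_timestamps = df.shape[0]

# First and last non-NaN row index per segment, computed without a per-segment Python loop.
min_idx = not_na.argmax(axis=0)
max_idx = num_timestamps - 1 - not_na[::-1].argmax(axis=0)

start_timestamp = df.index[min_idx]
end_timestamp = df.index[max_idx]

# Per the suggestion above: series length and number of missing points from the same arrays.
series_len = max_idx - min_idx + 1
num_missing = series_len - np.sum(not_na, axis=0)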
etna/datasets/tsdataset.py (Outdated)

df = self.df.loc[:, (segments_slice, "target")]
size = df.shape[0]
size is not the best naming; maybe num_timestamps or something like that.
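For readers less familiar with the wide format, a small self-contained example of this kind of MultiIndex column selection; the frame below is built by hand as a stand-in, and slice(None) is used where the real code uses segments_slice (which may select only a subset of segments):

import numpy as np
import pandas as pd

# Tiny wide frame with (segment, feature) MultiIndex columns, similar in shape to TSDataset.df.
columns = pd.MultiIndex.from_product(
    [["segment_0", "segment_1"], ["target", "new_int_0"]], names=["segment", "feature"]
)
index = pd.date_range("2020-01-01", periods=5, freq="D")
df_wide = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)

# Select the "target" column of every segment at once; slice(None) keeps all segments.
target_only = df_wide.loc[:, (slice(None), "target")]
num_timestamps = target_only.shape[0]  # i.e. the value the snippet above calls "size"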
tests/test_datasets/test_dataset.py (Outdated)

assert np.all(segment_df.index == ts.segments)
assert np.all(segment_df.index == ts_info.segments)
Looks strange, since you created the dataframe with this index on the previous line.
Codecov Report
@@            Coverage Diff             @@
##           master    #1344      +/-   ##
==========================================
+ Coverage   88.84%   89.15%   +0.30%
==========================================
  Files          204      204
  Lines        12665    12675      +10
==========================================
+ Hits         11252    11300      +48
+ Misses        1413     1375      -38

... and 4 files with indirect coverage changes
Before submitting (must do checklist)
Proposed Changes
Look at #1341.
Closing issues
Closes #1341.