diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py index 281b18ca924..6947fdd68d0 100644 --- a/asv_bench/benchmarks/benchmarks.py +++ b/asv_bench/benchmarks/benchmarks.py @@ -52,11 +52,9 @@ ] -def trigger_execution(func): - def real_executor(*arg, **kwargs): - return func(*arg, **kwargs).shape - - return real_executor +def execute(df): + "Make sure the calculations are done." + return df.shape class TimeMultiColumnGroupby: @@ -69,13 +67,11 @@ def setup(self, data_size, count_columns): ) self.groupby_columns = [col for col in self.df.columns[:count_columns]] - @trigger_execution def time_groupby_agg_quan(self, data_size, count_columns): - return self.df.groupby(by=self.groupby_columns).agg("quantile") + execute(self.df.groupby(by=self.groupby_columns).agg("quantile")) - @trigger_execution def time_groupby_agg_mean(self, data_size, count_columns): - return self.df.groupby(by=self.groupby_columns).apply(lambda df: df.mean()) + execute(self.df.groupby(by=self.groupby_columns).apply(lambda df: df.mean())) class TimeGroupByDefaultAggregations: @@ -90,21 +86,17 @@ def setup(self, data_size): ) self.groupby_column = self.df.columns[0] - @trigger_execution def time_groupby_count(self, data_size): - return self.df.groupby(by=self.groupby_column).count() + execute(self.df.groupby(by=self.groupby_column).count()) - @trigger_execution def time_groupby_size(self, data_size): - return self.df.groupby(by=self.groupby_column).size() + execute(self.df.groupby(by=self.groupby_column).size()) - @trigger_execution def time_groupby_sum(self, data_size): - return self.df.groupby(by=self.groupby_column).sum() + execute(self.df.groupby(by=self.groupby_column).sum()) - @trigger_execution def time_groupby_mean(self, data_size): - return self.df.groupby(by=self.groupby_column).mean() + execute(self.df.groupby(by=self.groupby_column).mean()) class TimeJoin: @@ -123,10 +115,11 @@ def setup(self, data_size, how, sort): ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH ) - @trigger_execution def time_join(self, data_size, how, sort): - return self.df1.join( - self.df2, on=self.df1.columns[0], how=how, lsuffix="left_", sort=sort + execute( + self.df1.join( + self.df2, on=self.df1.columns[0], how=how, lsuffix="left_", sort=sort + ) ) @@ -146,9 +139,8 @@ def setup(self, data_size, how, sort): ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH ) - @trigger_execution def time_merge(self, data_size, how, sort): - return self.df1.merge(self.df2, on=self.df1.columns[0], how=how, sort=sort) + execute(self.df1.merge(self.df2, on=self.df1.columns[0], how=how, sort=sort)) class TimeConcat: @@ -168,12 +160,11 @@ def setup(self, data_size, how, axis): ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH ) - @trigger_execution def time_concat(self, data_size, how, axis): if ASV_USE_IMPL == "modin": - return pd.concat([self.df1, self.df2], axis=axis, join=how) + execute(pd.concat([self.df1, self.df2], axis=axis, join=how)) elif ASV_USE_IMPL == "pandas": - return pandas.concat([self.df1, self.df2], axis=axis, join=how) + execute(pandas.concat([self.df1, self.df2], axis=axis, join=how)) else: raise NotImplementedError @@ -196,9 +187,8 @@ def setup(self, data_size, binary_op, axis): ) self.op = getattr(self.df1, binary_op) - @trigger_execution def time_binary_op(self, data_size, binary_op, axis): - return self.op(self.df2, axis=axis) + execute(self.op(self.df2, axis=axis)) class BaseTimeSetItem: @@ -243,15 +233,13 @@ class TimeSetItem(BaseTimeSetItem): [True, False], ] - @trigger_execution def time_setitem_qc(self, *args, **kwargs): self.df[self.loc] = self.item - return self.df + execute(self.df) - @trigger_execution def time_setitem_raw(self, *args, **kwargs): self.df[self.loc] = self.item_raw - return self.df + execute(self.df) class TimeInsert(BaseTimeSetItem): @@ -262,15 +250,13 @@ class TimeInsert(BaseTimeSetItem): [True, False], ] - @trigger_execution def time_insert_qc(self, *args, **kwargs): self.df.insert(loc=self.iloc, column=random_string(), value=self.item) - return self.df + execute(self.df) - @trigger_execution def time_insert_raw(self, *args, **kwargs): self.df.insert(loc=self.iloc, column=random_string(), value=self.item_raw) - return self.df + execute(self.df) class TimeArithmetic: @@ -285,22 +271,17 @@ def setup(self, data_size, axis): ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH ) - @trigger_execution def time_sum(self, data_size, axis): - return self.df.sum(axis=axis) + execute(self.df.sum(axis=axis)) - @trigger_execution def time_median(self, data_size, axis): - return self.df.median(axis=axis) + execute(self.df.median(axis=axis)) - @trigger_execution def time_nunique(self, data_size, axis): - return self.df.nunique(axis=axis) + execute(self.df.nunique(axis=axis)) - @trigger_execution def time_apply(self, data_size, axis): - return self.df.apply(lambda df: df.sum(), axis=axis) + execute(self.df.apply(lambda df: df.sum(), axis=axis)) - @trigger_execution def time_mean(self, data_size, axis): - return self.df.mean(axis=axis) + execute(self.df.mean(axis=axis))