FEAT-modin-project#2479: add CI job to check asv benchmarks
Signed-off-by: Anatoly Myachev <[email protected]>
anmyachev committed Dec 2, 2020
1 parent f1a3f64 commit a7974de
Showing 4 changed files with 128 additions and 78 deletions.
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
@@ -297,6 +297,62 @@ jobs:
      - shell: bash -l {0}
        run: bash <(curl -s https://codecov.io/bash)

  test-asv-benchmarks:
    needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
    runs-on: ubuntu-latest
    env:
      MODIN_ENGINE: ray
      MODIN_MEMORY: 1000000000
      TestDatasetSize: small
    name: test-asv-benchmarks
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 1
      - name: Cache pip
        uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
      - uses: conda-incubator/setup-miniconda@v2
        with:
          activate-environment: modin
          environment-file: environment.yml
          python-version: 3.7
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      - name: Conda environment
        shell: bash -l {0}
        run: |
          conda info
          conda list
      - name: Running benchmarks
        shell: bash -l {0}
        run: |
          pip install -e .
          cd asv_bench
          asv check -E existing
          git remote add upstream https://github.com/modin-project/modin.git
          git fetch upstream
          if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
            asv machine --yes
            asv dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log
            if grep "failed" benchmarks.log > /dev/null; then
              exit 1
            fi
          else
            echo "Benchmarks did not run: no changes detected"
          fi
        if: always()

      - name: Publish benchmarks artifact
        uses: actions/upload-artifact@master
        with:
          name: Benchmarks log
          path: asv_bench/benchmarks.log
        if: failure()

  test-all:
    needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
    runs-on: ubuntu-latest
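In short, the job runs the benchmarks only when the change touches asv_bench/: it adds the upstream remote, diffs against upstream/master, and skips otherwise. asv check -E existing validates that the benchmark suite is well-formed in the existing environment without running it, asv dev then does a quick single-pass run, the sed filter prefixes lines ending in "failed" with ##[error] so the Actions log highlights them, and the grep turns any such line into a job failure. benchmarks.log is uploaded as an artifact only if the job fails, and TestDatasetSize: small in the job env selects the small data shapes defined in benchmarks.py below.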
148 changes: 70 additions & 78 deletions asv_bench/benchmarks/benchmarks.py
@@ -11,75 +11,88 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.
import modin.pandas as pd
import numpy as np
from modin.config import TestDatasetSize
from .utils import generate_dataframe, RAND_LOW, RAND_HIGH

pd.DEFAULT_NPARTITIONS = 4

if TestDatasetSize.get() == "Big":
    MERGE_DATA_SIZE = [
        (5000, 5000, 5000, 5000),
        (10, 1_000_000, 10, 1_000_000),
        (1_000_000, 10, 1_000_000, 10),
    ]
    GROUPBY_DATA_SIZE = [
        (5000, 5000),
        (10, 1_000_000),
        (1_000_000, 10),
    ]
else:
    MERGE_DATA_SIZE = [
        (2000, 100, 2000, 100),
    ]
    GROUPBY_DATA_SIZE = [
        (2000, 100),
    ]

JOIN_DATA_SIZE = MERGE_DATA_SIZE
ARITHMETIC_DATA_SIZE = GROUPBY_DATA_SIZE


class TimeGroupBy:
param_names = ["rows_cols"]
param_names = ["impl", "data_type", "data_size"]
params = [
[
(100, 1000),
(10000, 1000),
]
["modin", "pandas"],
["int"],
GROUPBY_DATA_SIZE,
]

def setup(self, rows_cols):
rows, cols = rows_cols
# workaround for #2482
columns = [str(x) for x in range(cols)]
self.df = pd.DataFrame(
np.random.randint(0, 100, size=(rows, cols)), columns=columns
def setup(self, impl, data_type, data_size):
self.df = generate_dataframe(
impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
)

    # TODO: add a case that groups by multiple columns
    def time_groupby_sum(self, rows_cols):
        self.df.groupby(by="1").sum()
    def time_groupby_sum(self, impl, data_type, data_size):
        self.df.groupby(by=self.df.columns[0]).sum()

    def time_groupby_mean(self, rows_cols):
        self.df.groupby(by="1").mean()
    def time_groupby_mean(self, impl, data_type, data_size):
        self.df.groupby(by=self.df.columns[0]).mean()

    def time_groupby_count(self, rows_cols):
        self.df.groupby(by="1").count()
    def time_groupby_count(self, impl, data_type, data_size):
        self.df.groupby(by=self.df.columns[0]).count()
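For context on the new setup methods: generate_dataframe comes from the benchmark package's utils module, and judging from the call sites in this diff it builds a rows-by-cols frame of random ints between RAND_LOW and RAND_HIGH, backed by either modin or plain pandas depending on impl. A usage sketch, with those semantics inferred from the call sites rather than from the utils source (run inside the asv_bench/benchmarks package):

    # Sketch based on the setup() calls above; generate_dataframe's exact
    # behavior is inferred from its call sites, not from utils itself.
    from .utils import generate_dataframe, RAND_LOW, RAND_HIGH

    rows, cols = 2000, 100  # the "small" GROUPBY_DATA_SIZE shape
    df_modin = generate_dataframe("modin", "int", rows, cols, RAND_LOW, RAND_HIGH)
    df_pandas = generate_dataframe("pandas", "int", rows, cols, RAND_LOW, RAND_HIGH)
    # Each time_* method then runs the same operation for both impl values,
    # so modin is timed against a pandas baseline combination by combination.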


class TimeJoin:
param_names = ["rows_cols", "how"]
param_names = ["impl", "data_type", "data_size", "how", "sort"]
params = [
[
(100, 1000),
(10000, 1000),
],
["outer", "inner", "left", "right"],
["modin", "pandas"],
["int"],
JOIN_DATA_SIZE,
["left", "right", "outer", "inner"],
[False, True],
]

def setup(self, rows_cols, how):
rows, cols = rows_cols
# workaround for #2482
columns = [str(x) for x in range(cols)]
numpy_data = np.random.randint(0, 100, size=(rows, cols))
self.df_left = pd.DataFrame(numpy_data, columns=columns)
self.df_right = pd.DataFrame(numpy_data, columns=columns)
def setup(self, impl, data_type, data_size, how, sort):
self.df1 = generate_dataframe(
impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
)
self.df2 = generate_dataframe(
impl, data_type, data_size[2], data_size[3], RAND_LOW, RAND_HIGH
)

def time_join(self, rows_cols, how):
self.df_left.join(self.df_right, how=how, lsuffix="left_")
def time_join(self, impl, data_type, data_size, how, sort):
self.df1.join(
self.df2, on=self.df1.columns[0], how=how, lsuffix="left_", sort=sort
)


class TimeMerge:
param_names = ["impl", "data_type", "data_size", "how", "sort"]
params = [
["modin", "pandas"],
["int"],
[
(5000, 5000, 5000, 5000),
(10, 1_000_00, 10, 1_000_00),
(1_000_00, 10, 1_000_00, 10),
],
MERGE_DATA_SIZE,
["left", "right", "outer", "inner"],
[False, True],
]
@@ -97,48 +110,27 @@ def time_merge(self, impl, data_type, data_size, how, sort):


class TimeArithmetic:
param_names = ["rows_cols"]
param_names = ["impl", "data_type", "data_size", "axis"]
params = [
[
(100, 1000),
(10000, 1000),
]
["modin", "pandas"],
["int"],
ARITHMETIC_DATA_SIZE,
[0, 1],
]

def setup(self, rows_cols):
rows, cols = rows_cols
# workaround for #2482
columns = [str(x) for x in range(cols)]
self.df = pd.DataFrame(
np.random.randint(0, 100, size=(rows, cols)), columns=columns
def setup(self, impl, data_type, data_size, axis):
self.df = generate_dataframe(
impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
)

def time_transpose_lazy(self, rows_cols):
self.df.T

def time_transpose(self, rows_cols):
repr(self.df.T)

def time_sum(self, rows_cols):
self.df.sum()

def time_sum_axis_1(self, rows_cols):
self.df.sum(axis=1)

def time_median(self, rows_cols):
self.df.median()

def time_median_axis_1(self, rows_cols):
self.df.median(axis=1)

def time_nunique(self, rows_cols):
self.df.nunique()
def time_sum(self, impl, data_type, data_size, axis):
self.df.sum(axis=axis)

def time_nunique_axis_1(self, rows_cols):
self.df.nunique(axis=1)
def time_median(self, impl, data_type, data_size, axis):
self.df.median(axis=axis)

def time_apply(self, rows_cols):
self.df.apply(lambda df: df.sum())
def time_nunique(self, impl, data_type, data_size, axis):
self.df.nunique(axis=axis)

def time_apply_axis_1(self, rows_cols):
self.df.apply(lambda df: df.sum(), axis=1)
def time_apply(self, impl, data_type, data_size, axis):
self.df.apply(lambda df: df.sum(), axis=axis)
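How asv consumes these classes: for every class it takes the cartesian product of the value lists in params, calls setup once per combination, and times each time_* method with the same arguments. A rough hand-rolled equivalent of that loop, for illustration only (asv itself adds repeats, timing, and result storage):

    # Illustration of asv's parameter expansion; not actual asv internals.
    import itertools

    bench = TimeArithmetic()
    for impl, data_type, data_size, axis in itertools.product(*TimeArithmetic.params):
        bench.setup(impl, data_type, data_size, axis)     # fresh frame per combination
        bench.time_sum(impl, data_type, data_size, axis)  # asv would time this call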
1 change: 1 addition & 0 deletions environment.yml
@@ -32,5 +32,6 @@ dependencies:
  - rpyc==4.1.5
  - cloudpickle==1.4.1
  - boto3
  - asv
  - pip:
    - ray>=1.0.0
1 change: 1 addition & 0 deletions requirements.txt
@@ -26,3 +26,4 @@ msgpack
pandas_gbq
cloudpickle
rpyc==4.1.5
asv
