FEAT-modin-project#2479: add CI job to check asv benchmarks
Signed-off-by: Anatoly Myachev <[email protected]>
anmyachev committed Dec 2, 2020
1 parent f1a3f64 commit a7974de
Showing 4 changed files with 128 additions and 78 deletions.
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
@@ -297,6 +297,62 @@ jobs:
      - shell: bash -l {0}
        run: bash <(curl -s https://codecov.io/bash)

  test-asv-benchmarks:
    needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
    runs-on: ubuntu-latest
    env:
      MODIN_ENGINE: ray
      MODIN_MEMORY: 1000000000
      TestDatasetSize: small
    name: test-asv-benchmarks
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 1
      - name: Cache pip
        uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
      - uses: conda-incubator/setup-miniconda@v2
        with:
          activate-environment: modin
          environment-file: environment.yml
          python-version: 3.7
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      - name: Conda environment
        shell: bash -l {0}
        run: |
          conda info
          conda list
      - name: Running benchmarks
        shell: bash -l {0}
        run: |
          pip install -e .
          cd asv_bench
          asv check -E existing
          git remote add upstream https://github.com/modin-project/modin.git
          git fetch upstream
          if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
            asv machine --yes
            asv dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log
            if grep "failed" benchmarks.log > /dev/null; then
              exit 1
            fi
          else
            echo "Benchmarks did not run: no changes detected"
          fi
        if: always()

      - name: Publish benchmarks artifact
        uses: actions/upload-artifact@master
        with:
          name: Benchmarks log
          path: asv_bench/benchmarks.log
        if: failure()

  test-all:
    needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
    runs-on: ubuntu-latest
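In short, the job runs the benchmarks only when the change touches asv_bench/: it adds the upstream remote, diffs against upstream/master, and skips otherwise. asv check -E existing validates that the benchmark suite is well-formed in the existing environment without running it, asv dev then does a quick single-pass run, the sed filter prefixes lines ending in "failed" with ##[error] so the Actions log highlights them, and the grep turns any such line into a job failure. benchmarks.log is uploaded as an artifact only if the job fails, and TestDatasetSize: small in the job env selects the small data shapes defined in benchmarks.py below.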
148 changes: 70 additions & 78 deletions asv_bench/benchmarks/benchmarks.py
@@ -11,75 +11,88 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.
import modin.pandas as pd
import numpy as np
from modin.config import TestDatasetSize
from .utils import generate_dataframe, RAND_LOW, RAND_HIGH

pd.DEFAULT_NPARTITIONS = 4

if TestDatasetSize.get() == "Big":
    MERGE_DATA_SIZE = [
        (5000, 5000, 5000, 5000),
        (10, 1_000_000, 10, 1_000_000),
        (1_000_000, 10, 1_000_000, 10),
    ]
    GROUPBY_DATA_SIZE = [
        (5000, 5000),
        (10, 1_000_000),
        (1_000_000, 10),
    ]
else:
    MERGE_DATA_SIZE = [
        (2000, 100, 2000, 100),
    ]
    GROUPBY_DATA_SIZE = [
        (2000, 100),
    ]

JOIN_DATA_SIZE = MERGE_DATA_SIZE
ARITHMETIC_DATA_SIZE = GROUPBY_DATA_SIZE


class TimeGroupBy:
param_names = ["rows_cols"]
param_names = ["impl", "data_type", "data_size"]
params = [
[
(100, 1000),
(10000, 1000),
]
["modin", "pandas"],
["int"],
GROUPBY_DATA_SIZE,
]

def setup(self, rows_cols):
rows, cols = rows_cols
# workaround for #2482
columns = [str(x) for x in range(cols)]
self.df = pd.DataFrame(
np.random.randint(0, 100, size=(rows, cols)), columns=columns
def setup(self, impl, data_type, data_size):
self.df = generate_dataframe(
impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
)

    # TODO: add a case that groups by multiple columns
    def time_groupby_sum(self, rows_cols):
        self.df.groupby(by="1").sum()
    def time_groupby_sum(self, impl, data_type, data_size):
        self.df.groupby(by=self.df.columns[0]).sum()

    def time_groupby_mean(self, rows_cols):
        self.df.groupby(by="1").mean()
    def time_groupby_mean(self, impl, data_type, data_size):
        self.df.groupby(by=self.df.columns[0]).mean()

    def time_groupby_count(self, rows_cols):
        self.df.groupby(by="1").count()
    def time_groupby_count(self, impl, data_type, data_size):
        self.df.groupby(by=self.df.columns[0]).count()
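For context on the new setup methods: generate_dataframe comes from the benchmark package's utils module, and judging from the call sites in this diff it builds a rows-by-cols frame of random ints between RAND_LOW and RAND_HIGH, backed by either modin or plain pandas depending on impl. A usage sketch, with those semantics inferred from the call sites rather than from the utils source (run inside the asv_bench/benchmarks package):

    # Sketch based on the setup() calls above; generate_dataframe's exact
    # behavior is inferred from its call sites, not from utils itself.
    from .utils import generate_dataframe, RAND_LOW, RAND_HIGH

    rows, cols = 2000, 100  # the "small" GROUPBY_DATA_SIZE shape
    df_modin = generate_dataframe("modin", "int", rows, cols, RAND_LOW, RAND_HIGH)
    df_pandas = generate_dataframe("pandas", "int", rows, cols, RAND_LOW, RAND_HIGH)
    # Each time_* method then runs the same operation for both impl values,
    # so modin is timed against a pandas baseline combination by combination.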


class TimeJoin:
param_names = ["rows_cols", "how"]
param_names = ["impl", "data_type", "data_size", "how", "sort"]
params = [
[
(100, 1000),
(10000, 1000),
],
["outer", "inner", "left", "right"],
["modin", "pandas"],
["int"],
JOIN_DATA_SIZE,
["left", "right", "outer", "inner"],
[False, True],
]

def setup(self, rows_cols, how):
rows, cols = rows_cols
# workaround for #2482
columns = [str(x) for x in range(cols)]
numpy_data = np.random.randint(0, 100, size=(rows, cols))
self.df_left = pd.DataFrame(numpy_data, columns=columns)
self.df_right = pd.DataFrame(numpy_data, columns=columns)
def setup(self, impl, data_type, data_size, how, sort):
self.df1 = generate_dataframe(
impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
)
self.df2 = generate_dataframe(
impl, data_type, data_size[2], data_size[3], RAND_LOW, RAND_HIGH
)

def time_join(self, rows_cols, how):
self.df_left.join(self.df_right, how=how, lsuffix="left_")
def time_join(self, impl, data_type, data_size, how, sort):
self.df1.join(
self.df2, on=self.df1.columns[0], how=how, lsuffix="left_", sort=sort
)


class TimeMerge:
param_names = ["impl", "data_type", "data_size", "how", "sort"]
params = [
["modin", "pandas"],
["int"],
[
(5000, 5000, 5000, 5000),
(10, 1_000_00, 10, 1_000_00),
(1_000_00, 10, 1_000_00, 10),
],
MERGE_DATA_SIZE,
["left", "right", "outer", "inner"],
[False, True],
]
@@ -97,48 +110,27 @@ def time_merge(self, impl, data_type, data_size, how, sort):


class TimeArithmetic:
param_names = ["rows_cols"]
param_names = ["impl", "data_type", "data_size", "axis"]
params = [
[
(100, 1000),
(10000, 1000),
]
["modin", "pandas"],
["int"],
ARITHMETIC_DATA_SIZE,
[0, 1],
]

def setup(self, rows_cols):
rows, cols = rows_cols
# workaround for #2482
columns = [str(x) for x in range(cols)]
self.df = pd.DataFrame(
np.random.randint(0, 100, size=(rows, cols)), columns=columns
def setup(self, impl, data_type, data_size, axis):
self.df = generate_dataframe(
impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
)

def time_transpose_lazy(self, rows_cols):
self.df.T

def time_transpose(self, rows_cols):
repr(self.df.T)

def time_sum(self, rows_cols):
self.df.sum()

def time_sum_axis_1(self, rows_cols):
self.df.sum(axis=1)

def time_median(self, rows_cols):
self.df.median()

def time_median_axis_1(self, rows_cols):
self.df.median(axis=1)

def time_nunique(self, rows_cols):
self.df.nunique()
def time_sum(self, impl, data_type, data_size, axis):
self.df.sum(axis=axis)

def time_nunique_axis_1(self, rows_cols):
self.df.nunique(axis=1)
def time_median(self, impl, data_type, data_size, axis):
self.df.median(axis=axis)

def time_apply(self, rows_cols):
self.df.apply(lambda df: df.sum())
def time_nunique(self, impl, data_type, data_size, axis):
self.df.nunique(axis=axis)

def time_apply_axis_1(self, rows_cols):
self.df.apply(lambda df: df.sum(), axis=1)
def time_apply(self, impl, data_type, data_size, axis):
self.df.apply(lambda df: df.sum(), axis=axis)
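How asv consumes these classes: for every class it takes the cartesian product of the value lists in params, calls setup once per combination, and times each time_* method with the same arguments. A rough hand-rolled equivalent of that loop, for illustration only (asv itself adds repeats, timing, and result storage):

    # Illustration of asv's parameter expansion; not actual asv internals.
    import itertools

    bench = TimeArithmetic()
    for impl, data_type, data_size, axis in itertools.product(*TimeArithmetic.params):
        bench.setup(impl, data_type, data_size, axis)     # fresh frame per combination
        bench.time_sum(impl, data_type, data_size, axis)  # asv would time this call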
1 change: 1 addition & 0 deletions environment.yml
@@ -32,5 +32,6 @@ dependencies:
  - rpyc==4.1.5
  - cloudpickle==1.4.1
  - boto3
  - asv
  - pip:
    - ray>=1.0.0
1 change: 1 addition & 0 deletions requirements.txt
@@ -26,3 +26,4 @@ msgpack
pandas_gbq
cloudpickle
rpyc==4.1.5
asv
