[REVIEW] Speeding up MNMG KNN Cl&Re testing (#3052)

* Speeding up MNMG KNN Cl&Re testing
* Update changelog
* Testing with extreme values
rapidsai · Nov 2, 2020 · 5b7757a
1 parent 983c6f8
commit 5b7757a
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 20 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@
 - PR #3044: Move leftover `linalg` and `stats` to RAFT namespaces
 - PR #3067: Deleting prims moved to RAFT and updating header paths
 - PR #3074: Reducing dask coordinate descent test runtime
+- PR #3052: Speeding up MNMG KNN Cl&Re testing
 
 ## Bug Fixes
 - PR #3033: Splitting ml metrics to individual files

diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
@@ -471,7 +471,8 @@ void class_probs(std::vector<float *> &out, const int64_t *knn_indices,
      * Build array of class probability arrays from
      * knn_indices and labels
      */
-    device_buffer<int> y_normalized(allocator, stream, n_index_rows);
+    device_buffer<int> y_normalized(allocator, stream,
+                                    n_index_rows + n_unique_labels);
 
     /*
      * Appending the array of unique labels to the original labels array

diff --git a/python/cuml/test/dask/test_kneighbors_classifier.py b/python/cuml/test/dask/test_kneighbors_classifier.py
@@ -69,11 +69,11 @@ def dataset(request):
         if len(new_x) >= request.param['n_samples']:
             break
     X = X[new_x]
-    noise = np.random.normal(0, 1.2, X.shape)
+    noise = np.random.normal(0, 5., X.shape)
     X += noise
     y = np.array(new_y)
 
-    return train_test_split(X, y, test_size=0.1)
+    return train_test_split(X, y, test_size=0.3)
 
 
 def exact_match(output1, output2):
@@ -108,11 +108,11 @@ def check_probabilities(l_probas, d_probas):
 
 
 @pytest.mark.parametrize("datatype", ['dask_array', 'dask_cudf'])
-@pytest.mark.parametrize("n_neighbors", [1, 3, 8])
-@pytest.mark.parametrize("n_parts", [2, 4, 12])
-@pytest.mark.parametrize("batch_size", [128, 1024])
-def test_predict_and_score(dataset, datatype, n_neighbors,
-                           n_parts, batch_size, client):
+@pytest.mark.parametrize("parameters", [(1, 3, 256),
+                                        (8, 8, 256),
+                                        (9, 3, 128)])
+def test_predict_and_score(dataset, datatype, parameters, client):
+    n_neighbors, n_parts, batch_size = parameters
     X_train, X_test, y_train, y_test = dataset
     np_y_test = y_test
 
@@ -165,11 +165,11 @@ def test_predict_and_score(dataset, datatype, n_neighbors,
 
 
 @pytest.mark.parametrize("datatype", ['dask_array', 'dask_cudf'])
-@pytest.mark.parametrize("n_neighbors", [1, 3, 8])
-@pytest.mark.parametrize("n_parts", [2, 4, 12])
-@pytest.mark.parametrize("batch_size", [128, 1024])
-def test_predict_proba(dataset, datatype, n_neighbors,
-                       n_parts, batch_size, client):
+@pytest.mark.parametrize("parameters", [(1, 3, 256),
+                                        (8, 8, 256),
+                                        (9, 3, 128)])
+def test_predict_proba(dataset, datatype, parameters, client):
+    n_neighbors, n_parts, batch_size = parameters
     X_train, X_test, y_train, y_test = dataset
 
     l_model = lKNNClf(n_neighbors=n_neighbors)

diff --git a/python/cuml/test/dask/test_kneighbors_regressor.py b/python/cuml/test/dask/test_kneighbors_regressor.py
@@ -70,11 +70,11 @@ def dataset(request):
         if len(new_x) >= request.param['n_samples']:
             break
     X = X[new_x]
-    noise = np.random.normal(0, 1.2, X.shape)
+    noise = np.random.normal(0, 5., X.shape)
     X += noise
     y = np.array(new_y, dtype=np.float32)
 
-    return train_test_split(X, y, test_size=0.1)
+    return train_test_split(X, y, test_size=0.3)
 
 
 def exact_match(output1, output2):
@@ -102,11 +102,11 @@ def exact_match(output1, output2):
 
 
 @pytest.mark.parametrize("datatype", ['dask_array', 'dask_cudf'])
-@pytest.mark.parametrize("n_neighbors", [1, 3, 8])
-@pytest.mark.parametrize("n_parts", [2, 4, 12])
-@pytest.mark.parametrize("batch_size", [128, 1024])
-def test_predict_and_score(dataset, datatype, n_neighbors,
-                           n_parts, batch_size, client):
+@pytest.mark.parametrize("parameters", [(1, 3, 256),
+                                        (8, 8, 256),
+                                        (9, 3, 128)])
+def test_predict_and_score(dataset, datatype, parameters, client):
+    n_neighbors, n_parts, batch_size = parameters
     X_train, X_test, y_train, y_test = dataset
     np_y_test = y_test