Make Travis check formatting with Black #215

Merged (2 commits) on Jun 23, 2019
15 changes: 15 additions & 0 deletions CONTRIBUTING.md
@@ -35,3 +35,18 @@ are fixing a new issue feel free to file an issue and then reference it in the PR
You can [browse open issues](https://github.com/lmcinnes/umap/issues),
or consult the [project roadmap](https://github.com/lmcinnes/umap/issues/15), for potential code
contributions. Fixes for issues tagged with 'help wanted' are especially appreciated.

### Code formatting

If possible, install the [black code formatter](https://github.com/python/black) (e.g.
`pip install black`) and run it before submitting a pull request. This helps maintain consistency
across the code base. There is also a check in the Travis-CI continuous integration system that
will show up as a failure on the pull request if `black` detects that it has not been run.

Formatting is as simple as running:

```bash
black .
```

in the root of the project.
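
As a note on the section added above: to see what the CI check will report without modifying any files, `black` can also be run in check-only mode. A minimal sketch, assuming it is run from the project root (`--check` and `--diff` are standard `black` options, not something this PR adds):

```bash
# Exit status 0 means everything is already formatted; 1 means black would make changes.
black --check .

# Show the reformatting black would apply, without writing any files.
black --diff .
```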
1 change: 1 addition & 0 deletions ci_scripts/install.sh
@@ -31,6 +31,7 @@ if [[ "$DISTRIB" == "conda" ]]; then

source activate testenv

pip install black

if [[ "$COVERAGE" == "true" ]]; then
pip install coverage coveralls
4 changes: 4 additions & 0 deletions ci_scripts/test.sh
@@ -1,5 +1,9 @@
set -e

if [[ "$COVERAGE" == "true" ]]; then
black --check $MODULE
fi

# Get into a temp directory to run test from the installed scikit learn and
# check if we do not leave artifacts
mkdir -p $TEST_DIR
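
A note on the check added to `ci_scripts/test.sh` above: because the script begins with `set -e`, a non-zero exit status from `black --check` stops the script immediately and the Travis job fails. Running the check only when `$COVERAGE == "true"` presumably limits it to a single entry of the build matrix rather than repeating it for every configuration. A minimal sketch of the failure path, assuming the package directory is `umap` (in CI the directory comes from the `$MODULE` variable, presumably set elsewhere in the Travis configuration):

```bash
set -e

# black --check exits with status 1 when any file would be reformatted;
# under `set -e` that non-zero status terminates the script, failing the CI job.
black --check umap
```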
3 changes: 2 additions & 1 deletion umap/__init__.py
@@ -2,7 +2,8 @@

# Workaround: https://github.com/numba/numba/issues/3341
import numba
numba.config.THREADING_LAYER = 'workqueue'

numba.config.THREADING_LAYER = "workqueue"

import pkg_resources

10 changes: 5 additions & 5 deletions umap/rp_tree.py
@@ -79,7 +79,7 @@ def angular_random_projection_split(data, indices, rng_state):

for d in range(dim):
hyperplane_vector[d] = (data[left, d] / left_norm) - (
data[right, d] / right_norm
data[right, d] / right_norm
)

hyperplane_norm = norm(hyperplane_vector)
@@ -175,7 +175,7 @@ def euclidean_random_projection_split(data, indices, rng_state):
for d in range(dim):
hyperplane_vector[d] = data[left, d] - data[right, d]
hyperplane_offset -= (
hyperplane_vector[d] * (data[left, d] + data[right, d]) / 2.0
hyperplane_vector[d] * (data[left, d] + data[right, d]) / 2.0
)

# For each point compute the margin (project into normal vector, add offset)
@@ -606,7 +606,7 @@ def max_sparse_hyperplane_size(tree):


def recursive_flatten(
tree, hyperplanes, offsets, children, indices, node_num, leaf_num
tree, hyperplanes, offsets, children, indices, node_num, leaf_num
):
if tree.is_leaf:
children[node_num, 0] = -leaf_num
@@ -630,7 +630,7 @@ def recursive_flatten(
indices,
node_num + 1,
leaf_num,
)
)
children[old_node_num, 1] = node_num + 1
node_num, leaf_num = recursive_flatten(
tree.right_child,
Expand All @@ -640,7 +640,7 @@ def recursive_flatten(
indices,
node_num + 1,
leaf_num,
)
)
return node_num, leaf_num


5 changes: 1 addition & 4 deletions umap/spectral.py
@@ -266,10 +266,7 @@ def spectral_layout(data, graph, dim, random_state, metric="euclidean", metric_k
)
else:
eigenvalues, eigenvectors = scipy.sparse.linalg.lobpcg(
L,
random_state.normal(size=(L.shape[0], k)),
largest=False,
tol=1e-8
L, random_state.normal(size=(L.shape[0], k)), largest=False, tol=1e-8
)
order = np.argsort(eigenvalues)[1:k]
return eigenvectors[:, order]
86 changes: 29 additions & 57 deletions umap/tests/test_umap.py
@@ -46,22 +46,23 @@
import os.path
from nose.tools import assert_greater_equal
from nose.tools import assert_less

"""
Tests for UMAP to ensure things are working as expected.
"""
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

warnings.filterwarnings("ignore", category=UserWarning)


np.random.seed(42)
spatial_data = np.random.randn(10, 20)
spatial_data = np.vstack(
[spatial_data, np.zeros((2, 20))]
) # Add some all zero data for corner case test
binary_data = np.random.choice(
a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66])
binary_data = np.random.choice(a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66])
binary_data = np.vstack(
[binary_data, np.zeros((2, 20), dtype='bool')]
[binary_data, np.zeros((2, 20), dtype="bool")]
) # Add some all zero data for corner case test
sparse_spatial_data = sparse.csr_matrix(spatial_data * binary_data)
sparse_binary_data = sparse.csr_matrix(binary_data)
@@ -70,16 +71,14 @@
nn_data = np.vstack(
[nn_data, np.zeros((2, 5))]
) # Add some all zero data for corner case test
binary_nn_data = np.random.choice(
a=[False, True], size=(1000, 5), p=[0.66, 1 - 0.66])
binary_nn_data = np.random.choice(a=[False, True], size=(1000, 5), p=[0.66, 1 - 0.66])
binary_nn_data = np.vstack(
[binary_nn_data, np.zeros((2, 5), dtype='bool')]
[binary_nn_data, np.zeros((2, 5), dtype="bool")]
) # Add some all zero data for corner case test
sparse_nn_data = sparse.csr_matrix(nn_data * binary_nn_data)

iris = datasets.load_iris()
iris_selection = np.random.choice(
[True, False], 150, replace=True, p=[0.75, 0.25])
iris_selection = np.random.choice([True, False], 150, replace=True, p=[0.75, 0.25])


def spatial_check(metric):
@@ -137,9 +136,7 @@ def binary_check(metric):

def sparse_spatial_check(metric):
if metric in spdist.sparse_named_distances:
dist_matrix = pairwise_distances(
sparse_spatial_data.todense(), metric=metric
)
dist_matrix = pairwise_distances(sparse_spatial_data.todense(), metric=metric)
if metric in ("braycurtis", "dice", "sokalsneath", "yule"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric in ("cosine", "correlation", "kulsinski", "russellrao"):
@@ -190,9 +187,7 @@ def sparse_spatial_check(metric):

def sparse_binary_check(metric):
if metric in spdist.sparse_named_distances:
dist_matrix = pairwise_distances(
sparse_binary_data.todense(), metric=metric
)
dist_matrix = pairwise_distances(sparse_binary_data.todense(), metric=metric)
if metric in ("jaccard", "dice", "sokalsneath", "yule"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric in ("kulsinski", "russellrao"):
@@ -294,8 +289,7 @@ def test_sparse_nn_descent_neighbor_accuracy():
)

tree = KDTree(sparse_nn_data.todense())
true_indices = tree.query(sparse_nn_data.todense(),
10, return_distance=False)
true_indices = tree.query(sparse_nn_data.todense(), 10, return_distance=False)

num_correct = 0.0
for i in range(nn_data.shape[0]):
@@ -420,8 +414,7 @@ def test_nn_search():
False,
)

search_graph = sparse.lil_matrix(
(train.shape[0], train.shape[0]), dtype=np.int8)
search_graph = sparse.lil_matrix((train.shape[0], train.shape[0]), dtype=np.int8)
search_graph.rows = knn_indices
search_graph.data = (knn_dists != 0).astype(np.int8)
search_graph = search_graph.maximum(search_graph.transpose()).tocsr()
@@ -433,8 +426,7 @@ def test_nn_search():
init = initialise_search(
rp_forest, train, test, int(10 * 3), random_init, tree_init, rng_state
)
result = search(train, search_graph.indptr,
search_graph.indices, init, test)
result = search(train, search_graph.indptr, search_graph.indices, init, test)

indices, dists = deheap_sort(result)
indices = indices[:, :10]
@@ -596,8 +588,7 @@ def test_seuclidean():
test_matrix = np.array(
[
[
dist.standardised_euclidean(
spatial_data[i], spatial_data[j], v)
dist.standardised_euclidean(spatial_data[i], spatial_data[j], v)
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
@@ -612,13 +603,11 @@

def test_weighted_minkowski():
v = np.abs(np.random.randn(spatial_data.shape[1]))
dist_matrix = pairwise_distances(
spatial_data, metric="wminkowski", w=v, p=3)
dist_matrix = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
test_matrix = np.array(
[
[
dist.weighted_minkowski(
spatial_data[i], spatial_data[j], v, p=3)
dist.weighted_minkowski(spatial_data[i], spatial_data[j], v, p=3)
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
@@ -683,14 +672,12 @@ def test_umap_sparse_trustworthiness():

def test_umap_trustworthiness_on_iris():
data = iris.data
embedding = UMAP(n_neighbors=10, min_dist=0.01,
random_state=42).fit_transform(data)
embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(data)
trust = trustworthiness(iris.data, embedding, 10)
assert_greater_equal(
trust,
0.97,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -703,8 +690,7 @@ def test_umap_trustworthiness_on_iris_random_init():
assert_greater_equal(
trust,
0.95,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -717,8 +703,7 @@ def test_supervised_umap_trustworthiness_on_iris():
assert_greater_equal(
trust,
0.97,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -733,8 +718,7 @@ def test_semisupervised_umap_trustworthiness_on_iris():
assert_greater_equal(
trust,
0.97,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -747,8 +731,7 @@ def test_initialized_umap_trustworthiness_on_iris():
assert_greater_equal(
trust,
0.97,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -763,8 +746,7 @@ def test_umap_transform_on_iris():
assert_greater_equal(
trust,
0.89,
"Insufficiently trustworthy transform for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy transform for" "iris dataset: {}".format(trust),
)


@@ -783,26 +765,23 @@ def test_umap_transform_on_iris():
def test_blobs_cluster():
data, labels = datasets.make_blobs(n_samples=500, n_features=10, centers=5)
embedding = UMAP().fit_transform(data)
assert_equal(adjusted_rand_score(
labels, KMeans(5).fit_predict(embedding)), 1.0)
assert_equal(adjusted_rand_score(labels, KMeans(5).fit_predict(embedding)), 1.0)


def test_multi_component_layout():
data, labels = datasets.make_blobs(
100, 2, centers=5, cluster_std=0.5, center_box=[-20, 20], random_state=42
)

true_centroids = np.empty(
(labels.max() + 1, data.shape[1]), dtype=np.float64)
true_centroids = np.empty((labels.max() + 1, data.shape[1]), dtype=np.float64)

for label in range(labels.max() + 1):
true_centroids[label] = data[labels == label].mean(axis=0)

true_centroids = normalize(true_centroids, norm="l2")

embedding = UMAP(n_neighbors=4).fit_transform(data)
embed_centroids = np.empty(
(labels.max() + 1, data.shape[1]), dtype=np.float64)
embed_centroids = np.empty((labels.max() + 1, data.shape[1]), dtype=np.float64)
embed_labels = KMeans(n_clusters=5).fit_predict(embedding)

for label in range(embed_labels.max() + 1):
@@ -830,7 +809,7 @@ def test_bad_too_large_min_dist():
# a RuntimeWarning about division by zero in a,b curve fitting is expected
# caught and ignored for this test
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
assert_raises(ValueError, u.fit, nn_data)


@@ -905,14 +884,7 @@ def test_negative_target_nneighbors():


def test_umap_bad_nn():
assert_raises(ValueError,
nearest_neighbors,
nn_data,
10,
42,
{},
False,
np.random)
assert_raises(ValueError, nearest_neighbors, nn_data, 10, 42, {}, False, np.random)


def test_umap_bad_nn_sparse():
@@ -930,7 +902,7 @@ def test_umap_bad_nn_sparse():

def test_too_many_neighbors_warns():
u = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
u.fit(nn_data[:100, ])
u.fit(nn_data[:100,])
assert_equal(u._a, 1.2)
assert_equal(u._b, 1.75)
