diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2a877cb1..b54deba4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,3 +35,18 @@ are fixing a new issue feel free to file an issue and then reference it in the P You can [browse open issues](https://github.com/lmcinnes/umap/issues), or consult the [project roadmap](https://github.com/lmcinnes/umap/issues/15), for potential code contributions. Fixes for issues tagged with 'help wanted' are especially appreciated. + +### Code formatting + +If possible, install the [black code formatter](https://github.com/python/black) (e.g. +`pip install black`) and run it before submitting a pull request. This helps maintain consistency +across the code base. There is also a check in the Travis-CI continuous integration system that +will show up as a failure in the pull request if `black` detects that the code has not been formatted. + +Formatting is as simple as running: + +```bash +black . +``` + +in the root of the project. diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index c91365c3..cba68853 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -31,6 +31,7 @@ if [[ "$DISTRIB" == "conda" ]]; then source activate testenv + pip install black if [[ "$COVERAGE" == "true" ]]; then pip install coverage coveralls diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh index e0548499..e131d451 100644 --- a/ci_scripts/test.sh +++ b/ci_scripts/test.sh @@ -1,5 +1,9 @@ set -e +if [[ "$COVERAGE" == "true" ]]; then + black --check $MODULE +fi + # Get into a temp directory to run test from the installed scikit learn and # check if we do not leave artifacts mkdir -p $TEST_DIR diff --git a/umap/__init__.py b/umap/__init__.py index 8c32fe10..c5ac3065 100644 --- a/umap/__init__.py +++ b/umap/__init__.py @@ -2,7 +2,8 @@ # Workaround: https://github.com/numba/numba/issues/3341 import numba -numba.config.THREADING_LAYER = 'workqueue' + +numba.config.THREADING_LAYER = "workqueue" import pkg_resources diff --git a/umap/rp_tree.py 
b/umap/rp_tree.py index 6f14e8df..daaa7d7f 100644 --- a/umap/rp_tree.py +++ b/umap/rp_tree.py @@ -79,7 +79,7 @@ def angular_random_projection_split(data, indices, rng_state): for d in range(dim): hyperplane_vector[d] = (data[left, d] / left_norm) - ( - data[right, d] / right_norm + data[right, d] / right_norm ) hyperplane_norm = norm(hyperplane_vector) @@ -175,7 +175,7 @@ def euclidean_random_projection_split(data, indices, rng_state): for d in range(dim): hyperplane_vector[d] = data[left, d] - data[right, d] hyperplane_offset -= ( - hyperplane_vector[d] * (data[left, d] + data[right, d]) / 2.0 + hyperplane_vector[d] * (data[left, d] + data[right, d]) / 2.0 ) # For each point compute the margin (project into normal vector, add offset) @@ -606,7 +606,7 @@ def max_sparse_hyperplane_size(tree): def recursive_flatten( - tree, hyperplanes, offsets, children, indices, node_num, leaf_num + tree, hyperplanes, offsets, children, indices, node_num, leaf_num ): if tree.is_leaf: children[node_num, 0] = -leaf_num @@ -630,7 +630,7 @@ def recursive_flatten( indices, node_num + 1, leaf_num, - ) + ) children[old_node_num, 1] = node_num + 1 node_num, leaf_num = recursive_flatten( tree.right_child, @@ -640,7 +640,7 @@ def recursive_flatten( indices, node_num + 1, leaf_num, - ) + ) return node_num, leaf_num diff --git a/umap/spectral.py b/umap/spectral.py index c82d68de..153c34d6 100644 --- a/umap/spectral.py +++ b/umap/spectral.py @@ -266,10 +266,7 @@ def spectral_layout(data, graph, dim, random_state, metric="euclidean", metric_k ) else: eigenvalues, eigenvectors = scipy.sparse.linalg.lobpcg( - L, - random_state.normal(size=(L.shape[0], k)), - largest=False, - tol=1e-8 + L, random_state.normal(size=(L.shape[0], k)), largest=False, tol=1e-8 ) order = np.argsort(eigenvalues)[1:k] return eigenvectors[:, order] diff --git a/umap/tests/test_umap.py b/umap/tests/test_umap.py index 296ed765..3e45d960 100644 --- a/umap/tests/test_umap.py +++ b/umap/tests/test_umap.py @@ -46,11 +46,13 @@ 
import os.path from nose.tools import assert_greater_equal from nose.tools import assert_less + """ Tests for UMAP to ensure things are working as expected. """ import warnings -warnings.filterwarnings('ignore', category=UserWarning) + +warnings.filterwarnings("ignore", category=UserWarning) np.random.seed(42) @@ -58,10 +60,9 @@ spatial_data = np.vstack( [spatial_data, np.zeros((2, 20))] ) # Add some all zero data for corner case test -binary_data = np.random.choice( - a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66]) +binary_data = np.random.choice(a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66]) binary_data = np.vstack( - [binary_data, np.zeros((2, 20), dtype='bool')] + [binary_data, np.zeros((2, 20), dtype="bool")] ) # Add some all zero data for corner case test sparse_spatial_data = sparse.csr_matrix(spatial_data * binary_data) sparse_binary_data = sparse.csr_matrix(binary_data) @@ -70,16 +71,14 @@ nn_data = np.vstack( [nn_data, np.zeros((2, 5))] ) # Add some all zero data for corner case test -binary_nn_data = np.random.choice( - a=[False, True], size=(1000, 5), p=[0.66, 1 - 0.66]) +binary_nn_data = np.random.choice(a=[False, True], size=(1000, 5), p=[0.66, 1 - 0.66]) binary_nn_data = np.vstack( - [binary_nn_data, np.zeros((2, 5), dtype='bool')] + [binary_nn_data, np.zeros((2, 5), dtype="bool")] ) # Add some all zero data for corner case test sparse_nn_data = sparse.csr_matrix(nn_data * binary_nn_data) iris = datasets.load_iris() -iris_selection = np.random.choice( - [True, False], 150, replace=True, p=[0.75, 0.25]) +iris_selection = np.random.choice([True, False], 150, replace=True, p=[0.75, 0.25]) def spatial_check(metric): @@ -137,9 +136,7 @@ def binary_check(metric): def sparse_spatial_check(metric): if metric in spdist.sparse_named_distances: - dist_matrix = pairwise_distances( - sparse_spatial_data.todense(), metric=metric - ) + dist_matrix = pairwise_distances(sparse_spatial_data.todense(), metric=metric) if metric in ("braycurtis", "dice", 
"sokalsneath", "yule"): dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0 if metric in ("cosine", "correlation", "kulsinski", "russellrao"): @@ -190,9 +187,7 @@ def sparse_spatial_check(metric): def sparse_binary_check(metric): if metric in spdist.sparse_named_distances: - dist_matrix = pairwise_distances( - sparse_binary_data.todense(), metric=metric - ) + dist_matrix = pairwise_distances(sparse_binary_data.todense(), metric=metric) if metric in ("jaccard", "dice", "sokalsneath", "yule"): dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0 if metric in ("kulsinski", "russellrao"): @@ -294,8 +289,7 @@ def test_sparse_nn_descent_neighbor_accuracy(): ) tree = KDTree(sparse_nn_data.todense()) - true_indices = tree.query(sparse_nn_data.todense(), - 10, return_distance=False) + true_indices = tree.query(sparse_nn_data.todense(), 10, return_distance=False) num_correct = 0.0 for i in range(nn_data.shape[0]): @@ -420,8 +414,7 @@ def test_nn_search(): False, ) - search_graph = sparse.lil_matrix( - (train.shape[0], train.shape[0]), dtype=np.int8) + search_graph = sparse.lil_matrix((train.shape[0], train.shape[0]), dtype=np.int8) search_graph.rows = knn_indices search_graph.data = (knn_dists != 0).astype(np.int8) search_graph = search_graph.maximum(search_graph.transpose()).tocsr() @@ -433,8 +426,7 @@ def test_nn_search(): init = initialise_search( rp_forest, train, test, int(10 * 3), random_init, tree_init, rng_state ) - result = search(train, search_graph.indptr, - search_graph.indices, init, test) + result = search(train, search_graph.indptr, search_graph.indices, init, test) indices, dists = deheap_sort(result) indices = indices[:, :10] @@ -596,8 +588,7 @@ def test_seuclidean(): test_matrix = np.array( [ [ - dist.standardised_euclidean( - spatial_data[i], spatial_data[j], v) + dist.standardised_euclidean(spatial_data[i], spatial_data[j], v) for j in range(spatial_data.shape[0]) ] for i in range(spatial_data.shape[0]) @@ -612,13 +603,11 @@ def test_seuclidean(): 
def test_weighted_minkowski(): v = np.abs(np.random.randn(spatial_data.shape[1])) - dist_matrix = pairwise_distances( - spatial_data, metric="wminkowski", w=v, p=3) + dist_matrix = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3) test_matrix = np.array( [ [ - dist.weighted_minkowski( - spatial_data[i], spatial_data[j], v, p=3) + dist.weighted_minkowski(spatial_data[i], spatial_data[j], v, p=3) for j in range(spatial_data.shape[0]) ] for i in range(spatial_data.shape[0]) @@ -683,14 +672,12 @@ def test_umap_sparse_trustworthiness(): def test_umap_trustworthiness_on_iris(): data = iris.data - embedding = UMAP(n_neighbors=10, min_dist=0.01, - random_state=42).fit_transform(data) + embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(data) trust = trustworthiness(iris.data, embedding, 10) assert_greater_equal( trust, 0.97, - "Insufficiently trustworthy embedding for" "iris dataset: {}".format( - trust), + "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust), ) @@ -703,8 +690,7 @@ def test_umap_trustworthiness_on_iris_random_init(): assert_greater_equal( trust, 0.95, - "Insufficiently trustworthy embedding for" "iris dataset: {}".format( - trust), + "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust), ) @@ -717,8 +703,7 @@ def test_supervised_umap_trustworthiness_on_iris(): assert_greater_equal( trust, 0.97, - "Insufficiently trustworthy embedding for" "iris dataset: {}".format( - trust), + "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust), ) @@ -733,8 +718,7 @@ def test_semisupervised_umap_trustworthiness_on_iris(): assert_greater_equal( trust, 0.97, - "Insufficiently trustworthy embedding for" "iris dataset: {}".format( - trust), + "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust), ) @@ -747,8 +731,7 @@ def test_initialized_umap_trustworthiness_on_iris(): assert_greater_equal( trust, 0.97, - "Insufficiently trustworthy embedding 
for" "iris dataset: {}".format( - trust), + "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust), ) @@ -763,8 +746,7 @@ def test_umap_transform_on_iris(): assert_greater_equal( trust, 0.89, - "Insufficiently trustworthy transform for" "iris dataset: {}".format( - trust), + "Insufficiently trustworthy transform for" "iris dataset: {}".format(trust), ) @@ -783,8 +765,7 @@ def test_umap_transform_on_iris(): def test_blobs_cluster(): data, labels = datasets.make_blobs(n_samples=500, n_features=10, centers=5) embedding = UMAP().fit_transform(data) - assert_equal(adjusted_rand_score( - labels, KMeans(5).fit_predict(embedding)), 1.0) + assert_equal(adjusted_rand_score(labels, KMeans(5).fit_predict(embedding)), 1.0) def test_multi_component_layout(): @@ -792,8 +773,7 @@ def test_multi_component_layout(): 100, 2, centers=5, cluster_std=0.5, center_box=[-20, 20], random_state=42 ) - true_centroids = np.empty( - (labels.max() + 1, data.shape[1]), dtype=np.float64) + true_centroids = np.empty((labels.max() + 1, data.shape[1]), dtype=np.float64) for label in range(labels.max() + 1): true_centroids[label] = data[labels == label].mean(axis=0) @@ -801,8 +781,7 @@ def test_multi_component_layout(): true_centroids = normalize(true_centroids, norm="l2") embedding = UMAP(n_neighbors=4).fit_transform(data) - embed_centroids = np.empty( - (labels.max() + 1, data.shape[1]), dtype=np.float64) + embed_centroids = np.empty((labels.max() + 1, data.shape[1]), dtype=np.float64) embed_labels = KMeans(n_clusters=5).fit_predict(embedding) for label in range(embed_labels.max() + 1): @@ -830,7 +809,7 @@ def test_bad_too_large_min_dist(): # a RuntimeWarning about division by zero in a,b curve fitting is expected # caught and ignored for this test with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=RuntimeWarning) + warnings.filterwarnings("ignore", category=RuntimeWarning) assert_raises(ValueError, u.fit, nn_data) @@ -905,14 +884,7 @@ def 
test_negative_target_nneighbors(): def test_umap_bad_nn(): - assert_raises(ValueError, - nearest_neighbors, - nn_data, - 10, - 42, - {}, - False, - np.random) + assert_raises(ValueError, nearest_neighbors, nn_data, 10, 42, {}, False, np.random) def test_umap_bad_nn_sparse(): @@ -930,7 +902,7 @@ def test_umap_bad_nn_sparse(): def test_too_many_neighbors_warns(): u = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random") - u.fit(nn_data[:100, ]) + u.fit(nn_data[:100,]) assert_equal(u._a, 1.2) assert_equal(u._b, 1.75) diff --git a/umap/umap_.py b/umap/umap_.py index 91c068ea..601f7f34 100644 --- a/umap/umap_.py +++ b/umap/umap_.py @@ -49,7 +49,9 @@ NPY_INFINITY = np.inf -@numba.njit(fastmath=True) # benchmarking `parallel=True` shows it to *decrease* performance +@numba.njit( + fastmath=True +) # benchmarking `parallel=True` shows it to *decrease* performance def smooth_knn_dist(distances, k, n_iter=64, local_connectivity=1.0, bandwidth=1.0): """Compute a continuous version of the distance to the kth nearest neighbor. 
That is, this is similar to knn-distance but allows continuous @@ -109,7 +111,9 @@ def smooth_knn_dist(distances, k, n_iter=64, local_connectivity=1.0, bandwidth=1 if index > 0: rho[i] = non_zero_dists[index - 1] if interpolation > SMOOTH_K_TOLERANCE: - rho[i] += interpolation * (non_zero_dists[index] - non_zero_dists[index - 1]) + rho[i] += interpolation * ( + non_zero_dists[index] - non_zero_dists[index - 1] + ) else: rho[i] = interpolation * non_zero_dists[0] elif non_zero_dists.shape[0] > 0: @@ -125,7 +129,6 @@ def smooth_knn_dist(distances, k, n_iter=64, local_connectivity=1.0, bandwidth=1 else: psum += 1.0 - if np.fabs(psum - target) < SMOOTH_K_TOLERANCE: break @@ -233,7 +236,7 @@ def nearest_neighbors( n_trees = 5 + int(round((X.shape[0]) ** 0.5 / 20.0)) n_iters = max(5, int(round(np.log2(X.shape[0])))) if verbose: - print(ts(), "Building RP forest with", str(n_trees), "trees") + print(ts(), "Building RP forest with", str(n_trees), "trees") rp_forest = make_forest(X, n_neighbors, n_trees, rng_state, angular) leaf_array = rptree_leaf_array(rp_forest) @@ -1284,7 +1287,6 @@ def __init__( self.a = a self.b = b - def _validate_parameters(self): if self.set_op_mix_ratio < 0.0 or self.set_op_mix_ratio > 1.0: raise ValueError("set_op_mix_ratio must be between 0.0 and 1.0") @@ -1320,8 +1322,7 @@ def _validate_parameters(self): if self.n_epochs is not None and ( self.n_epochs <= 10 or not isinstance(self.n_epochs, int) ): - raise ValueError("n_epochs must be a positive integer " - "larger than 10") + raise ValueError("n_epochs must be a positive integer " "larger than 10") def fit(self, X, y=None): """Fit X into an embedded space. 
@@ -1378,7 +1379,9 @@ def fit(self, X, y=None): # Error check n_neighbors based on data size if X.shape[0] <= self.n_neighbors: if X.shape[0] == 1: - self.embedding_ = np.zeros((1, self.n_components)) # needed to sklearn comparability + self.embedding_ = np.zeros( + (1, self.n_components) + ) # needed to sklearn comparability return self warn( @@ -1458,14 +1461,16 @@ def fit(self, X, y=None): self._distance_func = self.metric elif self.metric in dist.named_distances: self._distance_func = dist.named_distances[self.metric] - elif self.metric == 'precomputed': - warn('Using precomputed metric; transform will be unavailable for new data') + elif self.metric == "precomputed": + warn( + "Using precomputed metric; transform will be unavailable for new data" + ) else: raise ValueError( "Metric is neither callable, " + "nor a recognised string" ) - if self.metric != 'precomputed': + if self.metric != "precomputed": self._dist_args = tuple(self._metric_kwds.values()) self._random_init, self._tree_init = make_initialisations( @@ -1499,9 +1504,11 @@ def fit(self, X, y=None): # Handle the small case as precomputed as before if y.shape[0] < 4096: - ydmat = pairwise_distances(y_[np.newaxis, :].T, - metric=self.target_metric, - **self._target_metric_kwds) + ydmat = pairwise_distances( + y_[np.newaxis, :].T, + metric=self.target_metric, + **self._target_metric_kwds + ) target_graph = fuzzy_simplicial_set( ydmat, target_n_neighbors, @@ -1513,7 +1520,7 @@ def fit(self, X, y=None): False, 1.0, 1.0, - False + False, ) else: # Standard case @@ -1612,8 +1619,10 @@ def transform(self, X): """ # If we fit just a single instance then error if self.embedding_.shape[0] == 1: - raise ValueError('Transform unavailable when model was fit with' - 'only a single data sample.') + raise ValueError( + "Transform unavailable when model was fit with" + "only a single data sample." 
+ ) # If we just have the original input then short circuit things X = check_array(X, dtype=np.float32, accept_sparse="csr") x_hash = joblib.hash(X) @@ -1622,9 +1631,10 @@ def transform(self, X): if self._sparse_data: raise ValueError("Transform not available for sparse input.") - elif self.metric == 'precomputed': - raise ValueError("Transform of new data not available for " - "precomputed metric.") + elif self.metric == "precomputed": + raise ValueError( + "Transform of new data not available for " "precomputed metric." + ) X = check_array(X, dtype=np.float32, order="C") random_state = check_random_state(self.transform_seed) @@ -1634,13 +1644,11 @@ def transform(self, X): dmat = pairwise_distances( X, self._raw_data, metric=self.metric, **self._metric_kwds ) - indices = np.argpartition(dmat, - self._n_neighbors)[:, :self._n_neighbors] + indices = np.argpartition(dmat, self._n_neighbors)[:, : self._n_neighbors] dmat_shortened = submatrix(dmat, indices, self._n_neighbors) indices_sorted = np.argsort(dmat_shortened) indices = submatrix(indices, indices_sorted, self._n_neighbors) - dists = submatrix(dmat_shortened, indices_sorted, - self._n_neighbors) + dists = submatrix(dmat_shortened, indices_sorted, self._n_neighbors) else: init = initialise_search( self._rp_forest, diff --git a/umap/utils.py b/umap/utils.py index 4e8bc4bd..48c7d91e 100644 --- a/umap/utils.py +++ b/umap/utils.py @@ -21,14 +21,14 @@ def tau_rand_int(state): ------- A (pseudo)-random int32 value """ - state[0] = (((state[0] & 4294967294) << 12) & 0xffffffff) ^ ( - (((state[0] << 13) & 0xffffffff) ^ state[0]) >> 19 + state[0] = (((state[0] & 4294967294) << 12) & 0xFFFFFFFF) ^ ( + (((state[0] << 13) & 0xFFFFFFFF) ^ state[0]) >> 19 ) - state[1] = (((state[1] & 4294967288) << 4) & 0xffffffff) ^ ( - (((state[1] << 2) & 0xffffffff) ^ state[1]) >> 25 + state[1] = (((state[1] & 4294967288) << 4) & 0xFFFFFFFF) ^ ( + (((state[1] << 2) & 0xFFFFFFFF) ^ state[1]) >> 25 ) - state[2] = (((state[2] & 4294967280) << 
17) & 0xffffffff) ^ ( - (((state[2] << 3) & 0xffffffff) ^ state[2]) >> 11 + state[2] = (((state[2] & 4294967280) << 17) & 0xFFFFFFFF) ^ ( + (((state[2] << 3) & 0xFFFFFFFF) ^ state[2]) >> 11 ) return state[0] ^ state[1] ^ state[2] @@ -48,7 +48,7 @@ def tau_rand(state): A (pseudo)-random float32 in the interval [0, 1] """ integer = tau_rand_int(state) - return abs(float(integer) / 0x7fffffff) + return abs(float(integer) / 0x7FFFFFFF) @numba.njit()