Make Travis check formatting with Black #215

Merged (2 commits) on Jun 23, 2019
15 changes: 15 additions & 0 deletions CONTRIBUTING.md
@@ -35,3 +35,18 @@ are fixing a new issue feel free to file an issue and then reference it in the PR
You can [browse open issues](https://github.com/lmcinnes/umap/issues),
or consult the [project roadmap](https://github.com/lmcinnes/umap/issues/15), for potential code
contributions. Fixes for issues tagged with 'help wanted' are especially appreciated.

### Code formatting

If possible, install the [black code formatter](https://github.com/python/black) (e.g.
`pip install black`) and run it before submitting a pull request. This helps maintain consistency
across the code base. There is also a check in the Travis-CI continuous integration system that
will show up as a failure on the pull request if `black` detects that it has not been run.

Formatting is as simple as running:

```bash
black .
```

in the root of the project.
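
As a note on the section added above: to see what the CI check will report without modifying any files, `black` can also be run in check-only mode. A minimal sketch, assuming it is run from the project root (`--check` and `--diff` are standard `black` options, not something this PR adds):

```bash
# Exit status 0 means everything is already formatted; 1 means black would make changes.
black --check .

# Show the reformatting black would apply, without writing any files.
black --diff .
```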
1 change: 1 addition & 0 deletions ci_scripts/install.sh
@@ -31,6 +31,7 @@ if [[ "$DISTRIB" == "conda" ]]; then

source activate testenv

pip install black

if [[ "$COVERAGE" == "true" ]]; then
pip install coverage coveralls
4 changes: 4 additions & 0 deletions ci_scripts/test.sh
@@ -1,5 +1,9 @@
set -e

if [[ "$COVERAGE" == "true" ]]; then
black --check $MODULE
fi

# Get into a temp directory to run test from the installed scikit learn and
# check if we do not leave artifacts
mkdir -p $TEST_DIR
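
A note on the check added to `ci_scripts/test.sh` above: because the script begins with `set -e`, a non-zero exit status from `black --check` stops the script immediately and the Travis job fails. Running the check only when `$COVERAGE == "true"` presumably limits it to a single entry of the build matrix rather than repeating it for every configuration. A minimal sketch of the failure path, assuming the package directory is `umap` (in CI the directory comes from the `$MODULE` variable, presumably set elsewhere in the Travis configuration):

```bash
set -e

# black --check exits with status 1 when any file would be reformatted;
# under `set -e` that non-zero status terminates the script, failing the CI job.
black --check umap
```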
3 changes: 2 additions & 1 deletion umap/__init__.py
@@ -2,7 +2,8 @@

# Workaround: https://github.com/numba/numba/issues/3341
import numba
numba.config.THREADING_LAYER = 'workqueue'

numba.config.THREADING_LAYER = "workqueue"

import pkg_resources

10 changes: 5 additions & 5 deletions umap/rp_tree.py
@@ -79,7 +79,7 @@ def angular_random_projection_split(data, indices, rng_state):

for d in range(dim):
hyperplane_vector[d] = (data[left, d] / left_norm) - (
data[right, d] / right_norm
data[right, d] / right_norm
)

hyperplane_norm = norm(hyperplane_vector)
@@ -175,7 +175,7 @@ def euclidean_random_projection_split(data, indices, rng_state):
for d in range(dim):
hyperplane_vector[d] = data[left, d] - data[right, d]
hyperplane_offset -= (
hyperplane_vector[d] * (data[left, d] + data[right, d]) / 2.0
hyperplane_vector[d] * (data[left, d] + data[right, d]) / 2.0
)

# For each point compute the margin (project into normal vector, add offset)
@@ -606,7 +606,7 @@ def max_sparse_hyperplane_size(tree):


def recursive_flatten(
tree, hyperplanes, offsets, children, indices, node_num, leaf_num
tree, hyperplanes, offsets, children, indices, node_num, leaf_num
):
if tree.is_leaf:
children[node_num, 0] = -leaf_num
@@ -630,7 +630,7 @@ def recursive_flatten(
indices,
node_num + 1,
leaf_num,
)
)
children[old_node_num, 1] = node_num + 1
node_num, leaf_num = recursive_flatten(
tree.right_child,
Expand All @@ -640,7 +640,7 @@ def recursive_flatten(
indices,
node_num + 1,
leaf_num,
)
)
return node_num, leaf_num


5 changes: 1 addition & 4 deletions umap/spectral.py
@@ -266,10 +266,7 @@ def spectral_layout(data, graph, dim, random_state, metric="euclidean", metric_k
)
else:
eigenvalues, eigenvectors = scipy.sparse.linalg.lobpcg(
L,
random_state.normal(size=(L.shape[0], k)),
largest=False,
tol=1e-8
L, random_state.normal(size=(L.shape[0], k)), largest=False, tol=1e-8
)
order = np.argsort(eigenvalues)[1:k]
return eigenvectors[:, order]
86 changes: 29 additions & 57 deletions umap/tests/test_umap.py
@@ -46,22 +46,23 @@
import os.path
from nose.tools import assert_greater_equal
from nose.tools import assert_less

"""
Tests for UMAP to ensure things are working as expected.
"""
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

warnings.filterwarnings("ignore", category=UserWarning)


np.random.seed(42)
spatial_data = np.random.randn(10, 20)
spatial_data = np.vstack(
[spatial_data, np.zeros((2, 20))]
) # Add some all zero data for corner case test
binary_data = np.random.choice(
a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66])
binary_data = np.random.choice(a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66])
binary_data = np.vstack(
[binary_data, np.zeros((2, 20), dtype='bool')]
[binary_data, np.zeros((2, 20), dtype="bool")]
) # Add some all zero data for corner case test
sparse_spatial_data = sparse.csr_matrix(spatial_data * binary_data)
sparse_binary_data = sparse.csr_matrix(binary_data)
@@ -70,16 +71,14 @@
nn_data = np.vstack(
[nn_data, np.zeros((2, 5))]
) # Add some all zero data for corner case test
binary_nn_data = np.random.choice(
a=[False, True], size=(1000, 5), p=[0.66, 1 - 0.66])
binary_nn_data = np.random.choice(a=[False, True], size=(1000, 5), p=[0.66, 1 - 0.66])
binary_nn_data = np.vstack(
[binary_nn_data, np.zeros((2, 5), dtype='bool')]
[binary_nn_data, np.zeros((2, 5), dtype="bool")]
) # Add some all zero data for corner case test
sparse_nn_data = sparse.csr_matrix(nn_data * binary_nn_data)

iris = datasets.load_iris()
iris_selection = np.random.choice(
[True, False], 150, replace=True, p=[0.75, 0.25])
iris_selection = np.random.choice([True, False], 150, replace=True, p=[0.75, 0.25])


def spatial_check(metric):
@@ -137,9 +136,7 @@ def binary_check(metric):

def sparse_spatial_check(metric):
if metric in spdist.sparse_named_distances:
dist_matrix = pairwise_distances(
sparse_spatial_data.todense(), metric=metric
)
dist_matrix = pairwise_distances(sparse_spatial_data.todense(), metric=metric)
if metric in ("braycurtis", "dice", "sokalsneath", "yule"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric in ("cosine", "correlation", "kulsinski", "russellrao"):
@@ -190,9 +187,7 @@ def sparse_spatial_check(metric):

def sparse_binary_check(metric):
if metric in spdist.sparse_named_distances:
dist_matrix = pairwise_distances(
sparse_binary_data.todense(), metric=metric
)
dist_matrix = pairwise_distances(sparse_binary_data.todense(), metric=metric)
if metric in ("jaccard", "dice", "sokalsneath", "yule"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric in ("kulsinski", "russellrao"):
@@ -294,8 +289,7 @@ def test_sparse_nn_descent_neighbor_accuracy():
)

tree = KDTree(sparse_nn_data.todense())
true_indices = tree.query(sparse_nn_data.todense(),
10, return_distance=False)
true_indices = tree.query(sparse_nn_data.todense(), 10, return_distance=False)

num_correct = 0.0
for i in range(nn_data.shape[0]):
@@ -420,8 +414,7 @@ def test_nn_search():
False,
)

search_graph = sparse.lil_matrix(
(train.shape[0], train.shape[0]), dtype=np.int8)
search_graph = sparse.lil_matrix((train.shape[0], train.shape[0]), dtype=np.int8)
search_graph.rows = knn_indices
search_graph.data = (knn_dists != 0).astype(np.int8)
search_graph = search_graph.maximum(search_graph.transpose()).tocsr()
@@ -433,8 +426,7 @@ def test_nn_search():
init = initialise_search(
rp_forest, train, test, int(10 * 3), random_init, tree_init, rng_state
)
result = search(train, search_graph.indptr,
search_graph.indices, init, test)
result = search(train, search_graph.indptr, search_graph.indices, init, test)

indices, dists = deheap_sort(result)
indices = indices[:, :10]
@@ -596,8 +588,7 @@ def test_seuclidean():
test_matrix = np.array(
[
[
dist.standardised_euclidean(
spatial_data[i], spatial_data[j], v)
dist.standardised_euclidean(spatial_data[i], spatial_data[j], v)
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
@@ -612,13 +603,11 @@

def test_weighted_minkowski():
v = np.abs(np.random.randn(spatial_data.shape[1]))
dist_matrix = pairwise_distances(
spatial_data, metric="wminkowski", w=v, p=3)
dist_matrix = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
test_matrix = np.array(
[
[
dist.weighted_minkowski(
spatial_data[i], spatial_data[j], v, p=3)
dist.weighted_minkowski(spatial_data[i], spatial_data[j], v, p=3)
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
@@ -683,14 +672,12 @@ def test_umap_sparse_trustworthiness():

def test_umap_trustworthiness_on_iris():
data = iris.data
embedding = UMAP(n_neighbors=10, min_dist=0.01,
random_state=42).fit_transform(data)
embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(data)
trust = trustworthiness(iris.data, embedding, 10)
assert_greater_equal(
trust,
0.97,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -703,8 +690,7 @@ def test_umap_trustworthiness_on_iris_random_init():
assert_greater_equal(
trust,
0.95,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -717,8 +703,7 @@ def test_supervised_umap_trustworthiness_on_iris():
assert_greater_equal(
trust,
0.97,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -733,8 +718,7 @@ def test_semisupervised_umap_trustworthiness_on_iris():
assert_greater_equal(
trust,
0.97,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -747,8 +731,7 @@ def test_initialized_umap_trustworthiness_on_iris():
assert_greater_equal(
trust,
0.97,
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust),
)


@@ -763,8 +746,7 @@ def test_umap_transform_on_iris():
assert_greater_equal(
trust,
0.89,
"Insufficiently trustworthy transform for" "iris dataset: {}".format(
trust),
"Insufficiently trustworthy transform for" "iris dataset: {}".format(trust),
)


@@ -783,26 +765,23 @@ def test_umap_transform_on_iris():
def test_blobs_cluster():
data, labels = datasets.make_blobs(n_samples=500, n_features=10, centers=5)
embedding = UMAP().fit_transform(data)
assert_equal(adjusted_rand_score(
labels, KMeans(5).fit_predict(embedding)), 1.0)
assert_equal(adjusted_rand_score(labels, KMeans(5).fit_predict(embedding)), 1.0)


def test_multi_component_layout():
data, labels = datasets.make_blobs(
100, 2, centers=5, cluster_std=0.5, center_box=[-20, 20], random_state=42
)

true_centroids = np.empty(
(labels.max() + 1, data.shape[1]), dtype=np.float64)
true_centroids = np.empty((labels.max() + 1, data.shape[1]), dtype=np.float64)

for label in range(labels.max() + 1):
true_centroids[label] = data[labels == label].mean(axis=0)

true_centroids = normalize(true_centroids, norm="l2")

embedding = UMAP(n_neighbors=4).fit_transform(data)
embed_centroids = np.empty(
(labels.max() + 1, data.shape[1]), dtype=np.float64)
embed_centroids = np.empty((labels.max() + 1, data.shape[1]), dtype=np.float64)
embed_labels = KMeans(n_clusters=5).fit_predict(embedding)

for label in range(embed_labels.max() + 1):
@@ -830,7 +809,7 @@ def test_bad_too_large_min_dist():
# a RuntimeWarning about division by zero in a,b curve fitting is expected
# caught and ignored for this test
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
assert_raises(ValueError, u.fit, nn_data)


@@ -905,14 +884,7 @@ def test_negative_target_nneighbors():


def test_umap_bad_nn():
assert_raises(ValueError,
nearest_neighbors,
nn_data,
10,
42,
{},
False,
np.random)
assert_raises(ValueError, nearest_neighbors, nn_data, 10, 42, {}, False, np.random)


def test_umap_bad_nn_sparse():
@@ -930,7 +902,7 @@ def test_umap_bad_nn_sparse():

def test_too_many_neighbors_warns():
u = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
u.fit(nn_data[:100, ])
u.fit(nn_data[:100,])
assert_equal(u._a, 1.2)
assert_equal(u._b, 1.75)
