Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Util unit tests #261

Merged
merged 8 commits into from
Mar 16, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions tools/cell_census_builder/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import numpy as np
import pytest
from scipy.sparse import coo_matrix, csr_matrix, triu

from tools.cell_census_builder.util import array_chunker, is_positive_integral, uricat


def test_is_positive_integral() -> None:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be pedantic, this method should be called is_nonnegative_integral since 0 is an accepted value. If that looks better I can make the change.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

feel free to improve it!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you do this, please also add a test of empty array, e.g.,

assert not is_nonnegative_integral(np.array([], dtype=np.float32))

and equivalent for sparse.

X = np.array([1, 2, 3, 4])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these result in an array of dtype np.int64. The function only accepts floats (both float32 and float64), although it does not raise an error if the input violates the type signature.

These tests should explicitly create arrays of dtype float32 or float64. Prefer float32 for most tests, as that is what we typically receive.

assert is_positive_integral(X)

X = np.array([-1, 2, 3, 4])
assert not is_positive_integral(X)

X = np.array([1.2, 0, 3, 4])
assert not is_positive_integral(X)

X = np.zeros((3, 4))
assert is_positive_integral(X)

X = csr_matrix([[1, 2, 3], [4, 5, 6]])
assert is_positive_integral(X)

X = csr_matrix([[-1, 2, 3], [4, 5, 6]])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these are also int64 - same issue as with ndarray, they need a float dtype to be specified.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apologies, for some reason I thought scipy.sparse created float matrices by default. Fixing it.

assert not is_positive_integral(X)

X = csr_matrix([[1.2, 0, 3], [4, 5, 6]])
assert not is_positive_integral(X)

X = csr_matrix([0, 0, 0])
assert is_positive_integral(X)


def test_array_chunker() -> None:
# Case 1: dense matrix (np.ndarray)
X = np.ones(1200).reshape(30, 40)
# If nnz_chunk_size is less than the number of cols, the number of cols is used (40 in this example)
chunked = list(array_chunker(X, nnz_chunk_size=10))
assert len(chunked) == 30
for i, s in enumerate(chunked):
assert isinstance(s, coo_matrix)
assert s.nnz == 40
assert s.shape == (30, 40)
# The i-th row of the matrix should have 40 nonzeros (which implies the rest are zeros)
csr = s.tocsr()
assert csr.getrow(i).nnz == 40

# If nnz_chunk_size is less than
chunked = list(array_chunker(X, nnz_chunk_size=600))
assert len(chunked) == 2
for s in chunked:
assert isinstance(s, coo_matrix)
assert s.nnz == 600
assert s.shape == (30, 40)

chunked = list(array_chunker(X, nnz_chunk_size=2400))
assert len(chunked) == 1

# Case 2: compressed row sparse matrix (csr_matrix)
# we'll use an upper triangular matrix with all ones (avg 5 nnz per row)
# [
# [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
# [0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
# [0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
# [0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
# [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
# [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
# [0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
# [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
# [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
# [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
# ]

X = triu(np.ones(100).reshape(10, 10)).tocsr()
# In this case, chunks will be 2 rows x 10 column (since on average each row contains 5 nonzeros)
chunked = list(array_chunker(X, nnz_chunk_size=10))
assert len(chunked) == 5
assert chunked[0].nnz == 19
assert chunked[1].nnz == 15
assert chunked[2].nnz == 11
assert chunked[3].nnz == 7
assert chunked[4].nnz == 3

# Case 3: compressed column sparse matrix (csc_matrix)
# We'll use the same example as for csr, but note that chunking is done by column and not by row.

X = triu(np.ones(100).reshape(10, 10)).tocsc()
# In this case, chunks will be 2 rows x 10 column (since on average each row contains 5 nonzeros)
chunked = list(array_chunker(X, nnz_chunk_size=10))
assert len(chunked) == 5
assert chunked[0].nnz == 3
assert chunked[1].nnz == 7
assert chunked[2].nnz == 11
assert chunked[3].nnz == 15
assert chunked[4].nnz == 19

# Other formats are rejected by the method
X = triu(np.ones(100).reshape(10, 10)).tocoo()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

recommend testing this by passing something other than COO (eg., LIL or BSR), as it is fairly likely we will add COO support the the array_chunker at some point. Just change the final call from tocoo() to tolil() (or anything other than csr/csc/coo)

with pytest.raises(NotImplementedError):
list(array_chunker(X))


def test_uricat() -> None:
bkmartinjr marked this conversation as resolved.
Show resolved Hide resolved
assert uricat("path", "to", "somewhere") == "path/to/somewhere"
assert uricat("path/", "to/", "somewhere") == "path/to/somewhere"
assert uricat("path/", "to/", "somewhere/") == "path/to/somewhere/"
10 changes: 6 additions & 4 deletions tools/cell_census_builder/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import time
import urllib.parse
from typing import Any, Union
from typing import Any, Iterator, Optional, Union

import git
import numpy as np
Expand All @@ -10,11 +10,13 @@
from scipy import sparse


def array_chunker(arr: Union[npt.NDArray[Any], sparse.spmatrix]) -> sparse.coo_matrix:
def array_chunker(
arr: Union[npt.NDArray[Any], sparse.spmatrix],
nnz_chunk_size: Optional[int] = 256 * 1024**2, # goal (~2.4GiB for a 32-bit COO)
) -> Iterator[sparse.coo_matrix]:
"""
Return the array as multiple chunks, each a coo_matrix.
bkmartinjr marked this conversation as resolved.
Show resolved Hide resolved
"""
nnz_chunk_size = 256 * 1024**2 # goal (~2.4GiB for a 32-bit COO)

if isinstance(arr, sparse.csr_matrix) or isinstance(arr, sparse.csr_array):
avg_nnz_per_row = arr.nnz // arr.shape[0]
Expand All @@ -37,7 +39,7 @@ def array_chunker(arr: Union[npt.NDArray[Any], sparse.spmatrix]) -> sparse.coo_m
return

if isinstance(arr, np.ndarray):
row_chunk_size = max(1, nnz_chunk_size // arr.shape[1])
row_chunk_size = max(1, nnz_chunk_size // arr.shape[1]) # type: ignore
for row_idx in range(0, arr.shape[0], row_chunk_size):
slc = sparse.coo_matrix(arr[row_idx : row_idx + row_chunk_size, :])
slc.resize(arr.shape)
Expand Down