chanzuckerberg · ebezzi · Mar 16, 2023 · Mar 15, 2023 · Mar 15, 2023 · Mar 15, 2023
diff --git a/tools/cell_census_builder/tests/test_util.py b/tools/cell_census_builder/tests/test_util.py
@@ -0,0 +1,106 @@
+import numpy as np
+import pytest
+from scipy.sparse import coo_matrix, csr_matrix, triu
+
+from tools.cell_census_builder.util import array_chunker, is_positive_integral, uricat
+
+
+def test_is_positive_integral() -> None:
+    X = np.array([1, 2, 3, 4])
+    assert is_positive_integral(X)
+
+    X = np.array([-1, 2, 3, 4])
+    assert not is_positive_integral(X)
+
+    X = np.array([1.2, 0, 3, 4])
+    assert not is_positive_integral(X)
+
+    X = np.zeros((3, 4))
+    assert is_positive_integral(X)
+
+    X = csr_matrix([[1, 2, 3], [4, 5, 6]])
+    assert is_positive_integral(X)
+
+    X = csr_matrix([[-1, 2, 3], [4, 5, 6]])
+    assert not is_positive_integral(X)
+
+    X = csr_matrix([[1.2, 0, 3], [4, 5, 6]])
+    assert not is_positive_integral(X)
+
+    X = csr_matrix([0, 0, 0])
+    assert is_positive_integral(X)
+
+
+def test_array_chunker() -> None:
+    # Case 1: dense matrix (np.ndarray)
+    X = np.ones(1200).reshape(30, 40)
+    # If nnz_chunk_size is less than the number of cols, the number of cols is used (40 in this example)
+    chunked = list(array_chunker(X, nnz_chunk_size=10))
+    assert len(chunked) == 30
+    for i, s in enumerate(chunked):
+        assert isinstance(s, coo_matrix)
+        assert s.nnz == 40
+        assert s.shape == (30, 40)
+        # The i-th row of the matrix should have 40 nonzeros (which implies the rest are zeros)
+        csr = s.tocsr()
+        assert csr.getrow(i).nnz == 40
+
+    # If nnz_chunk_size is less than
+    chunked = list(array_chunker(X, nnz_chunk_size=600))
+    assert len(chunked) == 2
+    for s in chunked:
+        assert isinstance(s, coo_matrix)
+        assert s.nnz == 600
+        assert s.shape == (30, 40)
+
+    chunked = list(array_chunker(X, nnz_chunk_size=2400))
+    assert len(chunked) == 1
+
+    # Case 2: compressed row sparse matrix (csr_matrix)
+    # we'll use an upper triangular matrix with all ones (avg 5 nnz per row)
+    # [
+    #     [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+    #     [0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+    #     [0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
+    #     [0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
+    #     [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
+    #     [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
+    #     [0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
+    #     [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
+    #     [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
+    #     [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
+    # ]
+
+    X = triu(np.ones(100).reshape(10, 10)).tocsr()
+    # In this case, chunks will be 2 rows x 10 column (since on average each row contains 5 nonzeros)
+    chunked = list(array_chunker(X, nnz_chunk_size=10))
+    assert len(chunked) == 5
+    assert chunked[0].nnz == 19
+    assert chunked[1].nnz == 15
+    assert chunked[2].nnz == 11
+    assert chunked[3].nnz == 7
+    assert chunked[4].nnz == 3
+
+    # Case 3: compressed column sparse matrix (csc_matrix)
+    # We'll use the same example as for csr, but note that chunking is done by column and not by row.
+
+    X = triu(np.ones(100).reshape(10, 10)).tocsc()
+    # In this case, chunks will be 2 rows x 10 column (since on average each row contains 5 nonzeros)
+    chunked = list(array_chunker(X, nnz_chunk_size=10))
+    assert len(chunked) == 5
+    assert chunked[0].nnz == 3
+    assert chunked[1].nnz == 7
+    assert chunked[2].nnz == 11
+    assert chunked[3].nnz == 15
+    assert chunked[4].nnz == 19
+
+    # Other formats are rejected by the method
+    X = triu(np.ones(100).reshape(10, 10)).tocoo()
+    with pytest.raises(NotImplementedError):
+        list(array_chunker(X))
+
+
+def test_uricat() -> None:
+    assert uricat("path", "to", "somewhere") == "path/to/somewhere"
+    assert uricat("path/", "to/", "somewhere") == "path/to/somewhere"
+    assert uricat("path/", "to/", "somewhere/") == "path/to/somewhere/"
diff --git a/tools/cell_census_builder/util.py b/tools/cell_census_builder/util.py
@@ -1,6 +1,6 @@
 import time
 import urllib.parse
-from typing import Any, Union
+from typing import Any, Iterator, Optional, Union
 
 import git
 import numpy as np
@@ -10,11 +10,13 @@
 from scipy import sparse
 
 
-def array_chunker(arr: Union[npt.NDArray[Any], sparse.spmatrix]) -> sparse.coo_matrix:
+def array_chunker(
+    arr: Union[npt.NDArray[Any], sparse.spmatrix],
+    nnz_chunk_size: Optional[int] = 256 * 1024**2,  # goal (~2.4GiB for a 32-bit COO)
+) -> Iterator[sparse.coo_matrix]:
     """
     Return the array as multiple chunks, each a coo_matrix.
     """
-    nnz_chunk_size = 256 * 1024**2  # goal (~2.4GiB for a 32-bit COO)
 
     if isinstance(arr, sparse.csr_matrix) or isinstance(arr, sparse.csr_array):
         avg_nnz_per_row = arr.nnz // arr.shape[0]
@@ -37,7 +39,7 @@ def array_chunker(arr: Union[npt.NDArray[Any], sparse.spmatrix]) -> sparse.coo_m
         return
 
     if isinstance(arr, np.ndarray):
-        row_chunk_size = max(1, nnz_chunk_size // arr.shape[1])
+        row_chunk_size = max(1, nnz_chunk_size // arr.shape[1])  # type: ignore
         for row_idx in range(0, arr.shape[0], row_chunk_size):
             slc = sparse.coo_matrix(arr[row_idx : row_idx + row_chunk_size, :])
             slc.resize(arr.shape)