"acceptance" tests #305

Merged · 20 commits · Mar 28, 2023
2 changes: 1 addition & 1 deletion .github/workflows/py-unittests.yml
@@ -29,7 +29,7 @@ jobs:
          pip install -e ./api/python/cell_census/
      - name: Test with pytest (API)
        run: |
-         PYTHONPATH=. coverage run --parallel-mode -m pytest ./api/python/cell_census/tests/
+         PYTHONPATH=. coverage run --parallel-mode -m pytest --durations=20 ./api/python/cell_census/tests/
      - uses: actions/upload-artifact@v3
        with:
          name: coverage
5 changes: 3 additions & 2 deletions api/python/cell_census/pyproject.toml
@@ -34,7 +34,7 @@ dependencies= [
    # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to
    # ensure that the assets are readable (tiledbsoma supports backward compatible reading).
    # Make sure this version does not fall behind the builder's tiledbsoma version.
-   "tiledbsoma==1.1.1",
+   "tiledbsoma==1.2.1",
    "typing_extensions",
    "s3fs",
    "scikit-misc",
@@ -65,7 +65,8 @@ plugins = "numpy.typing.mypy_plugin"

[tool.pytest.ini_options]
markers = [
"live_corpus: runs on the live Cell Census data corpus",
"live_corpus: runs on the live Cell Census data corpus and small enough to run in CI",
"expensive: too expensive to run regularly or in CI",
Collaborator: 👍

]

[tool.ruff]
48 changes: 48 additions & 0 deletions api/python/cell_census/tests/README.md
@@ -0,0 +1,48 @@
# Test README

This directory contains tests of the cell-census package API, _and_ of its use against the live
"corpus", i.e., the data in the public cell census S3 bucket. The tests use Pytest, with Pytest
marks to control which tests are run.

Tests can be run in the usual manner. First, ensure you have cell-census installed, e.g., from the top-level repo directory:

> pip install -e ./api/python/cell_census/

Then run the tests:

> pytest ./api/python/cell_census/

## Pytest Marks

There are two Pytest marks you can use from the command line:

- `live_corpus`: tests that directly access the `latest` version of the Cell Census. Enabled by default.
- `expensive`: tests that are expensive in CPU, memory, or time. Disabled by default; enable with `--expensive`. Some of these tests are _very_ expensive, i.e., they require a very-large-memory host to succeed. (A sketch of how these marks are applied appears at the end of this section.)

By default, only relatively cheap & fast tests are run. To enable `expensive` tests:

> pytest --expensive ...

To disable `live_corpus` tests:

> pytest -m 'not live_corpus'

You can also combine them, e.g.,

> pytest -m 'not live_corpus' --expensive
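
For reference, a minimal sketch of how these marks are applied in test code (the test name below is hypothetical; see `test_acceptance.py` for the real uses):

```python
import pytest


@pytest.mark.live_corpus  # reads the live `latest` Census; deselect with -m 'not live_corpus'
@pytest.mark.expensive  # skipped unless pytest is invoked with --expensive (see conftest.py)
def test_something_expensive() -> None:
    ...
```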

# Acceptance (expensive) tests

These tests are run periodically; they are not part of CI because of their cost.

When run, please record the results below and commit to git:

- date
- host / instance type
- Python & package versions and OS (tip: use tiledbsoma.show_package_versions())
- the Cell Census version used for the test (i.e., the version aliased as `latest`)
- full output of: `pytest --durations=0 --expensive ./api/python/cell_census/tests/`
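
For example, a minimal sketch for collecting the version information above (assumes network access to the public Census bucket; exactly what you paste into the results log below is up to you):

```python
import cell_census
import tiledbsoma

# Python, OS, and package versions
tiledbsoma.show_package_versions()

# The concrete Census build that the `latest` alias currently resolves to
print(cell_census.get_census_version_description("latest"))
```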
Collaborator: also record the cell census release, unless that shows up in the output? The motivation is to know what data size the tests last passed for. Maybe each test could output relevant sizes on full reads, like obs, or output `len(obs)`, `len(var)`, `X.nnz` once.

Contributor (author): yes on recording the version aliased to `latest`.

I'm not sure we want the tests generating a bunch of output unless it is regularly used. I think your census version is the ideal solution.


## YYYY-MM-DD

TBD
12 changes: 12 additions & 0 deletions api/python/cell_census/tests/conftest.py
@@ -0,0 +1,12 @@
import pytest


def pytest_addoption(parser: pytest.Parser) -> None:
    # Register the --expensive flag used to opt in to the 'expensive'-marked tests.
    parser.addoption(
        "--expensive", action="store_true", dest="expensive", default=False, help="enable 'expensive' decorated tests"
    )


def pytest_configure(config: pytest.Config) -> None:
    # Unless --expensive was given, deselect tests carrying the 'expensive' mark.
    if not config.option.expensive:
        config.option.markexpr = "not expensive"
115 changes: 115 additions & 0 deletions api/python/cell_census/tests/test_acceptance.py
@@ -0,0 +1,115 @@
"""
Acceptance tests for the Census.

NOTE: those marked `expensive` are not run in the CI as they are, well, expensive...

Several of them will not run to completion except on VERY large hosts.

Intended use: periodically do a manual run, including the expensive tests, on an
appropriately large host.

See README.md for historical data.
"""
from typing import Iterator, Optional

import pyarrow as pa
import pytest
import tiledb
import tiledbsoma as soma

import cell_census


@pytest.mark.live_corpus
@pytest.mark.parametrize("organism", ["homo_sapiens", "mus_musculus"])
def test_load_axes(organism: str) -> None:
"""Verify axes can be loaded into a Pandas DataFrame"""
census = cell_census.open_soma(census_version="latest")

# use subset of columns for speed
obs_df = (
census["census_data"][organism]
.obs.read(column_names=["soma_joinid", "cell_type", "tissue"])
.concat()
.to_pandas()
)
assert len(obs_df)
del obs_df

var_df = census["census_data"][organism].ms["RNA"].var.read().concat().to_pandas()
assert len(var_df)
del var_df


def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: Optional[int] = 2) -> bool:
"""
Utility that verifies that the value is an iterator of pa.Table.

Will only call __next__ as many times as the `stop_after` param specifies,
or will read until end of iteration of it is None.
"""
assert isinstance(tbl_iter, Iterator)
for n, tbl in enumerate(tbl_iter):
# keep things speedy by quitting early if stop_after specified
if stop_after is not None and n > stop_after:
break
assert isinstance(tbl, pa.Table)
assert len(tbl)

return True


@pytest.mark.live_corpus
@pytest.mark.parametrize("organism", ["homo_sapiens", "mus_musculus"])
def test_incremental_read(organism: str) -> None:
"""Verify that obs, var and X[raw] can be read incrementally, i.e., in chunks"""

# open census with a small (default) TileDB buffer size, which reduces
# memory use, and makes it feasible to run in a GHA.
version = cell_census.get_census_version_description("latest")
s3_region = version["soma"].get("s3_region")
context = soma.options.SOMATileDBContext(tiledb_ctx=tiledb.Ctx({"vfs.s3.region": s3_region}))

with cell_census.open_soma(census_version="latest", context=context) as census:
assert table_iter_is_ok(census["census_data"][organism].obs.read(column_names=["soma_joinid", "tissue"]))
assert table_iter_is_ok(
census["census_data"][organism].ms["RNA"].var.read(column_names=["soma_joinid", "feature_id"])
)
assert table_iter_is_ok(census["census_data"][organism].ms["RNA"].X["raw"].read().tables())


@pytest.mark.live_corpus
@pytest.mark.parametrize("organism", ["homo_sapiens", "mus_musculus"])
@pytest.mark.parametrize(
"obs_value_filter", ["tissue=='aorta'", pytest.param("tissue=='brain'", marks=pytest.mark.expensive)]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pytest is great; TIL: param values can add marks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

always pays to RT*M :-)

)
@pytest.mark.parametrize("stop_after", [2, pytest.param(None, marks=pytest.mark.expensive)])
def test_incremental_query(organism: str, obs_value_filter: str, stop_after: Optional[int]) -> None:
"""Verify incremental read of query result."""
# use default TileDB configuration
with cell_census.open_soma(census_version="latest") as census:
with census["census_data"][organism].axis_query(
measurement_name="RNA", obs_query=soma.AxisQuery(value_filter=obs_value_filter)
) as query:
assert table_iter_is_ok(query.obs(), stop_after=stop_after)
assert table_iter_is_ok(query.var(), stop_after=stop_after)
assert table_iter_is_ok(query.X("raw").tables(), stop_after=stop_after)


@pytest.mark.live_corpus
@pytest.mark.expensive
@pytest.mark.parametrize("organism", ["homo_sapiens", "mus_musculus"])
@pytest.mark.parametrize(
"obs_value_filter",
[
"tissue == 'aorta'",
pytest.param("cell_type == 'neuron'", marks=pytest.mark.expensive), # very common cell type
pytest.param("tissue == 'brain'", marks=pytest.mark.expensive), # very common tissue
pytest.param(None, marks=pytest.mark.expensive), # whole enchilada
],
)
def test_get_anndata(organism: str, obs_value_filter: str) -> None:
"""Verify query and read into AnnData"""
with cell_census.open_soma(census_version="latest") as census:
ad = cell_census.get_anndata(census, organism, obs_value_filter=obs_value_filter)
assert ad is not None
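
As an aside (not part of this PR's diff), the incremental-read pattern exercised above can also be used outside the tests. A minimal sketch, assuming the `homo_sapiens` experiment and a running non-zero tally purely for illustration:

```python
import cell_census

# Stream X["raw"] chunk by chunk instead of materializing the full matrix.
# Note: a complete pass over X["raw"] is itself an "expensive" operation.
with cell_census.open_soma(census_version="latest") as census:
    X_raw = census["census_data"]["homo_sapiens"].ms["RNA"].X["raw"]
    nnz = 0
    for tbl in X_raw.read().tables():  # each chunk arrives as a pyarrow.Table of coordinate/value rows
        nnz += len(tbl)
    print(f"non-zero entries seen: {nnz}")
```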
2 changes: 1 addition & 1 deletion api/python/cell_census/tests/test_open.py
@@ -63,7 +63,7 @@ def test_get_source_h5ad_uri() -> None:
    census_datasets = census["census_info"]["datasets"].read().concat().to_pandas()

    rng = np.random.default_rng()
-   for idx in rng.choice(np.arange(len(census_datasets)), size=10, replace=False):
+   for idx in rng.choice(np.arange(len(census_datasets)), size=3, replace=False):
        a_dataset = census_datasets.iloc[idx]
        locator = cell_census.get_source_h5ad_uri(a_dataset.dataset_id)
        assert isinstance(locator, dict)