chanzuckerberg · ebezzi · Mar 15, 2023 · Mar 14, 2023 · Mar 14, 2023 · Mar 14, 2023
diff --git a/tools/cell_census_builder/tests/conftest.py b/tools/cell_census_builder/tests/conftest.py
@@ -1,3 +1,4 @@
+import io
 import os
 import pathlib
 from typing import List
@@ -122,6 +123,41 @@ def datasets(assets_path: str) -> List[Dataset]:
     return datasets
 
 
+@pytest.fixture
+def manifest_csv(tmp_path: pathlib.Path, request: pytest.FixtureRequest) -> io.TextIOWrapper:
+    """Create a manifest CSV listing two placeholder H5ADs; return it open for reading (closed at teardown)."""
+    manifest_content = f"""
+    dataset_id_1, {tmp_path}/data/h5ads/dataset_id_1.h5ad
+    dataset_id_2, {tmp_path}/data/h5ads/dataset_id_2.h5ad
+    """
+    h5ad_dir = tmp_path / "data" / "h5ads"
+    h5ad_dir.mkdir(parents=True, exist_ok=True)
+    for dataset_id in ("dataset_id_1", "dataset_id_2"):
+        (h5ad_dir / f"{dataset_id}.h5ad").touch()  # empty placeholder file
+    (tmp_path / "manifest.csv").write_text(manifest_content.strip())
+    manifest_file = open(f"{tmp_path}/manifest.csv")
+    request.addfinalizer(manifest_file.close)  # do not leak the handle past the requesting test
+    return manifest_file
+
+
+@pytest.fixture
+def manifest_csv_with_duplicates(tmp_path: pathlib.Path, request: pytest.FixtureRequest) -> io.TextIOWrapper:
+    """Like a two-dataset manifest CSV, but `dataset_id_2` is listed twice; the handle is closed at teardown."""
+    manifest_content = f"""
+    dataset_id_1, {tmp_path}/data/h5ads/dataset_id_1.h5ad
+    dataset_id_2, {tmp_path}/data/h5ads/dataset_id_2.h5ad
+    dataset_id_2, {tmp_path}/data/h5ads/dataset_id_2.h5ad
+    """
+    h5ad_dir = tmp_path / "data" / "h5ads"
+    h5ad_dir.mkdir(parents=True, exist_ok=True)
+    for dataset_id in ("dataset_id_1", "dataset_id_2"):
+        (h5ad_dir / f"{dataset_id}.h5ad").touch()  # empty placeholder file
+    (tmp_path / "manifest.csv").write_text(manifest_content.strip())
+    manifest_file = open(f"{tmp_path}/manifest.csv")
+    request.addfinalizer(manifest_file.close)  # do not leak the handle past the requesting test
+    return manifest_file
+
+
 @pytest.fixture()
 def setup(monkeypatch: MonkeyPatch) -> None:
     process_initializer()

diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py
@@ -1,5 +1,7 @@
+import io
 import os
 import pathlib
+from types import SimpleNamespace
 from typing import List
 from unittest.mock import patch
 
@@ -9,7 +11,7 @@
 import tiledb
 import tiledbsoma as soma
 
-from tools.cell_census_builder.__main__ import build, make_experiment_specs
+from tools.cell_census_builder.__main__ import build, build_step1_get_source_datasets, make_experiment_specs
 from tools.cell_census_builder.datasets import Dataset
 from tools.cell_census_builder.experiment_builder import ExperimentBuilder
 from tools.cell_census_builder.globals import (
@@ -126,3 +128,20 @@ def test_unicode_support(tmp_path: pathlib.Path) -> None:
 
     with soma.DataFrame.open(uri=os.path.join(tmp_path, "unicode_support")) as pd_df_in:
         assert pd_df_in.read().concat().to_pandas()["value"].to_list() == ["Ünicode", "S̈upport"]
+
+
+def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, manifest_csv: io.TextIOWrapper) -> None:
+    """`build_step1_get_source_datasets` should return the manifest's datasets and stage their H5ADs."""
+    dest = tmp_path / "dest"
+    dest.mkdir()
+    args = SimpleNamespace(manifest=manifest_csv, test_first_n=None, verbose=True, max_workers=1)
+
+    # Call the function
+    datasets = build_step1_get_source_datasets(args, f"{tmp_path}/dest")  # type: ignore
+
+    # Verify that 2 datasets are returned
+    assert len(datasets) == 2
+
+    # Verify that the datasets have been staged
+    assert (dest / "dataset_id_1.h5ad").exists()
+    assert (dest / "dataset_id_2.h5ad").exists()
diff --git a/tools/cell_census_builder/tests/test_manifest.py b/tools/cell_census_builder/tests/test_manifest.py
@@ -0,0 +1,121 @@
+import io
+import pathlib
+import re
+from unittest.mock import patch
+
+from tools.cell_census_builder.manifest import CXG_BASE_URI, load_manifest
+
+
+def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: io.TextIOWrapper) -> None:
+    """
+    When given an open manifest file, `load_manifest` should load the dataset manifest from that file.
+    """
+    expected_ids = ["dataset_id_1", "dataset_id_2"]
+    expected_uris = [f"{tmp_path}/data/h5ads/{dataset_id}.h5ad" for dataset_id in expected_ids]
+    manifest = load_manifest(manifest_csv)
+    assert len(manifest) == 2
+    assert [dataset.dataset_id for dataset in manifest] == expected_ids
+    assert [dataset.corpora_asset_h5ad_uri for dataset in manifest] == expected_uris
+
+
+def test_load_manifest_does_dedup(manifest_csv_with_duplicates: io.TextIOWrapper) -> None:
+    """
+    A dataset listed more than once in the manifest file should appear only once in the loaded manifest.
+    """
+    deduplicated = load_manifest(manifest_csv_with_duplicates)
+    assert len(deduplicated) == 2
+
+
+def test_load_manifest_from_cxg() -> None:
+    """
+    When called with `None` instead of a manifest file, `load_manifest` should list datasets via the Discover API.
+    """
+    dataset_pattern = rf"{CXG_BASE_URI}curation/v1/collections/(\w+)/datasets/(\w+)$"
+    assets_pattern = rf"{CXG_BASE_URI}curation/v1/collections/(\w+)/datasets/(\w+)/assets$"
+
+    def fake_fetch_json(uri):  # type: ignore
+        if uri == f"{CXG_BASE_URI}curation/v1/collections":
+            return [
+                {
+                    "id": "collection_1",
+                    "doi": None,
+                    "name": "1",
+                    "datasets": [{"id": "dataset_id_1"}, {"id": "dataset_id_2"}],
+                }
+            ]
+        if found := re.match(dataset_pattern, uri):
+            return {"id": found[2], "schema_version": "3.0.0", "title": f"dataset #{found[2]}"}
+        if found := re.match(assets_pattern, uri):
+            return [{"filetype": "H5AD", "filesize": 1024, "presigned_url": f"https://fake.url/{found[2]}.h5ad"}]
+
+    with patch("tools.cell_census_builder.manifest.fetch_json", side_effect=fake_fetch_json):
+        manifest = load_manifest(None)
+        assert len(manifest) == 2
+        assert manifest[0].dataset_id == "dataset_id_1"
+        assert manifest[1].dataset_id == "dataset_id_2"
+        assert manifest[0].corpora_asset_h5ad_uri == "https://fake.url/dataset_id_1.h5ad"
+        assert manifest[1].corpora_asset_h5ad_uri == "https://fake.url/dataset_id_2.h5ad"
+
+
+def test_load_manifest_from_cxg_excludes_datasets_with_old_schema() -> None:
+    """
+    Datasets whose schema version is not current should be left out of the manifest built by `load_manifest`.
+    """
+    dataset_pattern = rf"{CXG_BASE_URI}curation/v1/collections/(\w+)/datasets/(\w+)$"
+    assets_pattern = rf"{CXG_BASE_URI}curation/v1/collections/(\w+)/datasets/(\w+)/assets$"
+
+    def fake_fetch_json(uri):  # type: ignore
+        if uri == f"{CXG_BASE_URI}curation/v1/collections":
+            return [
+                {
+                    "id": "collection_1",
+                    "doi": None,
+                    "name": "1",
+                    "datasets": [{"id": "dataset_id_1"}, {"id": "dataset_id_2"}],
+                }
+            ]
+        if found := re.match(dataset_pattern, uri):
+            return {
+                "id": found[2],
+                "schema_version": "3.0.0" if found[2] == "dataset_id_1" else "2.0.0",
+                "title": f"dataset #{found[2]}",
+            }
+        if found := re.match(assets_pattern, uri):
+            return [{"filetype": "H5AD", "filesize": 1024, "presigned_url": f"https://fake.url/{found[2]}.h5ad"}]
+
+    with patch("tools.cell_census_builder.manifest.fetch_json", side_effect=fake_fetch_json):
+        manifest = load_manifest(None)
+        assert len(manifest) == 1
+        assert manifest[0].dataset_id == "dataset_id_1"
+        assert manifest[0].corpora_asset_h5ad_uri == "https://fake.url/dataset_id_1.h5ad"
+
+
+def test_load_manifest_from_cxg_excludes_datasets_with_no_assets() -> None:
+    """
+    Datasets for which the mocked Discover API returns an empty asset list should be left out by `load_manifest`.
+    """
+    dataset_pattern = rf"{CXG_BASE_URI}curation/v1/collections/(\w+)/datasets/(\w+)$"
+
+    def fake_fetch_json(uri):  # type: ignore
+        if uri == f"{CXG_BASE_URI}curation/v1/collections":
+            return [
+                {
+                    "id": "collection_1",
+                    "doi": None,
+                    "name": "1",
+                    "datasets": [{"id": "dataset_id_1"}, {"id": "dataset_id_2"}],
+                }
+            ]
+        if found := re.match(dataset_pattern, uri):
+            return {"id": found[2], "schema_version": "3.0.0", "title": f"dataset #{found[2]}"}
+        if re.match(rf"{CXG_BASE_URI}curation/v1/collections/(\w+)/datasets/dataset_id_1/assets$", uri):
+            return [{"filetype": "H5AD", "filesize": 1024, "presigned_url": "https://fake.url/dataset_id_1.h5ad"}]
+        # dataset_id_2 is the dataset without any assets
+        if re.match(rf"{CXG_BASE_URI}curation/v1/collections/(\w+)/datasets/dataset_id_2/assets$", uri):
+            return []
+
+    with patch("tools.cell_census_builder.manifest.fetch_json", side_effect=fake_fetch_json):
+        manifest = load_manifest(None)
+        assert len(manifest) == 1
+        assert manifest[0].dataset_id == "dataset_id_1"
+        assert manifest[0].corpora_asset_h5ad_uri == "https://fake.url/dataset_id_1.h5ad"
diff --git a/tools/cell_census_builder/tests/test_source_assets.py b/tools/cell_census_builder/tests/test_source_assets.py
@@ -0,0 +1,25 @@
+import pathlib
+from types import SimpleNamespace
+
+from tools.cell_census_builder.datasets import Dataset
+from tools.cell_census_builder.source_assets import stage_source_assets
+
+
+def test_source_assets(tmp_path: pathlib.Path) -> None:
+    """
+    `stage_source_assets` should copy each dataset from its `corpora_asset_h5ad_uri` into the given assets directory.
+    """
+    source_dir, dest_dir = tmp_path / "source", tmp_path / "dest"
+    source_dir.mkdir()
+    dest_dir.mkdir()
+    file_names = [f"dataset_{i}.h5ad" for i in range(10)]
+    for file_name in file_names:
+        (source_dir / file_name).touch()
+    datasets = [
+        Dataset(f"dataset_{i}", corpora_asset_h5ad_uri=f"file://{tmp_path}/source/dataset_{i}.h5ad")
+        for i in range(10)
+    ]
+
+    stage_source_assets(datasets, SimpleNamespace(verbose=True, max_workers=1), dest_dir)  # type: ignore
+    for file_name in file_names:
+        assert (dest_dir / file_name).exists()