From 10b30cd1a1b03f5eb7f8f96f3c8b7923a8ebcb38 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Mar 2023 20:56:48 +0000 Subject: [PATCH 01/34] reorganize census builder --- tools/cell_census_builder/README.md | 39 +++++++--- tools/cell_census_builder/pyproject.toml | 74 +++++++++++++++++++ .../src/cell_census_builder/__init__.py | 12 +++ .../build_soma}/__init__.py | 0 .../build_soma}/__main__.py | 0 .../build_soma}/anndata.py | 0 .../build_soma}/census_summary.py | 0 .../build_soma}/consolidate.py | 0 .../build_soma}/datasets.py | 0 .../build_soma}/experiment_builder.py | 0 .../build_soma}/globals.py | 0 .../build_soma}/manifest.py | 0 .../cell_census_builder/build_soma}/mp.py | 10 +-- .../build_soma}/source_assets.py | 0 .../build_soma}/summary_cell_counts.py | 0 .../build_soma}/tissue_mapper.py | 0 .../cell_census_builder/build_soma}/util.py | 0 .../build_soma}/validate.py | 0 .../cell_census_builder}/census_summary.py | 2 +- .../cell_census_builder/host_validation.py | 67 +++++++++++++++++ .../src/cell_census_builder/logging.py | 26 +++++++ .../tests/anndata/conftest.py | 4 +- .../tests/anndata/test_anndata.py | 16 ++-- tools/cell_census_builder/tests/conftest.py | 17 ++--- .../cell_census_builder/tests/test_builder.py | 25 +++---- tools/cell_census_builder/tests/test_main.py | 2 +- .../tests/test_manifest.py | 8 +- .../tests/test_source_assets.py | 6 +- tools/cell_census_builder/tests/test_util.py | 3 +- .../scripts/aws/mount_instance_storage.sh | 0 .../scripts/aws/swapon_instance_storage.sh | 0 tools/scripts/requirements.txt | 17 ----- 32 files changed, 247 insertions(+), 81 deletions(-) create mode 100644 tools/cell_census_builder/pyproject.toml create mode 100644 tools/cell_census_builder/src/cell_census_builder/__init__.py rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/__init__.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/__main__.py (100%) rename tools/cell_census_builder/{ => 
src/cell_census_builder/build_soma}/anndata.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/census_summary.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/consolidate.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/datasets.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/experiment_builder.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/globals.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/manifest.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/mp.py (86%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/source_assets.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/summary_cell_counts.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/tissue_mapper.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/util.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/validate.py (100%) rename tools/cell_census_builder/{scripts/release => src/cell_census_builder}/census_summary.py (98%) create mode 100644 tools/cell_census_builder/src/cell_census_builder/host_validation.py create mode 100644 tools/cell_census_builder/src/cell_census_builder/logging.py rename tools/{cell_census_builder => }/scripts/aws/mount_instance_storage.sh (100%) rename tools/{cell_census_builder => }/scripts/aws/swapon_instance_storage.sh (100%) delete mode 100644 tools/scripts/requirements.txt diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 7afb948b4..4dbda235d 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -1,16 +1,31 @@ # README -This is a tool to build the SOMA instantiation of the Cell Census schema, as 
specified in this doc: +This package contains code to build and release the Cell Census in the SOMA format, as specified in the +[data schema](https://github.com/chanzuckerberg/cell-census/blob/main/docs/cell_census_schema.md). -https://docs.google.com/document/d/1GKndzCk9q_1SdYOq3BeCxWgp-o2NSQkEmSBaBPKnNI8/ +This tool is not intended for end-users - it is used by CZI to periodically create and release all +CELLxGENE data in the above format. The remainder of this document is intended for users of the +build package. -CAVEATS (READ THIS): +Please see the top-level [README](../../README.md) for more information on the Cell Census. -1. The code is written to the still-rapidly-evolving and **pre-release** Python SOMA API, _and will be subject to change_ as the SOMA API and `tiledbsoma` evolve and stabilize. -2. The schema implemented by this code is still evolving and subject to change. -3. The `cell_census_builder` package requires Python 3.9 or later. +# Overview -## Usage +This package contains sub-modules, each of which automate elements of the Cell Census build and release process. +The ultimate intention is to integrate these into an automated multi-step workflow. Until that occurs, individual steps +are provided as modules with their own `__main__`, to be manually invoked. + +## `host_validation` module + +Module which provides a set of checks that the current host machine has the requisite capabilities +to build the census (e.g., free disk space). Raises exception (non-zero process exit) if host is +unable to meet base requirements. + +Stand-alone usage: `python -m cell_census_builder.host_validation` + +## `build_soma` module + +Stand-alone use: `python -m cell_census_builder.build_soma ...` TL;DR: @@ -25,7 +40,7 @@ The build process: - Step 3: Write the axis dataframes for each experiment, filtering the datasets and cells to include (serialized iteration of dataset H5ADs). 
- Step 4: Write the X layers for each experiment (parallelized iteration of filtered dataset H5ADs). - Step 5: Write datasets manifest and summary info. -- (Optional) Consolidate TileDB data +- (Optional) Consolidate TileDB data - (Optional) Validate the entire Cell Census, re-reading from storage. Modes of operation: @@ -37,10 +52,10 @@ b) creating a smaller "cell census" from a user-provided list of files (a "manif - On a large-memory machine with _ample_ free (local) disk (eg, 3/4 TB or more) and swap (1 TB or more) - To create a cell census at ``, execute: > $ python -m cell_census_builder -mp --max-workers 12 build -- Tips: - - `-v` to view info-level logging during run, or `-v -v` for debug-level logging - - `--test-first-n <#>` to test build on a subset of datasets - - `--build-tag $(date +'%Y%m%d_%H%M%S')` to produce non-conflicting census build directories during testing +- Tips: + - `-v` to view info-level logging during run, or `-v -v` for debug-level logging + - `--test-first-n <#>` to test build on a subset of datasets + - `--build-tag $(date +'%Y%m%d_%H%M%S')` to produce non-conflicting census build directories during testing If you run out of memory, reduce `--max-workers`. You can also try a higher number if you have lots of CPU & memory. 
diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml new file mode 100644 index 000000000..a5bf541e0 --- /dev/null +++ b/tools/cell_census_builder/pyproject.toml @@ -0,0 +1,74 @@ +[build-system] +requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "cell_census_builder" +dynamic = ["version"] +description = "Build Cell Census" +authors = [ + { name = "Chan Zuckerberg Initiative", email = "cellxgene@chanzuckerberg.com" } +] +license = { text = "MIT" } +readme = "README.md" +requires-python = ">= 3.9, < 3.11" # Python 3.11 is pending numba support +classifiers = [ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS :: MacOS X", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +dependencies= [ + "pyarrow", + "pandas", + "anndata>=0.8", + "numpy", + # NOTE: The builder's version of tiledbsoma MUST be <= the API's tiledbsoma version, to ensure reader compatibility + # with TileDB on-disk storage format + "tiledbsoma==1.0.0", + "scipy", + "fsspec", + "s3fs", + "requests", + "aiohttp", + "Cython", # required by owlready2 + "wheel", # required by owlready2 + "owlready2", + "gitpython", + "attrs>=22.2.0", + "psutil", + "cell_census==0.10.0", + "typing_extensions", +] + +# [tool.setuptools.packages.find] +# where = ["src"] +# include = ["cell_census_builder*"] # package names should match these glob patterns (["*"] by default) +# exclude = ["tests*", "scripts*"] # exclude packages matching these glob patterns (empty by default) + +[tool.setuptools_scm] +root = "../.." 
+ +[tool.black] +line-length = 120 +target_version = ['py39'] + +[tool.mypy] +show_error_codes = true +ignore_missing_imports = true +warn_unreachable = true +strict = true +plugins = "numpy.typing.mypy_plugin" + +[tool.ruff] +select = ["E", "F", "B", "I"] +ignore = ["E501", "E402", "C408", ] +line-length = 120 +target-version = "py39" diff --git a/tools/cell_census_builder/src/cell_census_builder/__init__.py b/tools/cell_census_builder/src/cell_census_builder/__init__.py new file mode 100644 index 000000000..16e5282c0 --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/__init__.py @@ -0,0 +1,12 @@ +try: + from importlib import metadata +except ImportError: + # for python <=3.7 + import importlib_metadata as metadata # type: ignore[no-redef] + + +try: + __version__ = metadata.version("cell_census") +except metadata.PackageNotFoundError: + # package is not installed + __version__ = "0.0.0-unknown" diff --git a/tools/cell_census_builder/__init__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py similarity index 100% rename from tools/cell_census_builder/__init__.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py diff --git a/tools/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py similarity index 100% rename from tools/cell_census_builder/__main__.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py diff --git a/tools/cell_census_builder/anndata.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py similarity index 100% rename from tools/cell_census_builder/anndata.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py diff --git a/tools/cell_census_builder/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/census_summary.py similarity index 100% rename from tools/cell_census_builder/census_summary.py rename to 
tools/cell_census_builder/src/cell_census_builder/build_soma/census_summary.py diff --git a/tools/cell_census_builder/consolidate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py similarity index 100% rename from tools/cell_census_builder/consolidate.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py diff --git a/tools/cell_census_builder/datasets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/datasets.py similarity index 100% rename from tools/cell_census_builder/datasets.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/datasets.py diff --git a/tools/cell_census_builder/experiment_builder.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py similarity index 100% rename from tools/cell_census_builder/experiment_builder.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py diff --git a/tools/cell_census_builder/globals.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/globals.py similarity index 100% rename from tools/cell_census_builder/globals.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/globals.py diff --git a/tools/cell_census_builder/manifest.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py similarity index 100% rename from tools/cell_census_builder/manifest.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py diff --git a/tools/cell_census_builder/mp.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py similarity index 86% rename from tools/cell_census_builder/mp.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py index b78e57da0..bd5d9b580 100644 --- a/tools/cell_census_builder/mp.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py @@ -5,6 +5,8 @@ import os from typing import Optional, cast +from 
..logging import setup_logging + def cpu_count() -> int: """Sign, os.cpu_count() returns None if "undetermined" number of CPUs""" @@ -15,13 +17,7 @@ def cpu_count() -> int: def process_initializer(verbose: int = 0) -> None: - level = logging.DEBUG if verbose > 1 else logging.INFO if verbose == 1 else logging.WARNING - logging.basicConfig( - format="%(asctime)s %(process)-7s %(levelname)-8s %(message)s", - level=level, - datefmt="%Y-%m-%d %H:%M:%S", - ) - logging.captureWarnings(True) + setup_logging(verbose) def create_process_pool_executor( diff --git a/tools/cell_census_builder/source_assets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py similarity index 100% rename from tools/cell_census_builder/source_assets.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py diff --git a/tools/cell_census_builder/summary_cell_counts.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/summary_cell_counts.py similarity index 100% rename from tools/cell_census_builder/summary_cell_counts.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/summary_cell_counts.py diff --git a/tools/cell_census_builder/tissue_mapper.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/tissue_mapper.py similarity index 100% rename from tools/cell_census_builder/tissue_mapper.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/tissue_mapper.py diff --git a/tools/cell_census_builder/util.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py similarity index 100% rename from tools/cell_census_builder/util.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/util.py diff --git a/tools/cell_census_builder/validate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py similarity index 100% rename from tools/cell_census_builder/validate.py rename to 
tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py diff --git a/tools/cell_census_builder/scripts/release/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/census_summary.py similarity index 98% rename from tools/cell_census_builder/scripts/release/census_summary.py rename to tools/cell_census_builder/src/cell_census_builder/census_summary.py index 08cdbe31f..86fe202af 100644 --- a/tools/cell_census_builder/scripts/release/census_summary.py +++ b/tools/cell_census_builder/src/cell_census_builder/census_summary.py @@ -4,7 +4,7 @@ import cell_census import pandas as pd -from tools.cell_census_builder.globals import CENSUS_DATA_NAME, CENSUS_INFO_NAME +from .build.globals import CENSUS_DATA_NAME, CENSUS_INFO_NAME # Print all of the Pandas DataFrames, except the dimensions pd.options.display.max_columns = None # type: ignore[assignment] # None is legal per Pandas documentation. diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py b/tools/cell_census_builder/src/cell_census_builder/host_validation.py new file mode 100644 index 000000000..e0c407710 --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -0,0 +1,67 @@ +import logging +import os +import sys +from typing import Optional + +import psutil + +from cell_census_builder.logging import setup_logging, hr_multibyte_unit + +"""Minimum physical RAM""" +MIN_RAM = 512 * 1024**3 # 512GiB + +"""Minimum virtual memory/swap""" +MIN_SWAP = 2 * 1024**4 # 2TiB + +"""Minimum free disk space""" +MIN_FREE_DISK_SPACE = 1 * 1024**4 # 1 TiB + + +def check_os() -> None: + """ + Check that we run on Posix (Linux, MacOS), as we rely on + Posix semantics for a few things. + """ + assert psutil.POSIX + + +def check_memory() -> None: + """ + Check for sufficient physical and virtual memory. 
+ """ + svmem = psutil.virtual_memory() + logging.debug(f"Host: {hr_multibyte_unit(svmem.total)} memory found") + assert svmem.total >= MIN_RAM, f"Insufficient memory (found {svmem.total}, require {MIN_RAM})" + + svswap = psutil.swap_memory() + logging.debug(f"Host: {hr_multibyte_unit(svswap.total)} swap found") + assert svswap.total >= MIN_SWAP, f"Insufficient swap space (found {svswap.total}, require {MIN_SWAP})" + + +def check_free_disk(working_dir: Optional[str] = ".") -> None: + """ + Check for sufficient free disk space. + """ + skdiskusage = psutil.disk_usage(working_dir) + logging.debug(f"Host: {hr_multibyte_unit(skdiskusage.free)} free disk space found") + assert ( + skdiskusage.free >= MIN_FREE_DISK_SPACE + ), f"Insufficient free disk space (found {skdiskusage.free}, require {MIN_FREE_DISK_SPACE})" + + +def run_all_checks() -> int: + """ + Run all host validation checks. Returns zero or raises an exception. + """ + check_os() + check_memory() + check_free_disk(os.getcwd()) # assumed working directory is CWD + logging.info("Host validation success") + return 0 + + +# Process MUST return zero on success (all good) or non-zero on a +# host which does not validate. 
+if __name__ == "__main__": + setup_logging(verbose=1) + sys.exit(run_all_checks()) diff --git a/tools/cell_census_builder/src/cell_census_builder/logging.py b/tools/cell_census_builder/src/cell_census_builder/logging.py new file mode 100644 index 000000000..987046b6d --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/logging.py @@ -0,0 +1,26 @@ +import logging +import math + + +def setup_logging(verbose: int = 0) -> None: + """ + Configure the logger + """ + level = logging.DEBUG if verbose > 1 else logging.INFO if verbose == 1 else logging.WARNING + logging.basicConfig( + format="%(asctime)s %(process)-7s %(levelname)-8s %(message)s", + level=level, + datefmt="%Y-%m-%d %H:%M:%S", + ) + logging.captureWarnings(True) + + +def hr_multibyte_unit(n_bytes: int) -> str: + """Convert number of bytes into a human-readable binary (power of 1024) multi-byte unit string.""" + if n_bytes == 0: + return "0B" + + unit_size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB") + unit = int(math.floor(math.log(n_bytes, 1024))) + n_units = round(n_bytes / math.pow(1024, unit)) + return f"{n_units}{unit_size_name[unit]}" diff --git a/tools/cell_census_builder/tests/anndata/conftest.py b/tools/cell_census_builder/tests/anndata/conftest.py index 354538825..a2d0a78bf 100644 --- a/tools/cell_census_builder/tests/anndata/conftest.py +++ b/tools/cell_census_builder/tests/anndata/conftest.py @@ -2,9 +2,9 @@ import anndata as ad import pytest +from cell_census_builder.build_soma.datasets import Dataset -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.tests.conftest import ORGANISMS, get_h5ad +from ..conftest import ORGANISMS, get_h5ad @pytest.fixture diff --git a/tools/cell_census_builder/tests/anndata/test_anndata.py b/tools/cell_census_builder/tests/anndata/test_anndata.py index 8b2a37f03..adbc95b57 100644 --- a/tools/cell_census_builder/tests/anndata/test_anndata.py +++ 
b/tools/cell_census_builder/tests/anndata/test_anndata.py @@ -2,10 +2,10 @@ import anndata as ad import numpy as np +from cell_census_builder.build_soma.anndata import get_cellxgene_schema_version, make_anndata_cell_filter, open_anndata +from cell_census_builder.build_soma.datasets import Dataset -from tools.cell_census_builder.anndata import get_cellxgene_schema_version, make_anndata_cell_filter, open_anndata -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.tests.conftest import ORGANISMS +from ..conftest import ORGANISMS def test_open_anndata(datasets: List[Dataset]) -> None: @@ -76,7 +76,7 @@ def test_open_anndata_equalizes_raw_and_normalized(datasets_with_larger_raw_laye def test_make_anndata_cell_filter(h5ad_simple: ad.AnnData) -> None: - func = make_anndata_cell_filter({}) # type: ignore + func = make_anndata_cell_filter({}) filtered_h5ad = func(h5ad_simple) assert h5ad_simple.var.equals(filtered_h5ad.var) assert h5ad_simple.obs.equals(filtered_h5ad.obs) @@ -86,28 +86,28 @@ def test_make_anndata_cell_filter(h5ad_simple: ad.AnnData) -> None: def test_make_anndata_cell_filter_filters_out_organoids_cell_culture( h5ad_with_organoids_and_cell_culture: ad.AnnData, ) -> None: - func = make_anndata_cell_filter({}) # type: ignore + func = make_anndata_cell_filter({}) filtered_h5ad = func(h5ad_with_organoids_and_cell_culture) assert h5ad_with_organoids_and_cell_culture.var.equals(filtered_h5ad.var) assert filtered_h5ad.obs.shape[0] == 2 def test_make_anndata_cell_filter_organism(h5ad_with_organism: ad.AnnData) -> None: - func = make_anndata_cell_filter({"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id}) # type: ignore + func = make_anndata_cell_filter({"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id}) filtered_h5ad = func(h5ad_with_organism) assert h5ad_with_organism.var.equals(filtered_h5ad.var) assert filtered_h5ad.obs.shape[0] == 3 def 
test_make_anndata_cell_filter_feature_biotype_gene(h5ad_with_feature_biotype: ad.AnnData) -> None: - func = make_anndata_cell_filter({}) # type: ignore + func = make_anndata_cell_filter({}) filtered_h5ad = func(h5ad_with_feature_biotype) assert h5ad_with_feature_biotype.obs.equals(filtered_h5ad.obs) assert filtered_h5ad.var.shape[0] == 3 def test_make_anndata_cell_filter_assay(h5ad_with_assays: ad.AnnData) -> None: - func = make_anndata_cell_filter({"assay_ontology_term_ids": ["EFO:1234", "EFO:1235"]}) # type: ignore + func = make_anndata_cell_filter({"assay_ontology_term_ids": ["EFO:1234", "EFO:1235"]}) filtered_h5ad = func(h5ad_with_assays) assert filtered_h5ad.obs.shape[0] == 2 assert list(filtered_h5ad.obs.index) == ["1", "3"] diff --git a/tools/cell_census_builder/tests/conftest.py b/tools/cell_census_builder/tests/conftest.py index 6b3a83e18..3b8363289 100644 --- a/tools/cell_census_builder/tests/conftest.py +++ b/tools/cell_census_builder/tests/conftest.py @@ -9,13 +9,12 @@ import pandas as pd import pytest from _pytest.monkeypatch import MonkeyPatch -from scipy import sparse - -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.globals import ( +from cell_census_builder.build_soma.datasets import Dataset +from cell_census_builder.build_soma.globals import ( CENSUS_X_LAYERS_PLATFORM_CONFIG, ) -from tools.cell_census_builder.mp import process_initializer +from cell_census_builder.build_soma.mp import process_initializer +from scipy import sparse @attrs.define(frozen=True) @@ -168,9 +167,5 @@ def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> io.TextIOWrapper: @pytest.fixture() def setup(monkeypatch: MonkeyPatch) -> None: process_initializer() - monkeypatch.setitem( - CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_0"], "tile", 2 # type: ignore - ) - monkeypatch.setitem( - CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_1"], "tile", 2 # type: ignore - ) + 
monkeypatch.setitem(CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_0"], "tile", 2) + monkeypatch.setitem(CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_1"], "tile", 2) diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py index fe802f713..10b752346 100644 --- a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -10,17 +10,16 @@ import pyarrow as pa import tiledb import tiledbsoma as soma - -from tools.cell_census_builder.__main__ import build, build_step1_get_source_datasets, make_experiment_specs -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.experiment_builder import ExperimentBuilder -from tools.cell_census_builder.globals import ( +from cell_census_builder.build_soma.__main__ import build, build_step1_get_source_datasets, make_experiment_specs +from cell_census_builder.build_soma.datasets import Dataset +from cell_census_builder.build_soma.experiment_builder import ExperimentBuilder +from cell_census_builder.build_soma.globals import ( CENSUS_DATA_NAME, CENSUS_INFO_NAME, FEATURE_DATASET_PRESENCE_MATRIX_NAME, MEASUREMENT_RNA_NAME, ) -from tools.cell_census_builder.validate import validate +from cell_census_builder.build_soma.validate import validate def test_base_builder_creation( @@ -29,10 +28,10 @@ def test_base_builder_creation( """ Runs the builder, queries the census and performs a set of base assertions. 
""" - with patch("tools.cell_census_builder.__main__.prepare_file_system"), patch( - "tools.cell_census_builder.__main__.build_step1_get_source_datasets", return_value=datasets - ), patch("tools.cell_census_builder.consolidate._run"), patch( - "tools.cell_census_builder.validate.validate_consolidation", return_value=True + with patch("cell_census_builder.build_soma.__main__.prepare_file_system"), patch( + "cell_census_builder.build_soma.__main__.build_step1_get_source_datasets", return_value=datasets + ), patch("cell_census_builder.build_soma.consolidate._run"), patch( + "cell_census_builder.build_soma.validate.validate_consolidation", return_value=True ): # Patching consolidate_tiledb_object, becuase is uses to much memory to run in github actions. experiment_specifications = make_experiment_specs() @@ -41,13 +40,13 @@ def test_base_builder_creation( from types import SimpleNamespace args = SimpleNamespace(multi_process=False, consolidate=True, build_tag="test_tag", verbose=True) - return_value = build(args, soma_path, assets_path, experiment_builders) # type: ignore[arg-type] + return_value = build(args, soma_path, assets_path, experiment_builders) # return_value = 0 means that the build succeeded assert return_value == 0 # validate the cell_census - return_value = validate(args, soma_path, assets_path, experiment_specifications) # type: ignore[arg-type] + return_value = validate(args, soma_path, assets_path, experiment_specifications) assert return_value is True # Query the census and do assertions @@ -137,7 +136,7 @@ def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, manifest_csv: i args = SimpleNamespace(manifest=manifest_csv, test_first_n=None, verbose=2, multi_process=True) # Call the function - datasets = build_step1_get_source_datasets(args, f"{tmp_path}/dest") # type: ignore + datasets = build_step1_get_source_datasets(args, f"{tmp_path}/dest") # Verify that 2 datasets are returned assert len(datasets) == 2 diff --git 
a/tools/cell_census_builder/tests/test_main.py b/tools/cell_census_builder/tests/test_main.py index 83985fd06..ff0cbf0a9 100644 --- a/tools/cell_census_builder/tests/test_main.py +++ b/tools/cell_census_builder/tests/test_main.py @@ -1,4 +1,4 @@ -from tools.cell_census_builder.__main__ import create_args_parser +from cell_census_builder.build_soma.__main__ import create_args_parser def test_create_args_parser_default_build() -> None: diff --git a/tools/cell_census_builder/tests/test_manifest.py b/tools/cell_census_builder/tests/test_manifest.py index 348acdb2a..89f6077dc 100644 --- a/tools/cell_census_builder/tests/test_manifest.py +++ b/tools/cell_census_builder/tests/test_manifest.py @@ -3,7 +3,7 @@ import re from unittest.mock import patch -from tools.cell_census_builder.manifest import CXG_BASE_URI, load_manifest +from cell_census_builder.build_soma.manifest import CXG_BASE_URI, load_manifest def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: io.TextIOWrapper) -> None: @@ -30,7 +30,7 @@ def test_load_manifest_from_cxg() -> None: """ If no parameters are specified, `load_manifest` should load the dataset list from Discover API. """ - with patch("tools.cell_census_builder.manifest.fetch_json") as m: + with patch("cell_census_builder.build_soma.manifest.fetch_json") as m: def mock_call_fn(uri): # type: ignore if uri == f"{CXG_BASE_URI}curation/v1/collections": @@ -61,7 +61,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_old_schema() -> None: """ `load_manifest` should exclude datasets that do not have a current schema version. 
""" - with patch("tools.cell_census_builder.manifest.fetch_json") as m: + with patch("cell_census_builder.build_soma.manifest.fetch_json") as m: def mock_call_fn(uri): # type: ignore if uri == f"{CXG_BASE_URI}curation/v1/collections": @@ -94,7 +94,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets() -> None: """ `load_manifest` should exclude datasets that do not have assets """ - with patch("tools.cell_census_builder.manifest.fetch_json") as m: + with patch("cell_census_builder.build_soma.manifest.fetch_json") as m: def mock_call_fn(uri): # type: ignore if uri == f"{CXG_BASE_URI}curation/v1/collections": diff --git a/tools/cell_census_builder/tests/test_source_assets.py b/tools/cell_census_builder/tests/test_source_assets.py index c6f0d4ab9..0b5f5707e 100644 --- a/tools/cell_census_builder/tests/test_source_assets.py +++ b/tools/cell_census_builder/tests/test_source_assets.py @@ -1,8 +1,8 @@ import pathlib from types import ModuleType, SimpleNamespace -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.source_assets import stage_source_assets +from cell_census_builder.build_soma.datasets import Dataset +from cell_census_builder.build_soma.source_assets import stage_source_assets def test_source_assets(tmp_path: pathlib.Path) -> None: @@ -18,7 +18,7 @@ def test_source_assets(tmp_path: pathlib.Path) -> None: datasets.append(dataset) # Call the function - stage_source_assets(datasets, SimpleNamespace(verbose=True), tmp_path / "dest") # type: ignore + stage_source_assets(datasets, SimpleNamespace(verbose=True), tmp_path / "dest") # Verify that the files exist for i in range(10): diff --git a/tools/cell_census_builder/tests/test_util.py b/tools/cell_census_builder/tests/test_util.py index c1ed55d13..77aaf8dbf 100644 --- a/tools/cell_census_builder/tests/test_util.py +++ b/tools/cell_census_builder/tests/test_util.py @@ -1,9 +1,8 @@ import numpy as np import pytest +from cell_census_builder.build_soma.util import 
array_chunker, is_nonnegative_integral, uricat from scipy.sparse import coo_matrix, csr_matrix, triu -from tools.cell_census_builder.util import array_chunker, is_nonnegative_integral, uricat - def test_is_nonnegative_integral() -> None: X = np.array([1, 2, 3], dtype=np.float32) diff --git a/tools/cell_census_builder/scripts/aws/mount_instance_storage.sh b/tools/scripts/aws/mount_instance_storage.sh similarity index 100% rename from tools/cell_census_builder/scripts/aws/mount_instance_storage.sh rename to tools/scripts/aws/mount_instance_storage.sh diff --git a/tools/cell_census_builder/scripts/aws/swapon_instance_storage.sh b/tools/scripts/aws/swapon_instance_storage.sh similarity index 100% rename from tools/cell_census_builder/scripts/aws/swapon_instance_storage.sh rename to tools/scripts/aws/swapon_instance_storage.sh diff --git a/tools/scripts/requirements.txt b/tools/scripts/requirements.txt deleted file mode 100644 index 605ba7ab4..000000000 --- a/tools/scripts/requirements.txt +++ /dev/null @@ -1,17 +0,0 @@ -pyarrow -pandas -anndata -numpy -# NOTE: The builder's version of tiledbsoma MUST be <= the API's tiledbsoma version, to ensure reader compatibility -# with TileDB on-disk storage format -tiledbsoma==1.0.0 -scipy -fsspec -s3fs -requests -aiohttp -Cython # required by owlready2 -wheel # required by owlready2 -owlready2 -gitpython -attrs>=22.2.0 From 2399c97530a2565d380166be69c858947220ee06 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Mar 2023 21:42:39 +0000 Subject: [PATCH 02/34] refactor files in build_soma --- .../build_soma/__main__.py | 291 +----------------- .../cell_census_builder/build_soma/build.py | 256 +++++++++++++++ .../build_soma/experiment_specs.py | 34 ++ .../cell_census_builder/host_validation.py | 2 +- .../cell_census_builder/tests/test_builder.py | 7 +- 5 files changed, 300 insertions(+), 290 deletions(-) create mode 100644 tools/cell_census_builder/src/cell_census_builder/build_soma/build.py create mode 100644 
tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py index 5b15d2829..f2f5606e6 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py @@ -1,70 +1,16 @@ import argparse -import gc -import logging import multiprocessing -import os.path import sys -from datetime import datetime, timezone -from typing import List +from datetime import datetime -import tiledbsoma as soma - -from .anndata import open_anndata -from .census_summary import create_census_summary -from .consolidate import consolidate -from .datasets import Dataset, assign_dataset_soma_joinids, create_dataset_manifest -from .experiment_builder import ( - ExperimentBuilder, - ExperimentSpecification, - populate_X_layers, - reopen_experiment_builders, -) -from .globals import ( - CENSUS_DATA_NAME, - CENSUS_INFO_NAME, - CENSUS_SCHEMA_VERSION, - CXG_SCHEMA_VERSION, - RNA_SEQ, - SOMA_TileDB_Context, -) -from .manifest import load_manifest +from .build import build +from .experiment_builder import ExperimentBuilder +from .experiment_specs import make_experiment_specs from .mp import process_initializer -from .source_assets import stage_source_assets -from .summary_cell_counts import create_census_summary_cell_counts -from .util import get_git_commit_sha, is_git_repo_dirty, uricat +from .util import uricat from .validate import validate -def make_experiment_specs() -> List[ExperimentSpecification]: - """ - Define all soma.Experiments to build in the census. - - Functionally, this defines per-experiment name, anndata filter, etc. - It also loads any required per-Experiment assets. 
- """ - GENE_LENGTH_BASE_URI = ( - "https://raw.githubusercontent.com/chanzuckerberg/single-cell-curation/" - "100f935eac932e1f5f5dadac0627204da3790f6f/cellxgene_schema_cli/cellxgene_schema/ontology_files/" - ) - GENE_LENGTH_URIS = [ - GENE_LENGTH_BASE_URI + "genes_homo_sapiens.csv.gz", - GENE_LENGTH_BASE_URI + "genes_mus_musculus.csv.gz", - GENE_LENGTH_BASE_URI + "genes_sars_cov_2.csv.gz", - ] - return [ # The soma.Experiments we want to build - ExperimentSpecification.create( - name="homo_sapiens", - anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ), - gene_feature_length_uris=GENE_LENGTH_URIS, - ), - ExperimentSpecification.create( - name="mus_musculus", - anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ), - gene_feature_length_uris=GENE_LENGTH_URIS, - ), - ] - - def main() -> int: parser = create_args_parser() args = parser.parse_args() @@ -90,233 +36,6 @@ def main() -> int: return cc -def prepare_file_system(soma_path: str, assets_path: str, args: argparse.Namespace) -> None: - """ - Prepares the file system for the builder run - """ - # Don't clobber an existing census build - if os.path.exists(soma_path) or os.path.exists(assets_path): - raise Exception("Census build path already exists - aborting build") - - # Ensure that the git tree is clean - if not args.test_disable_dirty_git_check and is_git_repo_dirty(): - raise Exception("The git repo has uncommitted changes - aborting build") - - # Create top-level build directories - os.makedirs(soma_path, exist_ok=False) - os.makedirs(assets_path, exist_ok=False) - - -def build( - args: argparse.Namespace, soma_path: str, assets_path: str, experiment_builders: List[ExperimentBuilder] -) -> int: - """ - Approximately, build steps are: - 1. Download manifest and copy/stage all source assets - 2. 
Read all H5AD and create axis dataframe (serial) - * write obs/var dataframes - * accumulate overall shape of X - 3. Read all H5AD assets again, write X layer (parallel) - 4. Optional: validate - - Returns - ------- - int - Process completion code, 0 on success, non-zero indicating error, - suitable for providing to sys.exit() - """ - - try: - prepare_file_system(soma_path, assets_path, args) - except Exception as e: - logging.error(e) - return 1 - - # Step 1 - get all source datasets - datasets = build_step1_get_source_datasets(args, assets_path) - - # Step 2 - create root collection, and all child objects, but do not populate any dataframes or matrices - root_collection = build_step2_create_root_collection(soma_path, experiment_builders) - gc.collect() - - # Step 3 - populate axes - filtered_datasets = build_step3_populate_obs_and_var_axes(assets_path, datasets, experiment_builders) - - # Step 4 - populate X layers - build_step4_populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) - gc.collect() - - # Step 5- write out dataset manifest and summary information - build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.build_tag) - - # consolidate TileDB data - if args.consolidate: - consolidate(args, root_collection.uri) - - return 0 - - -def populate_root_collection(root_collection: soma.Collection) -> soma.Collection: - """ - Create the root SOMA collection for the Census. - - Returns the root collection. - """ - - # Set root metadata for the experiment - root_collection.metadata["created_on"] = datetime.now(tz=timezone.utc).isoformat(timespec="seconds") - root_collection.metadata["cxg_schema_version"] = CXG_SCHEMA_VERSION - root_collection.metadata["census_schema_version"] = CENSUS_SCHEMA_VERSION - - sha = get_git_commit_sha() - root_collection.metadata["git_commit_sha"] = sha - - # Create sub-collections for experiments, etc. 
- for n in [CENSUS_INFO_NAME, CENSUS_DATA_NAME]: - root_collection.add_new_collection(n) - - return root_collection - - -def build_step1_get_source_datasets(args: argparse.Namespace, assets_path: str) -> List[Dataset]: - logging.info("Build step 1 - get source assets - started") - - # Load manifest defining the datasets - datasets = load_manifest(args.manifest) - if len(datasets) == 0: - logging.error("No H5AD files in the manifest (or we can't find the files)") - raise AssertionError("No H5AD files in the manifest (or we can't find the files)") - - # Testing/debugging hook - hidden option - if args.test_first_n is not None and args.test_first_n > 0: - # Process the N smallest datasets - datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.test_first_n] - - # Stage all files - stage_source_assets(datasets, args, assets_path) - - logging.info("Build step 1 - get source assets - finished") - return datasets - - -def populate_obs_axis( - assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder] -) -> List[Dataset]: - filtered_datasets = [] - N = len(datasets) * len(experiment_builders) - n = 0 - - for dataset, ad in open_anndata(assets_path, datasets, backed="r"): - dataset_total_cell_count = 0 - - for eb in reopen_experiment_builders(experiment_builders): - n += 1 - logging.info(f"{eb.name}: filtering dataset '{dataset.dataset_id}' ({n} of {N})") - ad_filtered = eb.filter_anndata_cells(ad) - - if len(ad_filtered.obs) == 0: # type:ignore - logging.info(f"{eb.name} - H5AD has no data after filtering, skipping {dataset.dataset_h5ad_path}") - continue - - # append to `obs`; accumulate `var` data - dataset_total_cell_count += eb.accumulate_axes(dataset, ad_filtered) - - # dataset passes filter if either experiment includes cells from the dataset - if dataset_total_cell_count > 0: - filtered_datasets.append(dataset) - dataset.dataset_total_cell_count = dataset_total_cell_count - - for eb in experiment_builders: - 
logging.info(f"Experiment {eb.name} will contain {eb.n_obs} cells from {eb.n_datasets} datasets") - - return filtered_datasets - - -def populate_var_axis_and_presence(experiment_builders: List[ExperimentBuilder]) -> None: - for eb in reopen_experiment_builders(experiment_builders): - # populate `var`; create empty `presence` now that we have its dimensions - eb.populate_var_axis() - - -def build_step2_create_root_collection(soma_path: str, experiment_builders: List[ExperimentBuilder]) -> soma.Collection: - """ - Create all objects - - Returns: the root collection. - """ - logging.info("Build step 2 - Create root collection - started") - - with soma.Collection.create(soma_path, context=SOMA_TileDB_Context()) as root_collection: - populate_root_collection(root_collection) - - for e in experiment_builders: - e.create(census_data=root_collection[CENSUS_DATA_NAME]) - - logging.info("Build step 2 - Create root collection - finished") - return root_collection - - -def build_step3_populate_obs_and_var_axes( - assets_path: str, - datasets: List[Dataset], - experiment_builders: List[ExperimentBuilder], -) -> List[Dataset]: - """ - Populate obs and var axes. Filter cells from datasets for each experiment, as obs is built. - """ - logging.info("Build step 3 - Populate obs and var axes - started") - - filtered_datasets = populate_obs_axis(assets_path, datasets, experiment_builders) - logging.info(f"({len(filtered_datasets)} of {len(datasets)}) datasets suitable for processing.") - - populate_var_axis_and_presence(experiment_builders) - - assign_dataset_soma_joinids(filtered_datasets) - - logging.info("Build step 3 - Populate obs and var axes - finished") - - return filtered_datasets - - -def build_step4_populate_X_layers( - assets_path: str, - filtered_datasets: List[Dataset], - experiment_builders: List[ExperimentBuilder], - args: argparse.Namespace, -) -> None: - """ - Populate X layers. 
- """ - logging.info("Build step 4 - Populate X layers - started") - - # Process all X data - for eb in reopen_experiment_builders(experiment_builders): - eb.create_X_with_layers() - - populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) - - for eb in reopen_experiment_builders(experiment_builders): - eb.populate_presence_matrix(filtered_datasets) - - logging.info("Build step 4 - Populate X layers - finished") - - -def build_step5_populate_summary_info( - root_collection: soma.Collection, - experiment_builders: List[ExperimentBuilder], - filtered_datasets: List[Dataset], - build_tag: str, -) -> None: - logging.info("Build step 5 - Populate summary info - started") - - with soma.Collection.open(root_collection[CENSUS_INFO_NAME].uri, "w", context=SOMA_TileDB_Context()) as census_info: - create_dataset_manifest(census_info, filtered_datasets) - create_census_summary_cell_counts(census_info, [e.census_summary_cell_counts for e in experiment_builders]) - create_census_summary(census_info, experiment_builders, build_tag) - - logging.info("Build step 5 - Populate summary info - finished") - - def create_args_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(prog="cell_census_builder") parser.add_argument("uri", type=str, help="Census top-level URI") diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py new file mode 100644 index 000000000..e3c126751 --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py @@ -0,0 +1,256 @@ +import argparse +import gc +import logging +import os.path +from datetime import datetime, timezone +from typing import List + +import tiledbsoma as soma + +from .anndata import open_anndata +from .census_summary import create_census_summary +from .consolidate import consolidate +from .datasets import Dataset, assign_dataset_soma_joinids, create_dataset_manifest +from 
.experiment_builder import ( + ExperimentBuilder, + populate_X_layers, + reopen_experiment_builders, +) +from .globals import ( + CENSUS_DATA_NAME, + CENSUS_INFO_NAME, + CENSUS_SCHEMA_VERSION, + CXG_SCHEMA_VERSION, + SOMA_TileDB_Context, +) +from .manifest import load_manifest +from .source_assets import stage_source_assets +from .summary_cell_counts import create_census_summary_cell_counts +from .util import get_git_commit_sha, is_git_repo_dirty + + +def prepare_file_system(soma_path: str, assets_path: str, args: argparse.Namespace) -> None: + """ + Prepares the file system for the builder run + """ + # Don't clobber an existing census build + if os.path.exists(soma_path) or os.path.exists(assets_path): + raise Exception("Census build path already exists - aborting build") + + # Ensure that the git tree is clean + if not args.test_disable_dirty_git_check and is_git_repo_dirty(): + raise Exception("The git repo has uncommitted changes - aborting build") + + # Create top-level build directories + os.makedirs(soma_path, exist_ok=False) + os.makedirs(assets_path, exist_ok=False) + + +def build( + args: argparse.Namespace, soma_path: str, assets_path: str, experiment_builders: List[ExperimentBuilder] +) -> int: + """ + Approximately, build steps are: + 1. Download manifest and copy/stage all source assets + 2. Read all H5AD and create axis dataframe (serial) + * write obs/var dataframes + * accumulate overall shape of X + 3. Read all H5AD assets again, write X layer (parallel) + 4. 
Optional: validate + + Returns + ------- + int + Process completion code, 0 on success, non-zero indicating error, + suitable for providing to sys.exit() + """ + + try: + prepare_file_system(soma_path, assets_path, args) + except Exception as e: + logging.error(e) + return 1 + + # Step 1 - get all source datasets + datasets = build_step1_get_source_datasets(args, assets_path) + + # Step 2 - create root collection, and all child objects, but do not populate any dataframes or matrices + root_collection = build_step2_create_root_collection(soma_path, experiment_builders) + gc.collect() + + # Step 3 - populate axes + filtered_datasets = build_step3_populate_obs_and_var_axes(assets_path, datasets, experiment_builders) + + # Step 4 - populate X layers + build_step4_populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) + gc.collect() + + # Step 5- write out dataset manifest and summary information + build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.build_tag) + + # consolidate TileDB data + if args.consolidate: + consolidate(args, root_collection.uri) + + return 0 + + +def populate_root_collection(root_collection: soma.Collection) -> soma.Collection: + """ + Create the root SOMA collection for the Census. + + Returns the root collection. + """ + + # Set root metadata for the experiment + root_collection.metadata["created_on"] = datetime.now(tz=timezone.utc).isoformat(timespec="seconds") + root_collection.metadata["cxg_schema_version"] = CXG_SCHEMA_VERSION + root_collection.metadata["census_schema_version"] = CENSUS_SCHEMA_VERSION + + sha = get_git_commit_sha() + root_collection.metadata["git_commit_sha"] = sha + + # Create sub-collections for experiments, etc. 
+ for n in [CENSUS_INFO_NAME, CENSUS_DATA_NAME]: + root_collection.add_new_collection(n) + + return root_collection + + +def build_step1_get_source_datasets(args: argparse.Namespace, assets_path: str) -> List[Dataset]: + logging.info("Build step 1 - get source assets - started") + + # Load manifest defining the datasets + datasets = load_manifest(args.manifest) + if len(datasets) == 0: + logging.error("No H5AD files in the manifest (or we can't find the files)") + raise AssertionError("No H5AD files in the manifest (or we can't find the files)") + + # Testing/debugging hook - hidden option + if args.test_first_n is not None and args.test_first_n > 0: + # Process the N smallest datasets + datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.test_first_n] + + # Stage all files + stage_source_assets(datasets, args, assets_path) + + logging.info("Build step 1 - get source assets - finished") + return datasets + + +def populate_obs_axis( + assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder] +) -> List[Dataset]: + filtered_datasets = [] + N = len(datasets) * len(experiment_builders) + n = 0 + + for dataset, ad in open_anndata(assets_path, datasets, backed="r"): + dataset_total_cell_count = 0 + + for eb in reopen_experiment_builders(experiment_builders): + n += 1 + logging.info(f"{eb.name}: filtering dataset '{dataset.dataset_id}' ({n} of {N})") + ad_filtered = eb.filter_anndata_cells(ad) + + if len(ad_filtered.obs) == 0: # type:ignore + logging.info(f"{eb.name} - H5AD has no data after filtering, skipping {dataset.dataset_h5ad_path}") + continue + + # append to `obs`; accumulate `var` data + dataset_total_cell_count += eb.accumulate_axes(dataset, ad_filtered) + + # dataset passes filter if either experiment includes cells from the dataset + if dataset_total_cell_count > 0: + filtered_datasets.append(dataset) + dataset.dataset_total_cell_count = dataset_total_cell_count + + for eb in experiment_builders: + 
logging.info(f"Experiment {eb.name} will contain {eb.n_obs} cells from {eb.n_datasets} datasets") + + return filtered_datasets + + +def populate_var_axis_and_presence(experiment_builders: List[ExperimentBuilder]) -> None: + for eb in reopen_experiment_builders(experiment_builders): + # populate `var`; create empty `presence` now that we have its dimensions + eb.populate_var_axis() + + +def build_step2_create_root_collection(soma_path: str, experiment_builders: List[ExperimentBuilder]) -> soma.Collection: + """ + Create all objects + + Returns: the root collection. + """ + logging.info("Build step 2 - Create root collection - started") + + with soma.Collection.create(soma_path, context=SOMA_TileDB_Context()) as root_collection: + populate_root_collection(root_collection) + + for e in experiment_builders: + e.create(census_data=root_collection[CENSUS_DATA_NAME]) + + logging.info("Build step 2 - Create root collection - finished") + return root_collection + + +def build_step3_populate_obs_and_var_axes( + assets_path: str, + datasets: List[Dataset], + experiment_builders: List[ExperimentBuilder], +) -> List[Dataset]: + """ + Populate obs and var axes. Filter cells from datasets for each experiment, as obs is built. + """ + logging.info("Build step 3 - Populate obs and var axes - started") + + filtered_datasets = populate_obs_axis(assets_path, datasets, experiment_builders) + logging.info(f"({len(filtered_datasets)} of {len(datasets)}) datasets suitable for processing.") + + populate_var_axis_and_presence(experiment_builders) + + assign_dataset_soma_joinids(filtered_datasets) + + logging.info("Build step 3 - Populate obs and var axes - finished") + + return filtered_datasets + + +def build_step4_populate_X_layers( + assets_path: str, + filtered_datasets: List[Dataset], + experiment_builders: List[ExperimentBuilder], + args: argparse.Namespace, +) -> None: + """ + Populate X layers. 
+ """ + logging.info("Build step 4 - Populate X layers - started") + + # Process all X data + for eb in reopen_experiment_builders(experiment_builders): + eb.create_X_with_layers() + + populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) + + for eb in reopen_experiment_builders(experiment_builders): + eb.populate_presence_matrix(filtered_datasets) + + logging.info("Build step 4 - Populate X layers - finished") + + +def build_step5_populate_summary_info( + root_collection: soma.Collection, + experiment_builders: List[ExperimentBuilder], + filtered_datasets: List[Dataset], + build_tag: str, +) -> None: + logging.info("Build step 5 - Populate summary info - started") + + with soma.Collection.open(root_collection[CENSUS_INFO_NAME].uri, "w", context=SOMA_TileDB_Context()) as census_info: + create_dataset_manifest(census_info, filtered_datasets) + create_census_summary_cell_counts(census_info, [e.census_summary_cell_counts for e in experiment_builders]) + create_census_summary(census_info, experiment_builders, build_tag) + + logging.info("Build step 5 - Populate summary info - finished") diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py new file mode 100644 index 000000000..bd2c815a3 --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py @@ -0,0 +1,34 @@ +from typing import List + +from .experiment_builder import ExperimentSpecification +from .globals import RNA_SEQ + + +def make_experiment_specs() -> List[ExperimentSpecification]: + """ + Define all soma.Experiments to build in the census. + + Functionally, this defines per-experiment name, anndata filter, etc. + It also loads any required per-Experiment assets. 
+ """ + GENE_LENGTH_BASE_URI = ( + "https://raw.githubusercontent.com/chanzuckerberg/single-cell-curation/" + "100f935eac932e1f5f5dadac0627204da3790f6f/cellxgene_schema_cli/cellxgene_schema/ontology_files/" + ) + GENE_LENGTH_URIS = [ + GENE_LENGTH_BASE_URI + "genes_homo_sapiens.csv.gz", + GENE_LENGTH_BASE_URI + "genes_mus_musculus.csv.gz", + GENE_LENGTH_BASE_URI + "genes_sars_cov_2.csv.gz", + ] + return [ # The soma.Experiments we want to build + ExperimentSpecification.create( + name="homo_sapiens", + anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ), + gene_feature_length_uris=GENE_LENGTH_URIS, + ), + ExperimentSpecification.create( + name="mus_musculus", + anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ), + gene_feature_length_uris=GENE_LENGTH_URIS, + ), + ] diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py b/tools/cell_census_builder/src/cell_census_builder/host_validation.py index e0c407710..69a096056 100644 --- a/tools/cell_census_builder/src/cell_census_builder/host_validation.py +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -5,7 +5,7 @@ import psutil -from cell_census_builder.logging import setup_logging, hr_multibyte_unit +from cell_census_builder.logging import hr_multibyte_unit, setup_logging """Minimum physical RAM""" MIN_RAM = 512 * 1024**3 # 512GiB diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py index 10b752346..90b233300 100644 --- a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -10,9 +10,10 @@ import pyarrow as pa import tiledb import tiledbsoma as soma -from cell_census_builder.build_soma.__main__ import build, build_step1_get_source_datasets, make_experiment_specs +from cell_census_builder.build_soma.build import build, build_step1_get_source_datasets 
from cell_census_builder.build_soma.datasets import Dataset from cell_census_builder.build_soma.experiment_builder import ExperimentBuilder +from cell_census_builder.build_soma.experiment_specs import make_experiment_specs from cell_census_builder.build_soma.globals import ( CENSUS_DATA_NAME, CENSUS_INFO_NAME, @@ -28,8 +29,8 @@ def test_base_builder_creation( """ Runs the builder, queries the census and performs a set of base assertions. """ - with patch("cell_census_builder.build_soma.__main__.prepare_file_system"), patch( - "cell_census_builder.build_soma.__main__.build_step1_get_source_datasets", return_value=datasets + with patch("cell_census_builder.build_soma.build.prepare_file_system"), patch( + "cell_census_builder.build_soma.build.build_step1_get_source_datasets", return_value=datasets ), patch("cell_census_builder.build_soma.consolidate._run"), patch( "cell_census_builder.build_soma.validate.validate_consolidation", return_value=True ): From c30d38b5ee3ccf007d544139d6299572b0393a1a Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Mar 2023 21:55:52 +0000 Subject: [PATCH 03/34] fix GHA unit test --- .github/workflows/py-unittests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index 74a606ce5..8d1acc618 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -53,8 +53,8 @@ jobs: - name: Install dependencies run: | python -m pip install -U pip setuptools wheel - pip install -r ./tools/scripts/requirements.txt -r ./tools/scripts/requirements-dev.txt - pip install -e ./tools/ + pip install ./tools/cell_census_builder/ + pip install -r ./tools/scripts/requirements-dev.txt - name: Test with pytest (builder) run: | PYTHONPATH=. 
coverage run --parallel-mode -m pytest ./tools/cell_census_builder/tests/ From d3cb2725bd23231bd1b38acd854c5ef78cbbeb63 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Mar 2023 21:56:38 +0000 Subject: [PATCH 04/34] fix GHA unit test --- .github/workflows/py-unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index 8d1acc618..026e04c8b 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -53,7 +53,7 @@ jobs: - name: Install dependencies run: | python -m pip install -U pip setuptools wheel - pip install ./tools/cell_census_builder/ + pip install -e ./tools/cell_census_builder/ pip install -r ./tools/scripts/requirements-dev.txt - name: Test with pytest (builder) run: | From 613cc46121d101bc9b822989edff7b3d7736e4a2 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 20 Mar 2023 19:04:15 +0000 Subject: [PATCH 05/34] additional refactoring for top-level workflow --- .pre-commit-config.yaml | 1 + tools/cell_census_builder/pyproject.toml | 7 +- .../src/cell_census_builder/__init__.py | 2 +- .../src/cell_census_builder/__main__.py | 133 ++++++++++++++ .../build_soma/__main__.py | 38 ++-- .../cell_census_builder/build_soma/anndata.py | 4 +- .../cell_census_builder/build_soma/build.py | 50 +++--- .../build_soma/consolidate.py | 6 +- .../build_soma/experiment_builder.py | 10 +- .../build_soma/experiment_specs.py | 9 +- .../src/cell_census_builder/build_soma/mp.py | 16 +- .../build_soma/source_assets.py | 8 +- .../cell_census_builder/build_soma/util.py | 16 -- .../build_soma/validate.py | 29 ++-- .../src/cell_census_builder/build_state.py | 162 ++++++++++++++++++ .../cell_census_builder/host_validation.py | 101 +++++++---- .../src/cell_census_builder/logging.py | 41 ++++- .../src/cell_census_builder/util.py | 46 +++++ .../tests/anndata/conftest.py | 16 +- tools/cell_census_builder/tests/conftest.py | 31 ++-- 
.../cell_census_builder/tests/test_builder.py | 46 ++--- .../tests/test_source_assets.py | 15 +- tools/cell_census_builder/tests/test_util.py | 44 +++-- 23 files changed, 616 insertions(+), 215 deletions(-) create mode 100644 tools/cell_census_builder/src/cell_census_builder/__main__.py create mode 100644 tools/cell_census_builder/src/cell_census_builder/build_state.py create mode 100644 tools/cell_census_builder/src/cell_census_builder/util.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ba322a2d3..a313ea4da 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -54,3 +54,4 @@ repos: - numpy - typing_extensions - types-setuptools + - types-PyYAML diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml index a5bf541e0..6d49850ec 100644 --- a/tools/cell_census_builder/pyproject.toml +++ b/tools/cell_census_builder/pyproject.toml @@ -26,13 +26,12 @@ classifiers = [ "Programming Language :: Python :: 3.10", ] dependencies= [ + "typing_extensions", "pyarrow", "pandas", "anndata>=0.8", "numpy", - # NOTE: The builder's version of tiledbsoma MUST be <= the API's tiledbsoma version, to ensure reader compatibility - # with TileDB on-disk storage format - "tiledbsoma==1.0.0", + "cell_census==0.10.0", "scipy", "fsspec", "s3fs", @@ -44,8 +43,6 @@ dependencies= [ "gitpython", "attrs>=22.2.0", "psutil", - "cell_census==0.10.0", - "typing_extensions", ] # [tool.setuptools.packages.find] diff --git a/tools/cell_census_builder/src/cell_census_builder/__init__.py b/tools/cell_census_builder/src/cell_census_builder/__init__.py index 16e5282c0..4cd7c916d 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__init__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__init__.py @@ -6,7 +6,7 @@ try: - __version__ = metadata.version("cell_census") + __version__ = metadata.version("cell_census_builder") except metadata.PackageNotFoundError: # package is not installed __version__ = 
"0.0.0-unknown" diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py new file mode 100644 index 000000000..01192ff7b --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -0,0 +1,133 @@ +import argparse +import logging +import pathlib +import sys +from typing import Callable, List + +import s3fs + +from . import __version__ +from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig +from .host_validation import check_host +from .util import process_init, urlcat + +""" +File tree for the build. + +working_dir: + | + +-- config.yaml # build config (user provided, read-only) + +-- state.yaml # build runtime state (eg., census version tag, etc) + +-- soma + +-- h5ads + +-- logs # log files from various stages + | +-- build.log + | +-- ... + +-- reports + +-- census-summary-VERSION.txt + +-- census-diff-VERSION.txt + +""" + + +def main() -> int: + cli_parser = create_args_parser() + cli_args = cli_parser.parse_args() + + working_dir = pathlib.PosixPath(cli_args.working_dir) + if not working_dir.is_dir(): + logging.critical("Census builder: unable to find working directory - exiting.") + return 1 + if not (working_dir / CENSUS_BUILD_CONFIG).is_file(): + logging.critical("Census builder: unable to find config.yaml in working directory - exiting.") + return 1 + if (working_dir / CENSUS_BUILD_STATE).exists(): + logging.critical("Found pre-existing census build in working directory - aborting census build.") + return 1 + + build_config = CensusBuildConfig.load(working_dir / CENSUS_BUILD_CONFIG) + build_args = CensusBuildArgs(working_dir=working_dir, config=build_config) + + # Process initialization/setup must be done early + process_init(build_args) + + # Return process exit code (or raise, which exits with a code of `1`) + return do_build(build_args) + + +def do_build(args: CensusBuildArgs) -> int: + """ + Top-level 
build sequence. + + Built steps will be executed in order. Build will stop if a build step returns non-zero + exit code or raises. + """ + logging.info(f"Census build: start [version={__version__}]") + build_steps: List[Callable[[CensusBuildArgs], int]] = [ + do_prebuild_set_defaults, + do_prebuild_checks, + do_build_soma, + do_create_reports, + ] + try: + for n, build_step in enumerate(build_steps, start=1): + logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: start") + cc = build_step(args) + args.state.commit(args.working_dir / CENSUS_BUILD_STATE) + if cc != 0: + logging.critical(f"Build step {build_step.__name__} returned error code {cc}: aborting build.") + return cc + logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: complete") + + except Exception as e: + logging.critical(f"Caught exception, exiting: {str(e)}") + return 1 + + logging.info("Census build: completed") + return 0 + + +def do_prebuild_set_defaults(args: CensusBuildArgs) -> int: + """Set any default state required by build steps.""" + args.state["do_prebuild_set_defaults"] = True + return 0 + + +def do_prebuild_checks(args: CensusBuildArgs) -> int: + """Pre-build checks for host, config, etc. 
All pre-conditions should go here.""" + + # check host configuration, e.g., free disk space + if not check_host(args): + return 1 + + # verify the build tag is not already published/in use + build_tag = args.config.build_tag + assert build_tag is not None + s3path = urlcat(args.config.cell_census_S3_path, build_tag) + if s3fs.S3FileSystem(anon=True).exists(s3path): + logging.error(f"Build tag {build_tag} already exists at {s3path}.") + return 1 + + args.state["do_prebuild_checks"] = True + return 0 + + +def do_build_soma(args: CensusBuildArgs) -> int: + # WIP + # args.state["do_build_soma"] = True + return 0 + + +def do_create_reports(args: CensusBuildArgs) -> int: + # WIP + # args.state["do_create_reports"] = True + return 0 + + +def create_args_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="cell_census_builder") + parser.add_argument("working_dir", type=str, help="Working directory for the build") + return parser + + +sys.exit(main()) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py index f2f5606e6..d1951b1b1 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py @@ -1,37 +1,29 @@ import argparse -import multiprocessing +import pathlib import sys from datetime import datetime +from ..build_state import CensusBuildArgs, CensusBuildConfig +from ..util import process_init from .build import build -from .experiment_builder import ExperimentBuilder -from .experiment_specs import make_experiment_specs -from .mp import process_initializer -from .util import uricat from .validate import validate def main() -> int: - parser = create_args_parser() - args = parser.parse_args() - assert args.subcommand in ["build", "validate"] + cli_parser = create_args_parser() + cli_args = cli_parser.parse_args() + assert cli_args.subcommand 
in ["build", "validate"] - process_initializer(args.verbose) - - # normalize our base URI - must include trailing slash - soma_path = uricat(args.uri, args.build_tag, "soma") - assets_path = uricat(args.uri, args.build_tag, "h5ads") - - # create the experiment specifications and builders - experiment_specifications = make_experiment_specs() - experiment_builders = [ExperimentBuilder(spec) for spec in experiment_specifications] + config = CensusBuildConfig(**cli_args.__dict__) + args = CensusBuildArgs(working_dir=pathlib.PosixPath(cli_args.uri), config=config) + process_init(args) cc = 0 - if args.subcommand == "build": - cc = build(args, soma_path, assets_path, experiment_builders) + if cli_args.subcommand == "build": + cc = build(args) - if cc == 0 and (args.subcommand == "validate" or args.validate): - validate(args, soma_path, assets_path, experiment_specifications) + if cc == 0 and (cli_args.subcommand == "validate" or cli_args.validate): + validate(args) return cc @@ -85,8 +77,4 @@ def create_args_parser() -> argparse.ArgumentParser: if __name__ == "__main__": - # this is very important to do early, before any use of `concurrent.futures` - if multiprocessing.get_start_method(True) != "spawn": - multiprocessing.set_start_method("spawn", True) - sys.exit(main()) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py index b373db20c..c9289d4e6 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py @@ -5,9 +5,9 @@ import numpy as np import pandas as pd +from ..util import urlcat from .datasets import Dataset from .globals import CXG_SCHEMA_VERSION, CXG_SCHEMA_VERSION_IMPORT, FEATURE_REFERENCE_IGNORE -from .util import uricat AnnDataFilterSpec = TypedDict( "AnnDataFilterSpec", @@ -34,7 +34,7 @@ def open_anndata( datasets = [datasets] for h5ad in datasets: - 
path = uricat(base_path, h5ad.dataset_h5ad_path) + path = urlcat(base_path, h5ad.dataset_h5ad_path) logging.debug(f"open_anndata: {path}") ad = anndata.read_h5ad(path, *args, **kwargs) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py index e3c126751..ab3693b75 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py @@ -1,12 +1,11 @@ -import argparse import gc import logging -import os.path from datetime import datetime, timezone from typing import List import tiledbsoma as soma +from ..build_state import CensusBuildArgs from .anndata import open_anndata from .census_summary import create_census_summary from .consolidate import consolidate @@ -16,6 +15,7 @@ populate_X_layers, reopen_experiment_builders, ) +from .experiment_specs import make_experiment_builders from .globals import ( CENSUS_DATA_NAME, CENSUS_INFO_NAME, @@ -29,26 +29,24 @@ from .util import get_git_commit_sha, is_git_repo_dirty -def prepare_file_system(soma_path: str, assets_path: str, args: argparse.Namespace) -> None: +def prepare_file_system(args: CensusBuildArgs) -> None: """ Prepares the file system for the builder run """ # Don't clobber an existing census build - if os.path.exists(soma_path) or os.path.exists(assets_path): + if args.soma_path.exists() or args.h5ads_path.exists(): raise Exception("Census build path already exists - aborting build") # Ensure that the git tree is clean - if not args.test_disable_dirty_git_check and is_git_repo_dirty(): + if not args.config.test_disable_dirty_git_check and is_git_repo_dirty(): raise Exception("The git repo has uncommitted changes - aborting build") # Create top-level build directories - os.makedirs(soma_path, exist_ok=False) - os.makedirs(assets_path, exist_ok=False) + args.soma_path.mkdir(parents=True, exist_ok=False) + 
args.h5ads_path.mkdir(parents=True, exist_ok=False) -def build( - args: argparse.Namespace, soma_path: str, assets_path: str, experiment_builders: List[ExperimentBuilder] -) -> int: +def build(args: CensusBuildArgs) -> int: """ Approximately, build steps are: 1. Download manifest and copy/stage all source assets @@ -65,31 +63,29 @@ def build( suitable for providing to sys.exit() """ - try: - prepare_file_system(soma_path, assets_path, args) - except Exception as e: - logging.error(e) - return 1 + experiment_builders = make_experiment_builders() + + prepare_file_system(args) # Step 1 - get all source datasets - datasets = build_step1_get_source_datasets(args, assets_path) + datasets = build_step1_get_source_datasets(args) # Step 2 - create root collection, and all child objects, but do not populate any dataframes or matrices - root_collection = build_step2_create_root_collection(soma_path, experiment_builders) + root_collection = build_step2_create_root_collection(args.soma_path.as_posix(), experiment_builders) gc.collect() # Step 3 - populate axes - filtered_datasets = build_step3_populate_obs_and_var_axes(assets_path, datasets, experiment_builders) + filtered_datasets = build_step3_populate_obs_and_var_axes(args.h5ads_path.as_posix(), datasets, experiment_builders) # Step 4 - populate X layers - build_step4_populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) + build_step4_populate_X_layers(args.h5ads_path.as_posix(), filtered_datasets, experiment_builders, args) gc.collect() # Step 5- write out dataset manifest and summary information - build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.build_tag) + build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.config.build_tag) # consolidate TileDB data - if args.consolidate: + if args.config.consolidate: consolidate(args, root_collection.uri) return 0 @@ -117,22 +113,22 @@ def 
populate_root_collection(root_collection: soma.Collection) -> soma.Collectio return root_collection -def build_step1_get_source_datasets(args: argparse.Namespace, assets_path: str) -> List[Dataset]: +def build_step1_get_source_datasets(args: CensusBuildArgs) -> List[Dataset]: logging.info("Build step 1 - get source assets - started") # Load manifest defining the datasets - datasets = load_manifest(args.manifest) + datasets = load_manifest(args.config.manifest) if len(datasets) == 0: logging.error("No H5AD files in the manifest (or we can't find the files)") raise AssertionError("No H5AD files in the manifest (or we can't find the files)") # Testing/debugging hook - hidden option - if args.test_first_n is not None and args.test_first_n > 0: + if args.config.test_first_n is not None and args.config.test_first_n > 0: # Process the N smallest datasets - datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.test_first_n] + datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.config.test_first_n] # Stage all files - stage_source_assets(datasets, args, assets_path) + stage_source_assets(datasets, args) logging.info("Build step 1 - get source assets - finished") return datasets @@ -221,7 +217,7 @@ def build_step4_populate_X_layers( assets_path: str, filtered_datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], - args: argparse.Namespace, + args: CensusBuildArgs, ) -> None: """ Populate X layers. 
diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py index f8048a62a..8d6d2272a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py @@ -1,15 +1,15 @@ -import argparse import concurrent.futures import logging from typing import List import tiledbsoma as soma +from ..build_state import CensusBuildArgs from .globals import DEFAULT_TILEDB_CONFIG, SOMA_TileDB_Context from .mp import create_process_pool_executor, log_on_broken_process_pool -def consolidate(args: argparse.Namespace, uri: str) -> None: +def consolidate(args: CensusBuildArgs, uri: str) -> None: """ This is a non-portable, TileDB-specific consolidation routine. """ @@ -30,7 +30,7 @@ def _gather(uri: str) -> List[str]: return uris_to_consolidate -def _run(args: argparse.Namespace, uris_to_consolidate: List[str]) -> None: +def _run(args: CensusBuildArgs, uris_to_consolidate: List[str]) -> None: # Queue consolidator for each array with create_process_pool_executor(args) as ppe: futures = [ppe.submit(consolidate_tiledb_object, uri) for uri in uris_to_consolidate] diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py index 8307b89b5..34a8b017a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py @@ -1,4 +1,3 @@ -import argparse import concurrent.futures import gc import io @@ -18,6 +17,8 @@ from somacore.options import OpenMode from typing_extensions import Self +from ..build_state import CensusBuildArgs +from ..util import urlcat from .anndata import AnnDataFilterSpec, make_anndata_cell_filter, open_anndata from .datasets import 
Dataset from .globals import ( @@ -41,7 +42,6 @@ anndata_ordered_bool_issue_853_workaround, array_chunker, is_nonnegative_integral, - uricat, ) # Contents: @@ -149,7 +149,7 @@ def gene_feature_length(self) -> pd.DataFrame: def create(self, census_data: soma.Collection) -> None: """Create experiment within the specified Collection with a single Measurement.""" - logging.info(f"{self.name}: create experiment at {uricat(census_data.uri, self.name)}") + logging.info(f"{self.name}: create experiment at {urlcat(census_data.uri, self.name)}") self.experiment = census_data.add_new_collection(self.name, soma.Experiment) self.experiment_uri = self.experiment.uri @@ -463,14 +463,14 @@ def _accumulate_X( def populate_X_layers( - assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], args: argparse.Namespace + assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], args: CensusBuildArgs ) -> None: """ Do all X layer processing for all Experiments. Also accumulate presence matrix data for later writing. 
""" # populate X layers presence: List[PresenceResult] = [] - if args.multi_process: + if args.config.multi_process: with create_process_pool_executor(args) as pe: futures = { _accumulate_X( diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py index bd2c815a3..3e2a9ec3f 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py @@ -1,9 +1,11 @@ +import functools from typing import List -from .experiment_builder import ExperimentSpecification +from .experiment_builder import ExperimentBuilder, ExperimentSpecification from .globals import RNA_SEQ +@functools.cache def make_experiment_specs() -> List[ExperimentSpecification]: """ Define all soma.Experiments to build in the census. @@ -32,3 +34,8 @@ def make_experiment_specs() -> List[ExperimentSpecification]: gene_feature_length_uris=GENE_LENGTH_URIS, ), ] + + +@functools.cache +def make_experiment_builders() -> List[ExperimentBuilder]: + return [ExperimentBuilder(spec) for spec in make_experiment_specs()] diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py index bd5d9b580..056efce44 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py @@ -1,11 +1,11 @@ -import argparse import concurrent.futures import logging import multiprocessing import os from typing import Optional, cast -from ..logging import setup_logging +from ..build_state import CensusBuildArgs +from ..util import process_init def cpu_count() -> int: @@ -16,12 +16,8 @@ def cpu_count() -> int: return cast(int, cpu_count) -def process_initializer(verbose: int = 0) -> None: - setup_logging(verbose) - - def 
create_process_pool_executor( - args: argparse.Namespace, max_workers: Optional[int] = None + args: CensusBuildArgs, max_workers: Optional[int] = None ) -> concurrent.futures.ProcessPoolExecutor: # We rely on the pool configuration being correct. Failure to do this will # lead to strange errors on some OS (eg., Linux defaults to fork). Rather @@ -29,9 +25,9 @@ def create_process_pool_executor( assert multiprocessing.get_start_method(True) == "spawn" return concurrent.futures.ProcessPoolExecutor( - max_workers=args.max_workers if max_workers is None else max_workers, - initializer=process_initializer, - initargs=(args.verbose,), + max_workers=args.config.max_workers if max_workers is None else max_workers, + initializer=process_init, + initargs=(args,), ) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py index 1e996acb9..dd2f0041a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py @@ -1,4 +1,3 @@ -import argparse import logging import os import urllib.parse @@ -7,11 +6,14 @@ import aiohttp import fsspec +from ..build_state import CensusBuildArgs from .datasets import Dataset from .mp import cpu_count, create_process_pool_executor -def stage_source_assets(datasets: List[Dataset], args: argparse.Namespace, assets_dir: str) -> None: +def stage_source_assets(datasets: List[Dataset], args: CensusBuildArgs) -> None: + assets_dir = args.h5ads_path.as_posix() + logging.info(f"Starting asset staging to {assets_dir}") assert os.path.isdir(assets_dir) @@ -19,7 +21,7 @@ def stage_source_assets(datasets: List[Dataset], args: argparse.Namespace, asset datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize, reverse=True) N = len(datasets) - if getattr(args, "multi_process", False): + if args.config.multi_process: n_workers =
max(min(8, cpu_count()), 64) with create_process_pool_executor(args, n_workers) as pe: paths = list( diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py index 3e5496210..e2adf1c01 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py @@ -1,6 +1,5 @@ import os import time -import urllib.parse from typing import Any, Iterator, Optional, Union import numpy as np @@ -64,21 +63,6 @@ def array_chunker( raise NotImplementedError("array_chunker: unsupported array type") -def uricat(container_uri: str, *paths: str) -> str: - """ - Concat one or more paths, separated with '/' - - Similar to urllib.parse.urljoin except it takes an iterator, and - assumes the container_uri is a 'directory'/container, ie, ends in '/'. - """ - - uri = container_uri - for p in paths: - uri = uri if uri.endswith("/") else uri + "/" - uri = urllib.parse.urljoin(uri, p) - return uri - - def fetch_json(url: str, delay_secs: float = 0.0) -> object: response = requests.get(url) response.raise_for_status() diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py index 0d27f35d4..a322d8993 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py @@ -1,4 +1,3 @@ -import argparse import concurrent.futures import dataclasses import logging @@ -16,10 +15,13 @@ from scipy import sparse from typing_extensions import Self +from ..build_state import CensusBuildArgs +from ..util import urlcat from .anndata import make_anndata_cell_filter, open_anndata from .consolidate import list_uris_to_consolidate from .datasets import Dataset from .experiment_builder import ExperimentSpecification +from 
.experiment_specs import make_experiment_specs from .globals import ( CENSUS_DATA_NAME, CENSUS_DATASETS_COLUMNS, @@ -39,7 +41,6 @@ SOMA_TileDB_Context, ) from .mp import create_process_pool_executor, log_on_broken_process_pool -from .util import uricat @dataclass # TODO: use attrs @@ -63,7 +64,7 @@ def n_vars(self) -> int: def open_experiment(base_uri: str, eb: ExperimentSpecification) -> soma.Experiment: """Helper function that knows the Census schema path conventions.""" - return soma.Experiment.open(uricat(base_uri, CENSUS_DATA_NAME, eb.name), mode="r") + return soma.Experiment.open(urlcat(base_uri, CENSUS_DATA_NAME, eb.name), mode="r") def validate_all_soma_objects_exist(soma_path: str, experiment_specifications: List[ExperimentSpecification]) -> bool: @@ -179,7 +180,7 @@ def validate_axis_dataframes( soma_path: str, datasets: List[Dataset], experiment_specifications: List[ExperimentSpecification], - args: argparse.Namespace, + args: CensusBuildArgs, ) -> Dict[str, EbInfo]: """ " Validate axis dataframes: schema, shape, contents @@ -205,7 +206,7 @@ def validate_axis_dataframes( # check shapes & perform weak test of contents eb_info = {eb.name: EbInfo() for eb in experiment_specifications} - if args.multi_process: + if args.config.multi_process: with create_process_pool_executor(args) as ppe: futures = [ ppe.submit(_validate_axis_dataframes, (assets_path, soma_path, dataset, experiment_specifications)) @@ -397,7 +398,7 @@ def validate_X_layers( datasets: List[Dataset], experiment_specifications: List[ExperimentSpecification], eb_info: Dict[str, EbInfo], - args: argparse.Namespace, + args: CensusBuildArgs, ) -> bool: """ " Validate all X layers: schema, shape, contents @@ -429,7 +430,7 @@ def validate_X_layers( assert X.schema.field("soma_data").type == CENSUS_X_LAYERS[lyr] assert X.shape == (n_obs, n_vars) - if args.multi_process: + if args.config.multi_process: with create_process_pool_executor(args) as ppe: ROWS_PER_PROCESS = 1_000_000 dup_coord_futures = [ @@ 
-479,7 +480,7 @@ def load_datasets_from_census(assets_path: str, soma_path: str) -> List[Dataset] # census against the snapshot assets. with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census: df = census[CENSUS_INFO_NAME][CENSUS_DATASETS_NAME].read().concat().to_pandas() - df["corpora_asset_h5ad_uri"] = df.dataset_h5ad_path.map(lambda p: uricat(assets_path, p)) + df["corpora_asset_h5ad_uri"] = df.dataset_h5ad_path.map(lambda p: urlcat(assets_path, p)) datasets = Dataset.from_dataframe(df) return datasets @@ -487,7 +488,7 @@ def load_datasets_from_census(assets_path: str, soma_path: str) -> List[Dataset] def validate_manifest_contents(assets_path: str, datasets: List[Dataset]) -> bool: """Confirm contents of manifest are correct.""" for d in datasets: - p = pathlib.Path(uricat(assets_path, d.dataset_h5ad_path)) + p = pathlib.Path(urlcat(assets_path, d.dataset_h5ad_path)) assert p.exists() and p.is_file(), f"{d.dataset_h5ad_path} is missing from the census" assert str(p).endswith(".h5ad"), "Expected only H5AD assets" @@ -543,15 +544,19 @@ def _walk_tree(name: str, parent: Any) -> None: return True -def validate( - args: argparse.Namespace, soma_path: str, assets_path: str, experiment_specifications: List[ExperimentSpecification] -) -> bool: +def validate(args: CensusBuildArgs) -> bool: """ Validate that the "census" matches the datasets and experiment builder spec. Will raise if validation fails. Returns True on success. 
""" logging.info("Validation start") + + experiment_specifications = make_experiment_specs() + + soma_path = args.soma_path.as_posix() + assets_path = args.h5ads_path.as_posix() + assert validate_directory_structure(soma_path, assets_path) assert validate_all_soma_objects_exist(soma_path, experiment_specifications) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py new file mode 100644 index 000000000..dce19ddde --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -0,0 +1,162 @@ +""" +build state and config +""" +import functools +import io +import os +import pathlib +from datetime import datetime +from typing import Any, Iterator, Mapping, Union + +import attrs +import yaml +from typing_extensions import Self + +CENSUS_BUILD_CONFIG = "config.yaml" +CENSUS_BUILD_STATE = "state.yaml" +CONFIG_DEFAULTS = { + "build_tag": datetime.now().astimezone().date().isoformat(), + "verbose": 1, + "log_dir": "logs", + "log_file": "build.log", + "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", + # XXX TODO add host requirements, etc. 
+ "consolidate": True, + "validate": True, + "multi_process": False, + "max_workers": None, + "manifest": None, + "test_first_n": None, + "test_disable_dirty_git_check": False, +} + + +class Namespace(Mapping[str, Any]): + """Readonly namespace""" + + def __init__(self, **kwargs: Any): + self._state = dict(kwargs) + + def __eq__(self, other: object) -> bool: + if isinstance(other, Namespace): + return self._state == other._state + return NotImplemented + + def __contains__(self, key: Any) -> bool: + return key in self._state + + def __repr__(self) -> str: + items = (f"{k}={v!r}" for k, v in self.items()) + return "{}({})".format(type(self).__name__, ", ".join(items)) + + def __getitem__(self, key: str) -> Any: + return self._state[key] + + def __getattr__(self, key: str) -> Any: + return self._state[key] + + def __iter__(self) -> Iterator[str]: + return iter(self._state) + + def __len__(self) -> int: + return len(self._state) + + def __getstate__(self) -> dict[str, Any]: + return self.__dict__.copy() + + def __setstate__(self, state: dict[str, Any]) -> None: + self.__dict__.update(state) + + +class MutableNamespace(Namespace): + """Mutable namespace""" + + def __setitem__(self, key: str, value: Any) -> None: + if not isinstance(key, str): + raise TypeError + self._state[key] = value + + # Do not implement __delitem__. Log format has no deletion marker, so delete + semantics can't be supported until that is implemented.
+ + +class CensusBuildConfig(Namespace): + defaults = CONFIG_DEFAULTS + + def __init__(self, **kwargs: Any): + config = self.defaults.copy() + config.update(kwargs) + super().__init__(**config) + + @classmethod + def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: + if isinstance(file, (str, os.PathLike)): + with open(file) as f: + user_config = yaml.safe_load(f) + else: + user_config = yaml.safe_load(file) + + # Empty YAML config file is legal + if user_config is None: + user_config = {} + + # But we only understand a top-level dictionary (e.g., no lists, etc.) + if not isinstance(user_config, dict): + raise TypeError("YAML config file malformed - expected top-level dictionary") + + return cls(**user_config) + + +class CensusBuildState(MutableNamespace): + def __init__(self, **kwargs: Any): + self.__dirty_keys = set(kwargs) + super().__init__(**kwargs) + + def __setitem__(self, key: str, value: Any) -> None: + if self._state.get(key) == value: + return + super().__setitem__(key, value) + self.__dirty_keys.add(key) + + @classmethod + def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: + if isinstance(file, (str, os.PathLike)): + with open(file) as state_log: + documents = yaml.safe_load_all(state_log) + else: + documents = yaml.safe_load_all(file) + + return cls(**functools.reduce(lambda acc, r: acc.update(r) or acc, documents, {})) + + def commit(self, file: Union[str, os.PathLike[str]]) -> None: + # append dirty elements (atomic on Posix) + if self.__dirty_keys: + dirty = {k: self[k] for k in self.__dirty_keys} + self.__dirty_keys.clear() + with open(file, mode="a") as state_log: + record = f"--- # {datetime.now().isoformat()}\n" + yaml.dump(dirty) + state_log.write(record) + + +@attrs.define(frozen=True) +class CensusBuildArgs: + working_dir: pathlib.PosixPath = attrs.field(validator=attrs.validators.instance_of(pathlib.PosixPath)) + config: CensusBuildConfig = 
attrs.field(validator=attrs.validators.instance_of(CensusBuildConfig)) + state: CensusBuildState = attrs.field( + factory=CensusBuildState, validator=attrs.validators.instance_of(CensusBuildState) # default: empty state + ) + + @property + def soma_path(self) -> pathlib.PosixPath: + return self.working_dir / self.build_tag / "soma" + + @property + def h5ads_path(self) -> pathlib.PosixPath: + return self.working_dir / self.build_tag / "h5ads" + + @property + def build_tag(self) -> str: + build_tag = self.config.build_tag + if not isinstance(build_tag, str): + raise TypeError("Configuration contains non-string build_tag.") + return build_tag diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py b/tools/cell_census_builder/src/cell_census_builder/host_validation.py index 69a096056..4de94e9c2 100644 --- a/tools/cell_census_builder/src/cell_census_builder/host_validation.py +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -1,67 +1,96 @@ import logging import os import sys -from typing import Optional +from typing import Union import psutil -from cell_census_builder.logging import hr_multibyte_unit, setup_logging +from .build_state import CensusBuildArgs +from .logging import hr_binary_unit, hr_decimal_unit -"""Minimum physical RAM""" -MIN_RAM = 512 * 1024**3 # 512GiB +"""Defaults""" +MIN_PHYSICAL_MEMORY = 512 * 1024**3 # 512GiB +MIN_SWAP_MEMORY = 2 * 1024**4 # 2TiB +MIN_FREE_DISK_SPACE = 1 * 1024**4 # 1 TiB -"""Minimum virtual memory/swap""" -MIN_SWAP = 2 * 1024**4 # 2TiB -"""Minimum free disk space""" -MIN_FREE_DISK_SPACE = 1 * 1024**4 # 1 TiB +def _check(condition: bool, message: str) -> bool: + """Like assert, but logs""" + if not condition: + logging.critical(message) + return condition -def check_os() -> None: +def check_os() -> bool: """ Check that we run on Posix (Linux, MacOS), as we rely on Posix semantics for a few things. 
""" - assert psutil.POSIX + return _check(os.name == "posix" and psutil.POSIX, "Census builder requires Posix OS") -def check_memory() -> None: +def check_physical_memory(min_physical_memory: int) -> bool: """ Check for sufficient physical and virtual memory. """ svmem = psutil.virtual_memory() - logging.debug(f"Host: {hr_multibyte_unit(svmem.total)} memory found") - assert svmem.total >= MIN_RAM, f"Insufficient memory (found {svmem.total}, require {MIN_RAM})" + logging.debug(f"Host: {hr_binary_unit(svmem.total)} memory found") + return _check( + svmem.total >= min_physical_memory, + f"Insufficient memory (found {hr_binary_unit(svmem.total)}, " f"require {hr_binary_unit(min_physical_memory)})", + ) - svswap = psutil.swap_memory() - logging.debug(f"Host: {hr_multibyte_unit(svswap.total)} swap found") - assert svswap.total >= MIN_SWAP, f"Insufficient swap space (found {svswap.total}, require {MIN_SWAP})" - -def check_free_disk(working_dir: Optional[str] = ".") -> None: +def check_swap_memory(min_swap_memory: int) -> bool: """ - Check for sufficient free disk space. + Check for sufficient physical and virtual memory. """ - skdiskusage = psutil.disk_usage(working_dir) - logging.debug(f"Host: {hr_multibyte_unit(skdiskusage.free)} free disk space found") - assert ( - skdiskusage.free >= MIN_FREE_DISK_SPACE - ), f"Insufficient free disk space (found {skdiskusage.free}, require {MIN_FREE_DISK_SPACE})" + svswap = psutil.swap_memory() + logging.debug(f"Host: {hr_binary_unit(svswap.total)} swap found") + return _check( + svswap.total >= min_swap_memory, + f"Insufficient swap space (found {hr_binary_unit(svswap.total)}, " + f"require {hr_binary_unit(min_swap_memory)})", + ) -def run_all_checks() -> int: +def check_free_disk(working_dir: Union[str, os.PathLike[str]], min_free_disk_space: int) -> bool: """ - Run all host validation checks. Returns zero or raises an exception. + Check for sufficient free disk space. 
""" - check_os() - check_memory() - check_free_disk(os.getcwd()) # assumed working directory is CWD - logging.info("Host validation success") - return 0 - - -# Process MUST return zero on success (all good) or non-zero on a + working_dir_fspath = working_dir.__fspath__() if isinstance(working_dir, os.PathLike) else working_dir + skdiskusage = psutil.disk_usage(working_dir_fspath) + logging.debug(f"Host: {hr_decimal_unit(skdiskusage.free)} free disk space found") + return _check( + skdiskusage.free >= min_free_disk_space, + f"Insufficient free disk space (found {hr_decimal_unit(skdiskusage.free)}, " + f"require {hr_decimal_unit(min_free_disk_space)})", + ) + + +def check_host(args: CensusBuildArgs) -> bool: + """Verify all host requirments. Return True if OK, False if conditions not met""" + return ( + check_os() + and check_physical_memory(args.config.get("min_physical_memory", MIN_PHYSICAL_MEMORY)) + and check_swap_memory(args.config.get("min_swap_memory", MIN_SWAP_MEMORY)) + and check_free_disk(args.working_dir, args.config.get("min_free_disk_space", MIN_FREE_DISK_SPACE)) + ) + + +# Return zero on success (all good) or non-zero on a # host which does not validate. 
if __name__ == "__main__": - setup_logging(verbose=1) - sys.exit(run_all_checks()) + """For CLI testing""" + + def main() -> int: + assert ( + check_os() + and check_physical_memory(MIN_PHYSICAL_MEMORY) + and check_swap_memory(MIN_SWAP_MEMORY) + and check_free_disk(os.getcwd(), MIN_FREE_DISK_SPACE) + ) # assumed working directory is CWD + print("Host validation success") + return 0 + + sys.exit(main()) diff --git a/tools/cell_census_builder/src/cell_census_builder/logging.py b/tools/cell_census_builder/src/cell_census_builder/logging.py index 987046b6d..7d587cdc9 100644 --- a/tools/cell_census_builder/src/cell_census_builder/logging.py +++ b/tools/cell_census_builder/src/cell_census_builder/logging.py @@ -1,26 +1,49 @@ import logging import math +import pathlib +import sys +from typing import List, Tuple +from .build_state import CensusBuildArgs -def setup_logging(verbose: int = 0) -> None: + +def logging_init(args: CensusBuildArgs) -> None: """ - Configure the logger + Configure the logger. 
""" - level = logging.DEBUG if verbose > 1 else logging.INFO if verbose == 1 else logging.WARNING + level = logging.DEBUG if args.config.verbose > 1 else logging.INFO if args.config.verbose == 1 else logging.WARNING + handlers: List[logging.Handler] = [logging.StreamHandler(sys.stderr)] + + # Create logging directory if configured appropriately + if args.config.log_dir and args.config.log_file: + logs_dir = pathlib.PosixPath(args.working_dir) / pathlib.PosixPath(args.config.log_dir) + logs_dir.mkdir(parents=True, exist_ok=True) + logs_file = logs_dir / args.config.log_file + handlers.insert(0, logging.FileHandler(logs_file)) + logging.basicConfig( format="%(asctime)s %(process)-7s %(levelname)-8s %(message)s", level=level, datefmt="%Y-%m-%d %H:%M:%S", + handlers=handlers, ) logging.captureWarnings(True) -def hr_multibyte_unit(n_bytes: int) -> str: - """Convert number of bytes into a human-readable binary (power of 1024) multi-byte unit string.""" +def _hr_multibyte_unit(n_bytes: int, unit_base: int, unit_size_names: Tuple[str, ...]) -> str: + """Private. 
Convert number of bytes into a human-readable multi-byte unit string.""" if n_bytes == 0: return "0B" - unit_size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB") - unit = int(math.floor(math.log(n_bytes, 1024))) - n_units = round(n_bytes / math.pow(1024, unit)) - return f"{n_units}{unit_size_name[unit]}" + unit = int(math.floor(math.log(n_bytes, unit_base))) + n_units = round(n_bytes / math.pow(unit_base, unit)) + return f"{n_units}{unit_size_names[unit]}" + + +def hr_binary_unit(n_bytes: int) -> str: + return _hr_multibyte_unit(n_bytes, 1024, ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")) + + +def hr_decimal_unit(n_bytes: int) -> str: + """Convert number of bytes into a human-readable decimal (power of 1000) multi-byte unit string.""" + return _hr_multibyte_unit(n_bytes, 1000, ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")) diff --git a/tools/cell_census_builder/src/cell_census_builder/util.py b/tools/cell_census_builder/src/cell_census_builder/util.py new file mode 100644 index 000000000..683662deb --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/util.py @@ -0,0 +1,46 @@ +import multiprocessing +import urllib.parse + +from .build_state import CensusBuildArgs +from .logging import logging_init + + +def urljoin(base: str, url: str) -> str: + """ + like urllib.parse.urljoin, but doesn't get confused by S3:// + """ + p_url = urllib.parse.urlparse(url) + if p_url.netloc: + return url + + p_base = urllib.parse.urlparse(base) + path = urllib.parse.urljoin(p_base.path, p_url.path) + parts = [p_base.scheme, p_base.netloc, path, p_url.params, p_url.query, p_url.fragment] + return urllib.parse.urlunparse(parts) + + +def urlcat(base: str, *paths: str) -> str: + """ + Concat one or more paths, separated with '/'. 
Similar to urllib.parse.urljoin, + but doesn't get confused by S3:// and other "non-standard" protocols (treats + them as if they are same as http: or file:) + + Similar to urllib.parse.urljoin except it takes an iterator, and + assumes the container_uri is a 'directory'/container, ie, ends in '/'. + """ + + url = base + for p in paths: + url = url if url.endswith("/") else url + "/" + url = urljoin(url, p) + return url + + +def process_init(args: CensusBuildArgs) -> None: + """ + Called on every process start to configure global package/module behavior. + """ + if multiprocessing.get_start_method(True) != "spawn": + multiprocessing.set_start_method("spawn", True) + + logging_init(args) diff --git a/tools/cell_census_builder/tests/anndata/conftest.py b/tools/cell_census_builder/tests/anndata/conftest.py index a2d0a78bf..52064fe89 100644 --- a/tools/cell_census_builder/tests/anndata/conftest.py +++ b/tools/cell_census_builder/tests/anndata/conftest.py @@ -3,12 +3,16 @@ import anndata as ad import pytest from cell_census_builder.build_soma.datasets import Dataset +from cell_census_builder.build_state import CensusBuildArgs from ..conftest import ORGANISMS, get_h5ad @pytest.fixture -def datasets_with_mixed_feature_reference(assets_path: str) -> List[Dataset]: +def datasets_with_mixed_feature_reference(census_build_args: CensusBuildArgs) -> List[Dataset]: + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) + assets_path = census_build_args.h5ads_path.as_posix() + organism = ORGANISMS[0] dataset_id = "an_id" datasets = [] @@ -31,7 +35,10 @@ def datasets_with_mixed_feature_reference(assets_path: str) -> List[Dataset]: @pytest.fixture -def datasets_with_larger_raw_layer(assets_path: str) -> List[Dataset]: +def datasets_with_larger_raw_layer(census_build_args: CensusBuildArgs) -> List[Dataset]: + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) + assets_path = census_build_args.h5ads_path.as_posix() + organism = ORGANISMS[0] dataset_id = 
"an_id" datasets = [] @@ -56,7 +63,10 @@ def datasets_with_larger_raw_layer(assets_path: str) -> List[Dataset]: @pytest.fixture -def datasets_with_incorrect_schema_version(assets_path: str) -> List[Dataset]: +def datasets_with_incorrect_schema_version(census_build_args: CensusBuildArgs) -> List[Dataset]: + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) + assets_path = census_build_args.h5ads_path.as_posix() + organism = ORGANISMS[0] dataset_id = "an_id" datasets = [] diff --git a/tools/cell_census_builder/tests/conftest.py b/tools/cell_census_builder/tests/conftest.py index 3b8363289..663949807 100644 --- a/tools/cell_census_builder/tests/conftest.py +++ b/tools/cell_census_builder/tests/conftest.py @@ -1,5 +1,4 @@ import io -import os import pathlib from typing import List, Optional @@ -13,7 +12,8 @@ from cell_census_builder.build_soma.globals import ( CENSUS_X_LAYERS_PLATFORM_CONFIG, ) -from cell_census_builder.build_soma.mp import process_initializer +from cell_census_builder.build_state import CensusBuildArgs, CensusBuildConfig +from cell_census_builder.util import process_init from scipy import sparse @@ -94,21 +94,22 @@ def get_h5ad(organism: Organism, gene_ids: Optional[List[str]] = None) -> anndat @pytest.fixture -def assets_path(tmp_path: pathlib.Path) -> str: - assets_path = f"{tmp_path}/h5ads" - os.mkdir(assets_path) - return assets_path +def census_build_args(request: pytest.FixtureRequest, tmp_path: pathlib.Path) -> CensusBuildArgs: + # parameterization is optional + try: + config = request.param + except AttributeError: + config = {} - -@pytest.fixture -def soma_path(tmp_path: pathlib.Path) -> str: - soma_path = f"{tmp_path}/soma" - os.mkdir(soma_path) - return soma_path + if config.get("manifest") is True: # if bool True, replace with an IOstream + config["manifest"] = request.getfixturevalue("manifest_csv") + return CensusBuildArgs(working_dir=tmp_path, config=CensusBuildConfig(**config)) @pytest.fixture -def datasets(assets_path: 
str) -> List[Dataset]: +def datasets(census_build_args: CensusBuildArgs) -> List[Dataset]: + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) + assets_path = census_build_args.h5ads_path.as_posix() datasets = [] for organism in ORGANISMS: for i in range(NUM_DATASET): @@ -165,7 +166,7 @@ def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> io.TextIOWrapper: @pytest.fixture() -def setup(monkeypatch: MonkeyPatch) -> None: - process_initializer() +def setup(monkeypatch: MonkeyPatch, census_build_args: CensusBuildArgs) -> None: + process_init(census_build_args) monkeypatch.setitem(CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_0"], "tile", 2) monkeypatch.setitem(CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_1"], "tile", 2) diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py index 90b233300..3a626b3da 100644 --- a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -1,19 +1,17 @@ -import io import os import pathlib -from types import ModuleType, SimpleNamespace +from types import ModuleType from typing import List from unittest.mock import patch import numpy as np import pandas as pd import pyarrow as pa +import pytest import tiledb import tiledbsoma as soma from cell_census_builder.build_soma.build import build, build_step1_get_source_datasets from cell_census_builder.build_soma.datasets import Dataset -from cell_census_builder.build_soma.experiment_builder import ExperimentBuilder -from cell_census_builder.build_soma.experiment_specs import make_experiment_specs from cell_census_builder.build_soma.globals import ( CENSUS_DATA_NAME, CENSUS_INFO_NAME, @@ -21,10 +19,16 @@ MEASUREMENT_RNA_NAME, ) from cell_census_builder.build_soma.validate import validate +from cell_census_builder.build_state import CensusBuildArgs +@pytest.mark.parametrize( + "census_build_args", 
[dict(multi_process=False, consolidate=True, build_tag="test_tag", verbose=1)], indirect=True +) def test_base_builder_creation( - datasets: List[Dataset], assets_path: str, soma_path: str, tmp_path: pathlib.Path, setup: None + datasets: List[Dataset], + census_build_args: CensusBuildArgs, + setup: None, ) -> None: """ Runs the builder, queries the census and performs a set of base assertions. @@ -34,25 +38,18 @@ def test_base_builder_creation( ), patch("cell_census_builder.build_soma.consolidate._run"), patch( "cell_census_builder.build_soma.validate.validate_consolidation", return_value=True ): - # Patching consolidate_tiledb_object, becuase is uses to much memory to run in github actions. - experiment_specifications = make_experiment_specs() - experiment_builders = [ExperimentBuilder(spec) for spec in experiment_specifications] - - from types import SimpleNamespace - - args = SimpleNamespace(multi_process=False, consolidate=True, build_tag="test_tag", verbose=True) - return_value = build(args, soma_path, assets_path, experiment_builders) + return_value = build(census_build_args) # return_value = 0 means that the build succeeded assert return_value == 0 # validate the cell_census - return_value = validate(args, soma_path, assets_path, experiment_specifications) + return_value = validate(census_build_args) assert return_value is True # Query the census and do assertions with soma.Collection.open( - uri=soma_path, + uri=census_build_args.soma_path.as_posix(), context=soma.options.SOMATileDBContext(tiledb_ctx=tiledb.Ctx({"vfs.s3.region": "us-west-2"})), ) as census: # There are 8 cells in total (4 from the first and 4 from the second datasets). 
They all belong to homo_sapiens @@ -130,21 +127,24 @@ def test_unicode_support(tmp_path: pathlib.Path) -> None: assert pd_df_in.read().concat().to_pandas()["value"].to_list() == ["Ünicode", "S̈upport"] -def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, manifest_csv: io.TextIOWrapper) -> None: - import pathlib - - pathlib.Path(tmp_path / "dest").mkdir() - args = SimpleNamespace(manifest=manifest_csv, test_first_n=None, verbose=2, multi_process=True) +@pytest.mark.parametrize( + "census_build_args", + [dict(manifest=True, test_first_n=None, verbose=2, build_tag="build_tag", multi_process=True)], + indirect=True, +) +def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, census_build_args: CensusBuildArgs) -> None: + # prereq for build step 1 + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) # Call the function - datasets = build_step1_get_source_datasets(args, f"{tmp_path}/dest") + datasets = build_step1_get_source_datasets(census_build_args) # Verify that 2 datasets are returned assert len(datasets) == 2 # Verify that the datasets have been staged - assert pathlib.Path(tmp_path / "dest" / "dataset_id_1.h5ad").exists() - assert pathlib.Path(tmp_path / "dest" / "dataset_id_2.h5ad").exists() + assert pathlib.Path(tmp_path / "build_tag" / "h5ads" / "dataset_id_1.h5ad").exists() + assert pathlib.Path(tmp_path / "build_tag" / "h5ads" / "dataset_id_2.h5ad").exists() def setup_module(module: ModuleType) -> None: diff --git a/tools/cell_census_builder/tests/test_source_assets.py b/tools/cell_census_builder/tests/test_source_assets.py index 0b5f5707e..6ffa9179f 100644 --- a/tools/cell_census_builder/tests/test_source_assets.py +++ b/tools/cell_census_builder/tests/test_source_assets.py @@ -1,28 +1,29 @@ import pathlib -from types import ModuleType, SimpleNamespace +from types import ModuleType from cell_census_builder.build_soma.datasets import Dataset from cell_census_builder.build_soma.source_assets import stage_source_assets +from 
cell_census_builder.build_state import CensusBuildArgs -def test_source_assets(tmp_path: pathlib.Path) -> None: +def test_source_assets(tmp_path: pathlib.Path, census_build_args: CensusBuildArgs) -> None: """ `source_assets` should copy the datasets from their `corpora_asset_h5ad_uri` to the specified `assets_dir` """ datasets = [] - pathlib.Path(tmp_path / "source").mkdir() - pathlib.Path(tmp_path / "dest").mkdir() + (tmp_path / "source").mkdir() + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) for i in range(10): dataset = Dataset(f"dataset_{i}", corpora_asset_h5ad_uri=f"file://{tmp_path}/source/dataset_{i}.h5ad") - pathlib.Path(tmp_path / "source" / f"dataset_{i}.h5ad").touch() + (tmp_path / "source" / f"dataset_{i}.h5ad").touch() datasets.append(dataset) # Call the function - stage_source_assets(datasets, SimpleNamespace(verbose=True), tmp_path / "dest") + stage_source_assets(datasets, census_build_args) # Verify that the files exist for i in range(10): - assert pathlib.Path(tmp_path / "dest" / f"dataset_{i}.h5ad").exists() + assert (census_build_args.h5ads_path / f"dataset_{i}.h5ad").exists() def setup_module(module: ModuleType) -> None: diff --git a/tools/cell_census_builder/tests/test_util.py b/tools/cell_census_builder/tests/test_util.py index 77aaf8dbf..7e14f2706 100644 --- a/tools/cell_census_builder/tests/test_util.py +++ b/tools/cell_census_builder/tests/test_util.py @@ -1,6 +1,7 @@ import numpy as np import pytest -from cell_census_builder.build_soma.util import array_chunker, is_nonnegative_integral, uricat +from cell_census_builder.build_soma.util import array_chunker, is_nonnegative_integral +from cell_census_builder.util import urlcat, urljoin from scipy.sparse import coo_matrix, csr_matrix, triu @@ -119,14 +120,33 @@ def test_array_chunker() -> None: list(array_chunker(X)) -def test_uricat() -> None: - assert uricat("path", "to", "somewhere") == "path/to/somewhere" - assert uricat("path/", "to/", "somewhere") == 
"path/to/somewhere" - assert uricat("path/", "to/", "somewhere/") == "path/to/somewhere/" - assert uricat("file:///path/to", "somewhere") == "file:///path/to/somewhere" - assert uricat("file:///path/to/", "somewhere") == "file:///path/to/somewhere" - assert uricat("file:///path/to", "somewhere") == "file:///path/to/somewhere" - assert uricat("file:///path/to/", "/absolute") == "file:///absolute" - assert uricat("file://path/to", "file://somewhere") == "file://somewhere" - assert uricat("file:///path/to", "file://somewhere") == "file://somewhere" - assert uricat("file:///path/to", "file:///somewhere") == "file:///somewhere" +def test_urljoin() -> None: + assert urljoin("path", "to") == "to" + assert urljoin("path/", "to") == "path/to" + assert urljoin("path/", "to/") == "path/to/" + assert urljoin("file:///path/to", "somewhere") == "file:///path/somewhere" + assert urljoin("file:///path/to/", "somewhere") == "file:///path/to/somewhere" + assert urljoin("file:///path/to", "somewhere") == "file:///path/somewhere" + assert urljoin("file:///path/to/", "/absolute") == "file:///absolute" + assert urljoin("file://path/to", "file://somewhere") == "file://somewhere" + assert urljoin("file:///path/to", "file://somewhere") == "file://somewhere" + assert urljoin("file:///path/to", "file:///somewhere") == "file:///somewhere" + assert urljoin("s3://foo", "bar") == "s3://foo/bar" + assert urljoin("s3://foo/", "bar") == "s3://foo/bar" + assert urljoin("s3://foo", "bar/") == "s3://foo/bar/" + + +def test_urlcat() -> None: + assert urlcat("path", "to", "somewhere") == "path/to/somewhere" + assert urlcat("path/", "to/", "somewhere") == "path/to/somewhere" + assert urlcat("path/", "to/", "somewhere/") == "path/to/somewhere/" + assert urlcat("file:///path/to", "somewhere") == "file:///path/to/somewhere" + assert urlcat("file:///path/to/", "somewhere") == "file:///path/to/somewhere" + assert urlcat("file:///path/to", "somewhere") == "file:///path/to/somewhere" + assert 
urlcat("file:///path/to/", "/absolute") == "file:///absolute" + assert urlcat("file://path/to", "file://somewhere") == "file://somewhere" + assert urlcat("file:///path/to", "file://somewhere") == "file://somewhere" + assert urlcat("file:///path/to", "file:///somewhere") == "file:///somewhere" + assert urlcat("s3://foo", "bar", "baz") == "s3://foo/bar/baz" + assert urlcat("s3://foo", "bar/", "baz") == "s3://foo/bar/baz" + assert urlcat("s3://foo", "bar/", "baz/") == "s3://foo/bar/baz/" From 4ae07c1576d13044d1d2b9cf6357feb77dc4a82c Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 20 Mar 2023 19:08:45 +0000 Subject: [PATCH 06/34] add missing package to dependency list --- tools/cell_census_builder/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml index 6d49850ec..3d10e4b6d 100644 --- a/tools/cell_census_builder/pyproject.toml +++ b/tools/cell_census_builder/pyproject.toml @@ -43,6 +43,7 @@ dependencies= [ "gitpython", "attrs>=22.2.0", "psutil", + "pyyaml", ] # [tool.setuptools.packages.find] From a8963eef5fc47c9fc9df6e5464ad46381f2949c5 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 20 Mar 2023 20:43:17 +0000 Subject: [PATCH 07/34] cleanup host validation config --- .../src/cell_census_builder/build_state.py | 16 +++++++++--- .../cell_census_builder/host_validation.py | 25 +++++++++++-------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index dce19ddde..871dfcc1b 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -1,5 +1,6 @@ """ -build state and config +Manage the configuration and dynamic build state for the Census build. 
+ """ import functools import io @@ -12,15 +13,18 @@ import yaml from typing_extensions import Self +""" +Defaults for Census configuration. +""" + CENSUS_BUILD_CONFIG = "config.yaml" CENSUS_BUILD_STATE = "state.yaml" -CONFIG_DEFAULTS = { +CENSUS_CONFIG_DEFAULTS = { "build_tag": datetime.now().astimezone().date().isoformat(), "verbose": 1, "log_dir": "logs", "log_file": "build.log", "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", - # XXX TODO add host requirements, etc. "consolidate": True, "validate": True, "multi_process": False, @@ -28,6 +32,10 @@ "manifest": None, "test_first_n": None, "test_disable_dirty_get_check": False, + "host_validation_disable": False, # if True, host validation checks will be skipped + "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB + "host_validation_min_swap_space": 2 * 1024**4, # 2TiB + "host_validation_min_free_disk_space": 1 * 1024**4, # 1 TiB } @@ -81,7 +89,7 @@ def __setitem__(self, key: str, value: Any) -> None: class CensusBuildConfig(Namespace): - defaults = CONFIG_DEFAULTS + defaults = CENSUS_CONFIG_DEFAULTS def __init__(self, **kwargs: Any): config = self.defaults.copy() diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py b/tools/cell_census_builder/src/cell_census_builder/host_validation.py index 4de94e9c2..bca5b30eb 100644 --- a/tools/cell_census_builder/src/cell_census_builder/host_validation.py +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -8,11 +8,6 @@ from .build_state import CensusBuildArgs from .logging import hr_binary_unit, hr_decimal_unit -"""Defaults""" -MIN_PHYSICAL_MEMORY = 512 * 1024**3 # 512GiB -MIN_SWAP_MEMORY = 2 * 1024**4 # 2TiB -MIN_FREE_DISK_SPACE = 1 * 1024**4 # 1 TiB - def _check(condition: bool, message: str) -> bool: """Like assert, but logs""" @@ -70,11 +65,18 @@ def check_free_disk(working_dir: Union[str, os.PathLike[str]], min_free_disk_spa def check_host(args: CensusBuildArgs) -> bool: """Verify all host 
requirments. Return True if OK, False if conditions not met""" + if args.config.host_validation_disable: + return True + return ( check_os() - and check_physical_memory(args.config.get("min_physical_memory", MIN_PHYSICAL_MEMORY)) - and check_swap_memory(args.config.get("min_swap_memory", MIN_SWAP_MEMORY)) - and check_free_disk(args.working_dir, args.config.get("min_free_disk_space", MIN_FREE_DISK_SPACE)) + and check_physical_memory( + args.config.get("min_physical_memory", args.config.host_validation_min_physical_memory) + ) + and check_swap_memory(args.config.get("min_swap_memory", args.config.host_validation_min_swap_memory)) + and check_free_disk( + args.working_dir, args.config.get("min_free_disk_space", args.config.host_validation_min_free_disk_space) + ) ) @@ -82,13 +84,14 @@ def check_host(args: CensusBuildArgs) -> bool: # host which does not validate. if __name__ == "__main__": """For CLI testing""" + from .build_state import CENSUS_CONFIG_DEFAULTS def main() -> int: assert ( check_os() - and check_physical_memory(MIN_PHYSICAL_MEMORY) - and check_swap_memory(MIN_SWAP_MEMORY) - and check_free_disk(os.getcwd(), MIN_FREE_DISK_SPACE) + and check_physical_memory(CENSUS_CONFIG_DEFAULTS["host_validation_min_physical_memory"]) # type: ignore[arg-type] + and check_swap_memory(CENSUS_CONFIG_DEFAULTS["host_validation_min_swap_memory"]) # type: ignore[arg-type] + and check_free_disk(os.getcwd(), CENSUS_CONFIG_DEFAULTS["host_validation_min_free_disk_space"]) # type: ignore[arg-type] ) # assumed working directory is CWD print("Host validation success") return 0 From 76c6bbd5489aeca4e74d9d0786412af5f70e37ab Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 20 Mar 2023 20:45:10 +0000 Subject: [PATCH 08/34] update test CLI for host validation --- .../src/cell_census_builder/host_validation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py 
b/tools/cell_census_builder/src/cell_census_builder/host_validation.py index bca5b30eb..fd60db8d4 100644 --- a/tools/cell_census_builder/src/cell_census_builder/host_validation.py +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -87,12 +87,15 @@ def check_host(args: CensusBuildArgs) -> bool: from .build_state import CENSUS_CONFIG_DEFAULTS def main() -> int: - assert ( + if not ( check_os() and check_physical_memory(CENSUS_CONFIG_DEFAULTS["host_validation_min_physical_memory"]) # type: ignore[arg-type] and check_swap_memory(CENSUS_CONFIG_DEFAULTS["host_validation_min_swap_memory"]) # type: ignore[arg-type] and check_free_disk(os.getcwd(), CENSUS_CONFIG_DEFAULTS["host_validation_min_free_disk_space"]) # type: ignore[arg-type] - ) # assumed working directory is CWD + ): # assumed working directory is CWD + print("Host validation FAILURE") + return 1 + print("Host validation success") return 0 From bccd4f1bffe63a0f224869025b815b2b885a1cf5 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 00:35:41 +0000 Subject: [PATCH 09/34] more namespace refactoring --- .../src/cell_census_builder/__main__.py | 19 +++++++++++++++---- .../build_soma/__init__.py | 7 +++++++ .../build_soma/__main__.py | 4 ++-- .../build_soma/{build.py => build_soma.py} | 0 .../{validate.py => validate_soma.py} | 0 .../src/cell_census_builder/build_state.py | 2 +- .../cell_census_builder/tests/test_builder.py | 10 +++++----- 7 files changed, 30 insertions(+), 12 deletions(-) rename tools/cell_census_builder/src/cell_census_builder/build_soma/{build.py => build_soma.py} (100%) rename tools/cell_census_builder/src/cell_census_builder/build_soma/{validate.py => validate_soma.py} (100%) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index 01192ff7b..36636efd2 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ 
b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -7,6 +7,8 @@ import s3fs from . import __version__ +from .build_soma import build as build_a_soma +from .build_soma import validate as validate_a_soma from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig from .host_validation import check_host from .util import process_init, urlcat @@ -67,6 +69,7 @@ def do_build(args: CensusBuildArgs) -> int: do_prebuild_set_defaults, do_prebuild_checks, do_build_soma, + do_validate_soma, do_create_reports, ] try: @@ -79,8 +82,8 @@ def do_build(args: CensusBuildArgs) -> int: return cc logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: complete") - except Exception as e: - logging.critical(f"Caught exception, exiting: {str(e)}") + except Exception: + logging.critical("Caught exception, exiting", exc_info=True) return 1 logging.info("Census build: completed") @@ -113,8 +116,16 @@ def do_prebuild_checks(args: CensusBuildArgs) -> int: def do_build_soma(args: CensusBuildArgs) -> int: - # WIP - # args.state["do_build_soma"] = True + if not build_a_soma(args): + return 1 + args.state["do_build_soma"] = True + return 0 + + +def do_validate_soma(args: CensusBuildArgs) -> int: + if not validate_a_soma(args): + return 1 + args.state["do_validate_soma"] = True return 0 diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py index e69de29bb..1d58b873a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py @@ -0,0 +1,7 @@ +from .build_soma import build +from .validate_soma import validate + +__all__ = [ + "build", + "validate", +] diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py index 
d1951b1b1..f88cfbe36 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py @@ -5,8 +5,8 @@ from ..build_state import CensusBuildArgs, CensusBuildConfig from ..util import process_init -from .build import build -from .validate import validate +from .build_soma import build +from .validate_soma import validate def main() -> int: diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py similarity index 100% rename from tools/cell_census_builder/src/cell_census_builder/build_soma/build.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py similarity index 100% rename from tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 871dfcc1b..66d7e39b2 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -31,7 +31,7 @@ "max_workers": None, "manifest": None, "test_first_n": None, - "test_disable_dirty_get_check": False, + "test_disable_dirty_git_check": False, "host_validation_disable": False, # if True, host validation checks will be skipped "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB "host_validation_min_swap_space": 2 * 1024**4, # 2TiB diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py index 3a626b3da..61e54ae1b 100644 --- 
a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -10,7 +10,8 @@ import pytest import tiledb import tiledbsoma as soma -from cell_census_builder.build_soma.build import build, build_step1_get_source_datasets +from cell_census_builder.build_soma import build, validate +from cell_census_builder.build_soma.build_soma import build_step1_get_source_datasets from cell_census_builder.build_soma.datasets import Dataset from cell_census_builder.build_soma.globals import ( CENSUS_DATA_NAME, @@ -18,7 +19,6 @@ FEATURE_DATASET_PRESENCE_MATRIX_NAME, MEASUREMENT_RNA_NAME, ) -from cell_census_builder.build_soma.validate import validate from cell_census_builder.build_state import CensusBuildArgs @@ -33,10 +33,10 @@ def test_base_builder_creation( """ Runs the builder, queries the census and performs a set of base assertions. """ - with patch("cell_census_builder.build_soma.build.prepare_file_system"), patch( - "cell_census_builder.build_soma.build.build_step1_get_source_datasets", return_value=datasets + with patch("cell_census_builder.build_soma.build_soma.prepare_file_system"), patch( + "cell_census_builder.build_soma.build_soma.build_step1_get_source_datasets", return_value=datasets ), patch("cell_census_builder.build_soma.consolidate._run"), patch( - "cell_census_builder.build_soma.validate.validate_consolidation", return_value=True + "cell_census_builder.build_soma.validate_soma.validate_consolidation", return_value=True ): return_value = build(census_build_args) From 6dcb088c41cd4ecc10e1238cd9b96eb482b2a645 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 02:10:57 +0000 Subject: [PATCH 10/34] add reports to workflow --- tools/cell_census_builder/README.md | 38 +++++++- .../src/cell_census_builder/__main__.py | 49 +++++------ .../src/cell_census_builder/build_state.py | 1 - .../src/cell_census_builder/census_summary.py | 88 +++++++++++-------- 4 files changed, 111 insertions(+), 65 deletions(-) diff --git 
a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 4dbda235d..99c89c337 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -7,13 +7,45 @@ This tool is not intended for end-users - it is used by CZI to periodically crea CELLxGENE data in the above format. The remainder of this document is intended for users of the build package. -Please see the top-level [README](../../README.md) for more information on the Cell Census. +Please see the top-level [README](../../README.md) for more information on the Cell Census and +using the Cell Census data. # Overview This package contains sub-modules, each of which automate elements of the Cell Census build and release process. -The ultimate intention is to integrate these into an automated multi-step workflow. Until that occurs, individual steps -are provided as modules with their own `__main__`, to be manually invoked. +They are wrapped at the package top-level by a `__main__` which implements the Cell Census build process, +with standard defaults. + +The top-level build can be invoked as follows: + +- Create a working directory, e.g., `census-build` or equivalent. +- If any configuration defaults need to be overridden, create a `config.yaml` in the working directory containing the default overrides. +- Run the build as `python -m cell_census_builder your-working_dir` + +This will perform four steps (more will be added in the future): + +- host validation +- build soma +- validate soma +- build reports (e.g., summary) + +This will result in the following file tree: + +``` +working_dir: + | + +-- config.yaml # build config (user provided, read-only) + +-- state.yaml # build runtime state (e.g., census version tag, etc.) + +-- build-version # defaults to current date, e.g., 2023-01-20 + | +-- soma + | +-- h5ads + +-- logs # log files from various stages + | +-- build.log + | +-- ...
+ +-- reports + +-- census-summary-VERSION.txt + +-- census-diff-VERSION.txt +``` ## `host_validation` module diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index 36636efd2..dde5a9925 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -7,30 +7,9 @@ import s3fs from . import __version__ -from .build_soma import build as build_a_soma -from .build_soma import validate as validate_a_soma from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig -from .host_validation import check_host from .util import process_init, urlcat -""" -File tree for the build. - -working_dir: - | - +-- config.yaml # build config (user provided, read-only) - +-- state.yaml # build runtime state (eg., census version tag, etc) - +-- soma - +-- h5ads - +-- logs # log files from various stages - | +-- build.log - | +-- ... - +-- reports - +-- census-summary-VERSION.txt - +-- census-diff-VERSION.txt - -""" - def main() -> int: cli_parser = create_args_parser() @@ -98,6 +77,7 @@ def do_prebuild_set_defaults(args: CensusBuildArgs) -> int: def do_prebuild_checks(args: CensusBuildArgs) -> int: """Pre-build checks for host, config, etc. 
All pre-conditions should go here.""" + from .host_validation import check_host # check host configuration, e.g., free disk space if not check_host(args): @@ -116,22 +96,41 @@ def do_prebuild_checks(args: CensusBuildArgs) -> int: def do_build_soma(args: CensusBuildArgs) -> int: - if not build_a_soma(args): - return 1 + from .build_soma import build as build_a_soma + + if (cc := build_a_soma(args)) != 0: + return cc + args.state["do_build_soma"] = True return 0 def do_validate_soma(args: CensusBuildArgs) -> int: + from .build_soma import validate as validate_a_soma + if not validate_a_soma(args): + logging.critical("Validation of the census build has failed.") return 1 + args.state["do_validate_soma"] = True return 0 def do_create_reports(args: CensusBuildArgs) -> int: - # WIP - # args.state["do_create_reports"] = True + from .census_summary import display_summary, display_diff + + reports_dir = args.working_dir / "reports" + reports_dir.mkdir(parents=True, exist_ok=True) + + logging.info("Creating summary report") + with open(reports_dir / f"census-summary-{args.build_tag}.txt", mode="w") as f: + display_summary(uri=args.soma_path.as_posix(), file=f) + + logging.info("Creating diff report (new build vs 'latest')") + with open(reports_dir / f"census-diff-{args.build_tag}.txt", mode="w") as f: + display_diff(uri=args.soma_path.as_posix(), previous_census_version="latest", file=f) + + args.state["do_create_reports"] = True return 0 diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 66d7e39b2..21036b38b 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -26,7 +26,6 @@ "log_file": "build.log", "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", "consolidate": True, - "validate": True, "multi_process": False, "max_workers": None, "manifest": None, diff --git 
a/tools/cell_census_builder/src/cell_census_builder/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/census_summary.py index 86fe202af..0c016550b 100644 --- a/tools/cell_census_builder/src/cell_census_builder/census_summary.py +++ b/tools/cell_census_builder/src/cell_census_builder/census_summary.py @@ -1,10 +1,11 @@ import argparse import sys +from typing import TextIO, Optional import cell_census import pandas as pd -from .build.globals import CENSUS_DATA_NAME, CENSUS_INFO_NAME +from .build_soma.globals import CENSUS_DATA_NAME, CENSUS_INFO_NAME # Print all of the Pandas DataFrames, except the dimensions pd.options.display.max_columns = None # type: ignore[assignment] # None is legal per Pandas documentation. @@ -13,8 +14,13 @@ pd.options.display.show_dimensions = False # type: ignore[assignment] # boolean is legal per Pandas documentation. -def display_summary(census_version: str) -> int: - census = cell_census.open_soma(census_version=census_version) +def display_summary( + *, + census_version: Optional[str] = "latest", + uri: Optional[str] = None, + file: Optional[TextIO] = None, +) -> int: + census = cell_census.open_soma(census_version=census_version, uri=uri) COLS_TO_QUERY = [ ("soma_joinid", "cells"), @@ -31,18 +37,27 @@ def display_summary(census_version: str) -> int: # Use Pandas to summarize and display stats = [(organism, col[1], df[col[0]].nunique()) for organism, df in obs_df.items() for col in COLS_TO_QUERY] - print(census["census_info"]["summary"].read().concat().to_pandas()[["label", "value"]].to_string(index=False)) + print( + census["census_info"]["summary"].read().concat().to_pandas()[["label", "value"]].to_string(index=False), + file=file, + ) stats_df = pd.DataFrame(stats, columns=["organism", "attribute", "unique count"]) display_stats_df = pd.pivot(stats_df, index=["organism"], columns=["attribute"], values=["unique count"]) - print(display_stats_df) - print() + print(display_stats_df, file=file) + print(file=file) 
return 0 -def display_diff(census_version: str, previous_census_version: str) -> int: - census = cell_census.open_soma(census_version=census_version) - previous_census = cell_census.open_soma(census_version=previous_census_version) +def display_diff( + census_version: Optional[str] = "latest", + uri: Optional[str] = None, + previous_census_version: Optional[str] = None, + previous_uri: Optional[str] = None, + file: Optional[TextIO] = None, +) -> int: + census = cell_census.open_soma(census_version=census_version, uri=uri) + previous_census = cell_census.open_soma(census_version=previous_census_version, uri=previous_uri) # Total cell count deltas by experiment (mouse, human) @@ -50,9 +65,10 @@ def display_diff(census_version: str, previous_census_version: str) -> int: curr_count = census[CENSUS_DATA_NAME][organism].obs.count prev_count = previous_census[CENSUS_DATA_NAME][organism].obs.count print( - f"Previous {organism} cell count: {prev_count}, current {organism} cell count: {curr_count}, delta {curr_count - prev_count}" + f"Previous {organism} cell count: {prev_count}, current {organism} cell count: {curr_count}, delta {curr_count - prev_count}", + file=file, ) - print() + print(file=file) prev_datasets = previous_census[CENSUS_INFO_NAME]["datasets"].read().concat().to_pandas() curr_datasets = census[CENSUS_INFO_NAME]["datasets"].read().concat().to_pandas() @@ -64,20 +80,20 @@ def display_diff(census_version: str, previous_census_version: str) -> int: added_datasets = curr_datasets_ids - prev_dataset_ids removed_datasets = prev_dataset_ids - curr_datasets_ids if added_datasets: - print(f"Datasets that were added ({len(added_datasets)})") + print(f"Datasets that were added ({len(added_datasets)})", file=file) added_datasets_df = curr_datasets[curr_datasets["dataset_id"].isin(added_datasets)] - print(added_datasets_df[["dataset_id", "dataset_title", "collection_name"]]) + print(added_datasets_df[["dataset_id", "dataset_title", "collection_name"]], file=file) else: 
- print("No datasets were added") - print() + print("No datasets were added", file=file) + print(file=file) if removed_datasets: - print(f"Datasets that were removed ({len(removed_datasets)}") + print(f"Datasets that were removed ({len(removed_datasets)})", file=file) removed_datasets_df = prev_datasets[prev_datasets["dataset_id"].isin(removed_datasets)] - print(removed_datasets_df[["dataset_id", "dataset_title", "collection_name"]]) + print(removed_datasets_df[["dataset_id", "dataset_title", "collection_name"]], file=file) else: - print("No datasets were removed") - print() + print("No datasets were removed", file=file) + print(file=file) # Datasets in both versions but that have differing cell counts joined = prev_datasets.join( @@ -88,9 +104,9 @@ def display_diff(census_version: str, previous_census_version: str) -> int: ][["dataset_id", "dataset_total_cell_count_prev", "dataset_total_cell_count_curr"]] if not datasets_with_different_cell_counts.empty: - print("Datasets that have a different cell count") - print(datasets_with_different_cell_counts) - print() + print("Datasets that have a different cell count", file=file) + print(datasets_with_different_cell_counts, file=file) + print(file=file) # Deltas between summary_cell_counts dataframes y = census["census_info"]["summary_cell_counts"].read().concat().to_pandas() @@ -104,17 +120,17 @@ def display_diff(census_version: str, previous_census_version: str) -> int: ["total_cell_count_prev", "total_cell_count_curr"] ].reset_index() if not delta.empty: - print("Summary delta - total cell counts") - print(delta) - print() + print("Summary delta - total cell counts", file=file) + print(delta, file=file) + print(file=file) delta = w.loc[w["unique_cell_count_prev"] != w["unique_cell_count_curr"]][ ["unique_cell_count_prev", "unique_cell_count_curr"] ].reset_index() if not delta.empty: - print("Summary delta - unique cell counts") - print(delta) - print() + print("Summary delta - unique cell counts", file=file) + 
print(delta, file=file) + print(file=file) # Genes removed, added for organism in census[CENSUS_DATA_NAME]: @@ -123,19 +139,19 @@ def display_diff(census_version: str, previous_census_version: str) -> int: new_genes = set(curr_genes["feature_id"]) - set(prev_genes["feature_id"]) if new_genes: - print("Genes added") - print(new_genes) + print("Genes added", file=file) + print(new_genes, file=file) else: "No genes were added." - print() + print(file=file) removed_genes = set(prev_genes["feature_id"]) - set(curr_genes["feature_id"]) if removed_genes: - print("Genes removed") - print(removed_genes) + print("Genes removed", file=file) + print(removed_genes, file=file) else: "No genes were removed." - print() + print(file=file) return 0 @@ -161,9 +177,9 @@ def main() -> int: assert args.subcommand in ["summarize", "diff"] if args.subcommand == "summarize": - return display_summary(args.census_version) + return display_summary(census_version=args.census_version) elif args.subcommand == "diff": - return display_diff(args.census_version, args.previous_version) + return display_diff(census_version=args.census_version, previous_census_version=args.previous_version) return 0 From e8829e0aa0cc4f1c6f09d577e55b1ab425eb0474 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 02:11:24 +0000 Subject: [PATCH 11/34] lint --- tools/cell_census_builder/src/cell_census_builder/__main__.py | 2 +- .../src/cell_census_builder/census_summary.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index dde5a9925..e613967d6 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -117,7 +117,7 @@ def do_validate_soma(args: CensusBuildArgs) -> int: def do_create_reports(args: CensusBuildArgs) -> int: - from .census_summary import display_summary, 
display_diff + from .census_summary import display_diff, display_summary reports_dir = args.working_dir / "reports" reports_dir.mkdir(parents=True, exist_ok=True) diff --git a/tools/cell_census_builder/src/cell_census_builder/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/census_summary.py index 0c016550b..e8b33f791 100644 --- a/tools/cell_census_builder/src/cell_census_builder/census_summary.py +++ b/tools/cell_census_builder/src/cell_census_builder/census_summary.py @@ -1,6 +1,6 @@ import argparse import sys -from typing import TextIO, Optional +from typing import Optional, TextIO import cell_census import pandas as pd From 2c2dba95597ef09723593bcf42c93ebe326096bc Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 02:25:52 +0000 Subject: [PATCH 12/34] handle default config correctly --- .../src/cell_census_builder/__main__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index e613967d6..c1948757b 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -19,14 +19,15 @@ def main() -> int: if not working_dir.is_dir(): logging.critical("Census builder: unable to find working directory - exiting.") return 1 - if not (working_dir / CENSUS_BUILD_CONFIG).is_file(): - logging.critical("Census builder: unable to find config.yaml in working directory - exiting.") - return 1 if (working_dir / CENSUS_BUILD_STATE).exists(): logging.critical("Found pre-existing census build in working directory - aborting census build.") return 1 - build_config = CensusBuildConfig.load(working_dir / CENSUS_BUILD_CONFIG) + if (working_dir / CENSUS_BUILD_CONFIG).is_file(): + build_config = CensusBuildConfig.load(working_dir / CENSUS_BUILD_CONFIG) + else: + build_config = CensusBuildConfig() + build_args = 
CensusBuildArgs(working_dir=working_dir, config=build_config) # Process initialization/setup must be done early From 4680c87c07ff85923f3d599cc92b2c5b4541422f Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 02:26:20 +0000 Subject: [PATCH 13/34] fix typo in defaults --- .../cell_census_builder/src/cell_census_builder/build_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 21036b38b..efafdfea7 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -33,7 +33,7 @@ "test_disable_dirty_git_check": False, "host_validation_disable": False, # if True, host validation checks will be skipped "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB - "host_validation_min_swap_space": 2 * 1024**4, # 2TiB + "host_validation_min_swap_memory": 2 * 1024**4, # 2TiB "host_validation_min_free_disk_space": 1 * 1024**4, # 1 TiB } From cb0d3f7525f0772d5f65b265192002a3ca0b6321 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 23:21:07 +0000 Subject: [PATCH 14/34] fix report typo --- .../src/cell_census_builder/census_summary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/census_summary.py index e8b33f791..dcf472e74 100644 --- a/tools/cell_census_builder/src/cell_census_builder/census_summary.py +++ b/tools/cell_census_builder/src/cell_census_builder/census_summary.py @@ -142,7 +142,7 @@ def display_diff( print("Genes added", file=file) print(new_genes, file=file) else: - "No genes were added." 
+ print("No genes were added.", file=file) print(file=file) removed_genes = set(prev_genes["feature_id"]) - set(curr_genes["feature_id"]) @@ -150,7 +150,7 @@ def display_diff( print("Genes removed", file=file) print(removed_genes, file=file) else: - "No genes were removed." + print("No genes were removed.", file=file) print(file=file) return 0 From d3076160cf4d121d81dd436ae9385eb2f60f84b7 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 23:21:55 +0000 Subject: [PATCH 15/34] fix state load issue; enable multi-process by default --- .../src/cell_census_builder/build_state.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index efafdfea7..91a4a8069 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -10,6 +10,7 @@ from typing import Any, Iterator, Mapping, Union import attrs +import psutil import yaml from typing_extensions import Self @@ -20,21 +21,33 @@ CENSUS_BUILD_CONFIG = "config.yaml" CENSUS_BUILD_STATE = "state.yaml" CENSUS_CONFIG_DEFAULTS = { - "build_tag": datetime.now().astimezone().date().isoformat(), + # General config "verbose": 1, "log_dir": "logs", "log_file": "build.log", - "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", "consolidate": True, - "multi_process": False, - "max_workers": None, - "manifest": None, - "test_first_n": None, - "test_disable_dirty_git_check": False, + # + # Paths and census version name determined by spec. + "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", + "build_tag": datetime.now().astimezone().date().isoformat(), + # + # Default multi-process. Memory scaling based on empirical tests. 
+ "multi_process": True, + "max_workers": 2 + int(psutil.virtual_memory().total / (96 * 1024**3)), + # + # XXX TODO: this exposes a bug in the validation pass + # "multi_process": False, + # "max_workers": None, + # + # Host minimum resource validation "host_validation_disable": False, # if True, host validation checks will be skipped "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB "host_validation_min_swap_memory": 2 * 1024**4, # 2TiB "host_validation_min_free_disk_space": 1 * 1024**4, # 1 TiB + # For testing convenience only + "manifest": None, + "test_first_n": None, + "test_disable_dirty_git_check": False, } @@ -129,11 +142,13 @@ def __setitem__(self, key: str, value: Any) -> None: def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: if isinstance(file, (str, os.PathLike)): with open(file) as state_log: - documents = yaml.safe_load_all(state_log) + documents = list(yaml.safe_load_all(state_log)) else: - documents = yaml.safe_load_all(file) + documents = list(yaml.safe_load_all(file)) - return cls(**functools.reduce(lambda acc, r: acc.update(r) or acc, documents, {})) + state = cls(**functools.reduce(lambda acc, r: acc.update(r) or acc, documents, {})) + state.__dirty_keys.clear() + return state def commit(self, file: Union[str, os.PathLike[str]]) -> None: # append dirty elements (atomic on Posix) From 1f70ab91f6d7b261a68fec253095fa31b18877ea Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 23:22:30 +0000 Subject: [PATCH 16/34] fix typo in program name --- .../src/cell_census_builder/build_soma/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py index f88cfbe36..128f423d3 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py @@ 
-29,7 +29,7 @@ def main() -> int: def create_args_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(prog="cell_census_builder") + parser = argparse.ArgumentParser(prog="cell_census_builder.build_soma") parser.add_argument("uri", type=str, help="Census top-level URI") parser.add_argument("-v", "--verbose", action="count", default=0, help="Increase logging verbosity") parser.add_argument( From e40653a9846fc43122554b006b3a8a9432db2770 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 23:22:50 +0000 Subject: [PATCH 17/34] add build resumption --- .../src/cell_census_builder/__main__.py | 83 +++++++++++-------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index c1948757b..ea68e31c3 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -7,7 +7,7 @@ import s3fs from . 
import __version__ -from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig +from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig, CensusBuildState from .util import process_init, urlcat @@ -19,25 +19,30 @@ def main() -> int: if not working_dir.is_dir(): logging.critical("Census builder: unable to find working directory - exiting.") return 1 - if (working_dir / CENSUS_BUILD_STATE).exists(): - logging.critical("Found pre-existing census build in working directory - aborting census build.") - return 1 if (working_dir / CENSUS_BUILD_CONFIG).is_file(): build_config = CensusBuildConfig.load(working_dir / CENSUS_BUILD_CONFIG) else: build_config = CensusBuildConfig() - build_args = CensusBuildArgs(working_dir=working_dir, config=build_config) + if not cli_args.test_resume: + if (working_dir / CENSUS_BUILD_STATE).exists(): + logging.critical("Found pre-existing census build in working directory - aborting census build.") + return 1 + build_state = CensusBuildState() + else: + build_state = CensusBuildState.load(working_dir / CENSUS_BUILD_STATE) + + build_args = CensusBuildArgs(working_dir=working_dir, config=build_config, state=build_state) # Process initialization/setup must be done early process_init(build_args) # Return process exit code (or raise, which exits with a code of `1`) - return do_build(build_args) + return do_build(build_args, skip_completed_steps=cli_args.test_resume) -def do_build(args: CensusBuildArgs) -> int: +def do_build(args: CensusBuildArgs, skip_completed_steps: bool = False) -> int: """ Top-level build sequence. @@ -45,7 +50,7 @@ def do_build(args: CensusBuildArgs) -> int: exit code or raises. 
""" logging.info(f"Census build: start [version={__version__}]") - build_steps: List[Callable[[CensusBuildArgs], int]] = [ + build_steps: List[Callable[[CensusBuildArgs], bool]] = [ do_prebuild_set_defaults, do_prebuild_checks, do_build_soma, @@ -54,13 +59,19 @@ def do_build(args: CensusBuildArgs) -> int: ] try: for n, build_step in enumerate(build_steps, start=1): - logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: start") - cc = build_step(args) + step_n_of = f"Build step {build_step.__name__} [{n} of {len(build_steps)}]" + if skip_completed_steps and args.state.get(build_step.__name__): + logging.info(f"{step_n_of}: already complete, skipping.") + continue + + logging.info(f"{step_n_of}: start") + if not build_step(args): + logging.critical(f"{step_n_of}: failed, aborting build.") + return 1 + + args.state[build_step.__name__] = True args.state.commit(args.working_dir / CENSUS_BUILD_STATE) - if cc != 0: - logging.critical(f"Build step {build_step.__name__} returned error code {cc}: aborting build.") - return cc - logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: complete") + logging.info(f"{step_n_of}: complete") except Exception: logging.critical("Caught exception, exiting", exc_info=True) @@ -70,19 +81,18 @@ def do_build(args: CensusBuildArgs) -> int: return 0 -def do_prebuild_set_defaults(args: CensusBuildArgs) -> int: - """Set any default state required by build steps.""" - args.state["do_prebuild_set_defaults"] = True - return 0 +def do_prebuild_set_defaults(args: CensusBuildArgs) -> bool: + """Set any defaults required by build steps.""" + return True -def do_prebuild_checks(args: CensusBuildArgs) -> int: +def do_prebuild_checks(args: CensusBuildArgs) -> bool: """Pre-build checks for host, config, etc. 
All pre-conditions should go here.""" from .host_validation import check_host # check host configuration, e.g., free disk space if not check_host(args): - return 1 + return False # verify the build tag is not already published/in use build_tag = args.config.build_tag @@ -90,34 +100,32 @@ def do_prebuild_checks(args: CensusBuildArgs) -> int: s3path = urlcat(args.config.cell_census_S3_path, build_tag) if s3fs.S3FileSystem(anon=True).exists(s3path): logging.error(f"Build tag {build_tag} already exists at {s3path}.") - return 1 + return False - args.state["do_prebuild_checks"] = True - return 0 + return True -def do_build_soma(args: CensusBuildArgs) -> int: +def do_build_soma(args: CensusBuildArgs) -> bool: from .build_soma import build as build_a_soma if (cc := build_a_soma(args)) != 0: - return cc + logging.critical(f"Build of census failed with code {cc}.") + return False - args.state["do_build_soma"] = True - return 0 + return True -def do_validate_soma(args: CensusBuildArgs) -> int: +def do_validate_soma(args: CensusBuildArgs) -> bool: from .build_soma import validate as validate_a_soma if not validate_a_soma(args): logging.critical("Validation of the census build has failed.") - return 1 + return False - args.state["do_validate_soma"] = True - return 0 + return True -def do_create_reports(args: CensusBuildArgs) -> int: +def do_create_reports(args: CensusBuildArgs) -> bool: from .census_summary import display_diff, display_summary reports_dir = args.working_dir / "reports" @@ -131,13 +139,18 @@ def do_create_reports(args: CensusBuildArgs) -> int: with open(reports_dir / f"census-diff-{args.build_tag}.txt", mode="w") as f: display_diff(uri=args.soma_path.as_posix(), previous_census_version="latest", file=f) - args.state["do_create_reports"] = True - return 0 + return True def create_args_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(prog="cell_census_builder") + parser = argparse.ArgumentParser(prog="cell_census_builder", 
description="Build the official cell census.") parser.add_argument("working_dir", type=str, help="Working directory for the build") + parser.add_argument( + "--test-resume", + action=argparse.BooleanOptionalAction, + default=False, + help="Attempt to resume the build by skipping completed workflow steps. CAUTION: TEST OPTION ONLY.", + ) return parser From 18e1b671ea7f854d04535cdedca78da0095aae81 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 14:23:03 +0000 Subject: [PATCH 18/34] dockerfile update --- tools/cell_census_builder/Dockerfile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 tools/cell_census_builder/Dockerfile diff --git a/tools/cell_census_builder/Dockerfile b/tools/cell_census_builder/Dockerfile new file mode 100644 index 000000000..67a7a3d52 --- /dev/null +++ b/tools/cell_census_builder/Dockerfile @@ -0,0 +1,17 @@ +FROM ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +ARG COMMIT_SHA +ENV COMMIT_SHA=${COMMIT_SHA} + +RUN apt update && apt -y full-upgrade && apt -y install python3.10-venv python3-pip awscli + +ADD dist/ /tools/cell_census_builder + +RUN python3 -m pip install -U pip +RUN python3 -m pip -v install /tools/cell_census_builder/*.whl + +WORKDIR /census-build + +ENTRYPOINT ["python3", "-m", "cell_census_builder", "."] From c35554aaa69e56d042be6ef71ab9fd5e0f54c325 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 16:19:26 +0000 Subject: [PATCH 19/34] docker build refinement --- tools/cell_census_builder/Makefile | 10 ++++++ tools/cell_census_builder/README.md | 35 ++++++++++++------- .../src/cell_census_builder/__main__.py | 4 ++- .../build_soma/build_soma.py | 2 +- .../build_soma/manifest.py | 10 ++++-- .../build_soma/source_assets.py | 2 +- .../cell_census_builder/build_soma/util.py | 1 + .../src/cell_census_builder/build_state.py | 3 +- tools/cell_census_builder/tests/conftest.py | 9 +++-- .../cell_census_builder/tests/test_builder.py | 2 +- .../tests/test_manifest.py | 17 
+++++++-- 11 files changed, 66 insertions(+), 29 deletions(-) create mode 100644 tools/cell_census_builder/Makefile diff --git a/tools/cell_census_builder/Makefile b/tools/cell_census_builder/Makefile new file mode 100644 index 000000000..c82691a75 --- /dev/null +++ b/tools/cell_census_builder/Makefile @@ -0,0 +1,10 @@ +# Build docker container + +.PHONY: container +container: + python3 -m build . + docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . + +.PHONY: clean +clean: + rm -rf build dist diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 99c89c337..40b2ce6f1 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -13,7 +13,7 @@ using the Cell Census data. # Overview This package contains sub-modules, each of which automate elements of the Cell Census build and release process. -They are wrapped at the package top-leveby by a `__main__` which implements the Cell Census build process, +They are wrapped at the package top-level by by a `__main__` which implements the Cell Census build process, with standard defaults. The top-level build can be invoked as follows: @@ -47,6 +47,27 @@ working_dir: +-- census-diff-VERSION.txt ``` +# Building and using the Docker container + +The standard Census build is expected to be done via a Docker container. 
+ +To build the container, do a `git pull` to the version you want to use, and do the following to create a container called `cell-census-builder`: + +``` +$ cd tools/cell_census_builder +$ make container +``` + +To use the container to build the _full_ census, with default options, pick a working directory (e.g., /tmp/census-build), and: + +``` +$ mkdir /tmp/census-build +$ chmod ug+s /tmp/census-build # optional, but makes permissions handling simpler +$ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder +``` + +# Module-specific notes + ## `host_validation` module Module which provides a set of checks that the current host machine has the requisite capabilities @@ -102,15 +123,3 @@ If you run out of memory, reduce `--max-workers`. You can also try a higher numb You can specify a file system path or a URI in the second field - To create a cell census at ``, execute: > $ python -m cell_census_builder build --manifest - -### Other info - -There are more options discoverable via the `--help` command line option. - -Note on required host resources: - -- all H5AD files not on the local disk will be downloaded/cached locally. There must be - sufficient local file system space. Location of cache can be controlled with the - environment variable `FSSPEC_CACHE_DIR` -- each H5AD will be read into memory, in its entirety. 
Sufficient RAM must be present to - allow for this (and to do so for multiple H5ADs concurrently if you use the `--multi-process` option) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index ea68e31c3..618234ff2 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -35,7 +35,7 @@ def main() -> int: build_args = CensusBuildArgs(working_dir=working_dir, config=build_config, state=build_state) - # Process initialization/setup must be done early + # Process initialization/setup must be done early. NOTE: do NOT log before this line! process_init(build_args) # Return process exit code (or raise, which exits with a code of `1`) @@ -50,6 +50,8 @@ def do_build(args: CensusBuildArgs, skip_completed_steps: bool = False) -> int: exit code or raises. """ logging.info(f"Census build: start [version={__version__}]") + logging.info(args) + build_steps: List[Callable[[CensusBuildArgs], bool]] = [ do_prebuild_set_defaults, do_prebuild_checks, diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py index ab3693b75..612a0fc38 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py @@ -38,7 +38,7 @@ def prepare_file_system(args: CensusBuildArgs) -> None: raise Exception("Census build path already exists - aborting build") # Ensure that the git tree is clean - if not args.config.test_disable_dirty_git_check and is_git_repo_dirty(): + if not args.config.disable_dirty_git_check and is_git_repo_dirty(): raise Exception("The git repo has uncommitted changes - aborting build") # Create top-level build directories diff --git 
a/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py index 53a6de916..1c7c4d266 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py @@ -68,7 +68,7 @@ def load_manifest_from_CxG() -> List[Dataset]: logging.info(f"Found {len(datasets)} datasets, in {len(collections)} collections") # load per-dataset schema version - with concurrent.futures.ThreadPoolExecutor(max_workers=16) as tp: + with concurrent.futures.ThreadPoolExecutor(max_workers=32) as tp: dataset_metadata = tp.map( lambda d: fetch_json( f"{CXG_BASE_URI}curation/v1/collections/{d['collection_id']}/datasets/{d['dataset_id']}" @@ -130,13 +130,17 @@ def load_manifest_from_CxG() -> List[Dataset]: return [Dataset(**d) for d in datasets.values()] -def load_manifest(manifest_fp: Optional[io.TextIOBase] = None) -> List[Dataset]: +def load_manifest(manifest_fp: Optional[Union[str, io.TextIOBase]] = None) -> List[Dataset]: """ Load dataset manifest from the file pointer if provided, else bootstrap the load rom the CELLxGENE REST API. 
""" if manifest_fp is not None: - datasets = load_manifest_from_fp(manifest_fp) + if isinstance(manifest_fp, str): + with open(manifest_fp) as f: + datasets = load_manifest_from_fp(f) + else: + datasets = load_manifest_from_fp(manifest_fp) else: datasets = load_manifest_from_CxG() diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py index dd2f0041a..15810c2c5 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py @@ -21,7 +21,7 @@ def stage_source_assets(datasets: List[Dataset], args: CensusBuildArgs) -> None: datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize, reverse=True) N = len(datasets) - if not args.config.multi_process: + if args.config.multi_process: n_workers = max(min(8, cpu_count()), 64) with create_process_pool_executor(args, n_workers) as pe: paths = list( diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py index e2adf1c01..f59d9e740 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py @@ -122,6 +122,7 @@ def get_git_commit_sha() -> str: commit_sha_var = os.getenv("COMMIT_SHA") if commit_sha_var is not None: return commit_sha_var + import git # Scoped import - this requires the git executable to exist on the machine repo = git.Repo(search_parent_directories=True) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 91a4a8069..54e892191 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -26,6 +26,7 @@ 
"log_dir": "logs", "log_file": "build.log", "consolidate": True, + "disable_dirty_git_check": True, # # Paths and census version name determined by spec. "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", @@ -44,10 +45,10 @@ "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB "host_validation_min_swap_memory": 2 * 1024**4, # 2TiB "host_validation_min_free_disk_space": 1 * 1024**4, # 1 TiB + # # For testing convenience only "manifest": None, "test_first_n": None, - "test_disable_dirty_git_check": False, } diff --git a/tools/cell_census_builder/tests/conftest.py b/tools/cell_census_builder/tests/conftest.py index 663949807..2f1c525e3 100644 --- a/tools/cell_census_builder/tests/conftest.py +++ b/tools/cell_census_builder/tests/conftest.py @@ -1,4 +1,3 @@ -import io import pathlib from typing import List, Optional @@ -131,7 +130,7 @@ def datasets(census_build_args: CensusBuildArgs) -> List[Dataset]: @pytest.fixture -def manifest_csv(tmp_path: pathlib.Path) -> io.TextIOWrapper: +def manifest_csv(tmp_path: pathlib.Path) -> str: manifest_content = f""" dataset_id_1, {tmp_path}/data/h5ads/dataset_id_1.h5ad dataset_id_2, {tmp_path}/data/h5ads/dataset_id_2.h5ad @@ -144,11 +143,11 @@ def manifest_csv(tmp_path: pathlib.Path) -> io.TextIOWrapper: with open(path, "w+") as f: f.writelines(manifest_content.strip()) - return open(path) + return path @pytest.fixture -def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> io.TextIOWrapper: +def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> str: manifest_content = f""" dataset_id_1, {tmp_path}/data/h5ads/dataset_id_1.h5ad dataset_id_2, {tmp_path}/data/h5ads/dataset_id_2.h5ad @@ -162,7 +161,7 @@ def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> io.TextIOWrapper: with open(path, "w+") as f: f.writelines(manifest_content.strip()) - return open(path) + return path @pytest.fixture() diff --git a/tools/cell_census_builder/tests/test_builder.py 
b/tools/cell_census_builder/tests/test_builder.py index 61e54ae1b..968f80032 100644 --- a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -129,7 +129,7 @@ def test_unicode_support(tmp_path: pathlib.Path) -> None: @pytest.mark.parametrize( "census_build_args", - [dict(manifest=True, test_first_n=None, verbose=2, build_tag="build_tag", multi_process=True)], + [dict(manifest=True, test_first_n=None, verbose=2, build_tag="build_tag", multi_process=True, max_workers=2)], indirect=True, ) def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, census_build_args: CensusBuildArgs) -> None: diff --git a/tools/cell_census_builder/tests/test_manifest.py b/tools/cell_census_builder/tests/test_manifest.py index 89f6077dc..cbf8773c5 100644 --- a/tools/cell_census_builder/tests/test_manifest.py +++ b/tools/cell_census_builder/tests/test_manifest.py @@ -1,4 +1,3 @@ -import io import pathlib import re from unittest.mock import patch @@ -6,7 +5,7 @@ from cell_census_builder.build_soma.manifest import CXG_BASE_URI, load_manifest -def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: io.TextIOWrapper) -> None: +def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: str) -> None: """ If specified a parameter, `load_manifest` should load the dataset manifest from such file. 
""" @@ -17,14 +16,26 @@ def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: io.TextIO assert manifest[0].corpora_asset_h5ad_uri == f"{tmp_path}/data/h5ads/dataset_id_1.h5ad" assert manifest[1].corpora_asset_h5ad_uri == f"{tmp_path}/data/h5ads/dataset_id_2.h5ad" + with open(manifest_csv) as fp: + manifest = load_manifest(fp) + assert len(manifest) == 2 + assert manifest[0].dataset_id == "dataset_id_1" + assert manifest[1].dataset_id == "dataset_id_2" + assert manifest[0].corpora_asset_h5ad_uri == f"{tmp_path}/data/h5ads/dataset_id_1.h5ad" + assert manifest[1].corpora_asset_h5ad_uri == f"{tmp_path}/data/h5ads/dataset_id_2.h5ad" -def test_load_manifest_does_dedup(manifest_csv_with_duplicates: io.TextIOWrapper) -> None: + +def test_load_manifest_does_dedup(manifest_csv_with_duplicates: str) -> None: """ `load_manifest` should not include duplicate datasets from the manifest """ manifest = load_manifest(manifest_csv_with_duplicates) assert len(manifest) == 2 + with open(manifest_csv_with_duplicates) as fp: + manifest = load_manifest(fp) + assert len(manifest) == 2 + def test_load_manifest_from_cxg() -> None: """ From fe93cb3fd3b29057c7070981046e9792db79931d Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 17:55:07 +0000 Subject: [PATCH 20/34] refine builder build process --- tools/cell_census_builder/Dockerfile | 3 ++- tools/cell_census_builder/Makefile | 6 +++--- tools/cell_census_builder/README.md | 24 +++++++++++++++++------- tools/cell_census_builder/entrypoint.sh | 4 ++++ tools/cell_census_builder/pyproject.toml | 8 ++++---- 5 files changed, 30 insertions(+), 15 deletions(-) create mode 100644 tools/cell_census_builder/entrypoint.sh diff --git a/tools/cell_census_builder/Dockerfile b/tools/cell_census_builder/Dockerfile index 67a7a3d52..0b09650f5 100644 --- a/tools/cell_census_builder/Dockerfile +++ b/tools/cell_census_builder/Dockerfile @@ -7,6 +7,7 @@ ENV COMMIT_SHA=${COMMIT_SHA} RUN apt update && apt -y full-upgrade && apt -y 
install python3.10-venv python3-pip awscli +ADD entrypoint.sh / ADD dist/ /tools/cell_census_builder RUN python3 -m pip install -U pip @@ -14,4 +15,4 @@ RUN python3 -m pip -v install /tools/cell_census_builder/*.whl WORKDIR /census-build -ENTRYPOINT ["python3", "-m", "cell_census_builder", "."] +ENTRYPOINT ["/bin/bash", "/entrypoint.sh"] diff --git a/tools/cell_census_builder/Makefile b/tools/cell_census_builder/Makefile index c82691a75..6a90caedc 100644 --- a/tools/cell_census_builder/Makefile +++ b/tools/cell_census_builder/Makefile @@ -1,7 +1,7 @@ -# Build docker container +# Build docker image. This Makefile is for developer convenience. -.PHONY: container -container: +.PHONY: image +image: python3 -m build . docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 40b2ce6f1..d38016a22 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -10,7 +10,7 @@ build package. Please see the top-level [README](../../README.md) for more information on the Cell Census and using the Cell Census data. -# Overview +## Overview This package contains sub-modules, each of which automate elements of the Cell Census build and release process. They are wrapped at the package top-level by a `__main__` which implements the Cell Census build process, @@ -47,7 +47,7 @@ working_dir: +-- census-diff-VERSION.txt ``` -# Building and using the Docker container +## Building and using the Docker container The standard Census build is expected to be done via a Docker container. 
@@ -66,9 +66,19 @@ $ chmod ug+s /tmp/census-build # optional, but makes permissions handling simp $ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder ``` -# Module-specific notes +### Commands to cleanup local Docker state on your ec2 instance (while building an image) -## `host_validation` module +Docker keeps around intermediate layers/images and if your machine doesn't have enough memory, you might run into issues. You can blow away these cached layers/images by running the following commands. + +``` +docker system prune +docker rm -f $(docker ps -aq) +docker rmi -f $(docker images -q) +``` + +## Module-specific notes + +### `host_validation` module Module which provides a set of checks that the current host machine has the requisite capabilities to build the census (e.g., free disk space). Raises exception (non-zero process exit) if host is @@ -76,7 +86,7 @@ unable to meet base requirements. Stand-alone usage: `python -m cell_census_builder.host_validation` -## `build_soma` module +### `build_soma` module Stand-alone use: `python -m cell_census_builder.build_soma ...` @@ -100,7 +110,7 @@ Modes of operation: a) (default) creating the entire "cell census" using all files currently in the CELLxGENE repository. b) creating a smaller "cell census" from a user-provided list of files (a "manifest") -### Mode (a) - creating the full cell census from the entire CELLxGENE (public) corpus: +#### Mode (a) - creating the full cell census from the entire CELLxGENE (public) corpus: - On a large-memory machine with _ample_ free (local) disk (eg, 3/4 TB or more) and swap (1 TB or more) - To create a cell census at ``, execute: @@ -112,7 +122,7 @@ b) creating a smaller "cell census" from a user-provided list of files (a "manif If you run out of memory, reduce `--max-workers`. You can also try a higher number if you have lots of CPU & memory. 
-### Mode (b) - creating a cell census from a user-provided list of H5AD files: +#### Mode (b) - creating a cell census from a user-provided list of H5AD files: - Create a manifest file, in CSV format, containing two columns: dataset_id, h5ad_uri. Example: ```csv diff --git a/tools/cell_census_builder/entrypoint.sh b/tools/cell_census_builder/entrypoint.sh new file mode 100644 index 000000000..cf3f21f10 --- /dev/null +++ b/tools/cell_census_builder/entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +python3 -m cell_census_builder . --help + diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml index 3d10e4b6d..d5a1f4faa 100644 --- a/tools/cell_census_builder/pyproject.toml +++ b/tools/cell_census_builder/pyproject.toml @@ -46,10 +46,10 @@ dependencies= [ "pyyaml", ] -# [tool.setuptools.packages.find] -# where = ["src"] -# include = ["cell_census_builder*"] # package names should match these glob patterns (["*"] by default) -# exclude = ["tests*", "scripts*"] # exclude packages matching these glob patterns (empty by default) +[tool.setuptools.packages.find] +where = ["src"] +include = ["cell_census_builder*"] # package names should match these glob patterns (["*"] by default) +exclude = ["tests*"] # exclude packages matching these glob patterns (empty by default) [tool.setuptools_scm] root = "../.." 
From 5da72eca913167d9c1e066394b60451cbc1c42b8 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 17:55:34 +0000 Subject: [PATCH 21/34] add GHA for docker image build --- .github/workflows/py-build.yml | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/.github/workflows/py-build.yml b/.github/workflows/py-build.yml index 6c1790c03..e1268660f 100644 --- a/.github/workflows/py-build.yml +++ b/.github/workflows/py-build.yml @@ -2,8 +2,9 @@ name: Python cell_census build on: pull_request: - paths-ignore: - - "api/r/**" + paths: + - "api/python/**" + - "tools/cell_census_builder/**" push: branches: [main] workflow_dispatch: @@ -34,3 +35,28 @@ jobs: uses: actions/upload-artifact@v3 with: path: api/python/cell_census/dist/* + + build_docker_container: + name: Build Docker image for Census Builder + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install deps + run: | + python -m pip install -U pip setuptools build + + - name: Build package + run: python -m build + working-directory: tools/cell_census_builder/ + + - name: Build image + run: docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . 
+ working-directory: tools/cell_census_builder/ From accb53b51708f5c24d346eb6d732f9f451a92e0d Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 18:14:36 +0000 Subject: [PATCH 22/34] update readme --- tools/cell_census_builder/README.md | 10 ++++- tools/census-builder-workflow/Dockerfile | 17 ------- tools/census-builder-workflow/README.md | 45 ------------------- .../census-builder-workflow/build-census.yaml | 11 ----- tools/census-builder-workflow/entrypoint.py | 3 -- 5 files changed, 9 insertions(+), 77 deletions(-) delete mode 100644 tools/census-builder-workflow/Dockerfile delete mode 100644 tools/census-builder-workflow/README.md delete mode 100644 tools/census-builder-workflow/build-census.yaml delete mode 100755 tools/census-builder-workflow/entrypoint.py diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index d38016a22..4a73fef1e 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -3,7 +3,7 @@ This package contains code to build and release the Cell Census in the SOMA format, as specified in the [data schema](https://github.com/chanzuckerberg/cell-census/blob/main/docs/cell_census_schema.md). -This tool is not intended for end-users - it is used by CZI to periodically create and release all +This tool is not intended for end-users - it is used by the CELLxGENE team to periodically create and release all CELLxGENE data in the above format. The remainder of this document is intended for users of the build package. @@ -66,6 +66,14 @@ $ chmod ug+s /tmp/census-build # optional, but makes permissions handling simp $ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder ``` +### Build configuration options + +To be documented. They are all present in the `build_state.py` file. + +### Building the docker image + +The image is built by a GHA workflow. 
For developer builds, there is a target present in `tools/cell_census_builder/Makefile`. + ### Commands to cleanup local Docker state on your ec2 instance (while building an image) Docker keeps around intermediate layers/images and if your machine doesn't have enough memory, you might run into issues. You can blow away these cached layers/images by running the following commands. diff --git a/tools/census-builder-workflow/Dockerfile b/tools/census-builder-workflow/Dockerfile deleted file mode 100644 index 899445ff3..000000000 --- a/tools/census-builder-workflow/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM ubuntu:22.04 - -ENV DEBIAN_FRONTEND=noninteractive - -ARG COMMIT_SHA -ENV COMMIT_SHA=${COMMIT_SHA} - -RUN apt update && apt -y install python3.10-venv python3-pip awscli - -ADD cell_census_builder/ /tools/cell_census_builder -ADD scripts/requirements.txt . -ADD entrypoint.py . -ADD build-census.yaml . - -RUN python3 -m pip install -r requirements.txt - -ENTRYPOINT ["./entrypoint.py"] diff --git a/tools/census-builder-workflow/README.md b/tools/census-builder-workflow/README.md deleted file mode 100644 index 37512c232..000000000 --- a/tools/census-builder-workflow/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Cell Census Builder Workflow - -This subproject can be used to run a cell-census build using a Docker container and a custom workflow file. - -## Instructions - -### Build - -To build the docker container, `cd` into the parent folder (`tools/`) and run: - -```docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) . -t census-builder``` - -This will build a Docker container named `census-builder`. - -### Prepare - -Before running the workflow, make sure that a `data` directory exists on the machine. This can contain any inputs for the builder (e.g. a manifest file and local `h5ad`s), and will also be used to output the built cell census. This folder will also need to contain a `build-census.yaml` file as defined in the next step. 
- - -### Create workflow file - -In the `data` folder, create a `build-census.yaml` file that contain a workflow that will be executed by the builder. This should also contain all the parameters for the workflow. - -Here is an example workflow that runs the builder using a manifest file: - -``` -census-builder: - uri: - /data/cell-census-small/ - verbose: - true - commands: - build: - manifest: - /data/manifest-small.csv - test-disable-dirty-git-check: - true -``` - - -### Run - -Run the builder workflow with: - -```docker run --mount type=bind,source="path/to/data",target=/data census-builder``` \ No newline at end of file diff --git a/tools/census-builder-workflow/build-census.yaml b/tools/census-builder-workflow/build-census.yaml deleted file mode 100644 index 17678f15e..000000000 --- a/tools/census-builder-workflow/build-census.yaml +++ /dev/null @@ -1,11 +0,0 @@ -census-builder: - uri: - /data/cell-census-small/ - verbose: - true - commands: - build: - manifest: - /data/manifest-small.csv - test-disable-dirty-git-check: - true diff --git a/tools/census-builder-workflow/entrypoint.py b/tools/census-builder-workflow/entrypoint.py deleted file mode 100755 index 4e5e2cc8a..000000000 --- a/tools/census-builder-workflow/entrypoint.py +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/python3 - -print("Calling the builder...") From 40b1c9052ee25da79fbf173d1527fa57e8ef69f6 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 18:20:35 +0000 Subject: [PATCH 23/34] fix entry point --- tools/cell_census_builder/entrypoint.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/cell_census_builder/entrypoint.sh b/tools/cell_census_builder/entrypoint.sh index cf3f21f10..31e2437bc 100644 --- a/tools/cell_census_builder/entrypoint.sh +++ b/tools/cell_census_builder/entrypoint.sh @@ -1,4 +1,3 @@ #!/bin/bash -python3 -m cell_census_builder . --help - +python3 -m cell_census_builder . 
From 86993eed2a1b8531f9b0f897f2dfb4c06fb75d7a Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 18:38:05 +0000 Subject: [PATCH 24/34] more readme edits --- tools/cell_census_builder/README.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 4a73fef1e..98771b371 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -49,13 +49,11 @@ working_dir: ## Building and using the Docker container -The standard Census build is expected to be done via a Docker container. - -To build the container, do a `git pull` to the version you want to use, and do the following to create a container called `cell-census-builder`: +The standard Census build is expected to be done via a Docker container. To build the required image, do a `git pull` to the version you want to use, and do the following to create a docker image called `cell-census-builder`: ``` $ cd tools/cell_census_builder -$ make container +$ make image ``` To use the container to build the _full_ census, with default options, pick a working directory (e.g., /tmp/census-build), and: @@ -68,11 +66,7 @@ $ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-b ### Build configuration options -To be documented. They are all present in the `build_state.py` file. - -### Building the docker image - -The image is built by a GHA workflow. For developer builds, there is a target present in `tools/cell_census_builder/Makefile`. +To be documented. Defaults are defined in the `build_state.py` file, and can be passed to the build process by creating a `config.yaml` in the build working directory. 
### Commands to cleanup local Docker state on your ec2 instance (while building an image) From 2c8e2b083dc2c50b6717afce35f0437902d3dd68 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 19:24:03 +0000 Subject: [PATCH 25/34] fix owlready2 installation in docker image --- tools/cell_census_builder/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/Dockerfile b/tools/cell_census_builder/Dockerfile index 0b09650f5..5073ba55a 100644 --- a/tools/cell_census_builder/Dockerfile +++ b/tools/cell_census_builder/Dockerfile @@ -10,8 +10,8 @@ RUN apt update && apt -y full-upgrade && apt -y install python3.10-venv python3- ADD entrypoint.sh / ADD dist/ /tools/cell_census_builder -RUN python3 -m pip install -U pip -RUN python3 -m pip -v install /tools/cell_census_builder/*.whl +RUN python3 -m pip install -U pip Cython wheel build +RUN python3 -m pip install /tools/cell_census_builder/*.whl WORKDIR /census-build From f103b5cd10155b380ae8485699d522de627fb539 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 20:45:19 +0000 Subject: [PATCH 26/34] PR feedback --- tools/cell_census_builder/README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 98771b371..236060fa7 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -19,7 +19,7 @@ with standard defaults. The top-level build can be invoked as follows: - Create a working directory, e.g., `census-build` or equivalent. -- If any configuration defaults need to be overridden, create a `config.yaml` in the working directory containing the default overrides. +- If any configuration defaults need to be overridden, create a `config.yaml` in the working directory containing the default overrides. 
_NOTE:_ by default you do not need to create a `config.yaml` file -- the defaults are appropriate to build the full Census. - Run the build as `python -m cell_census_builder your-working_dir` This will perform four steps (more will be added in the future): @@ -66,7 +66,7 @@ ### Build configuration options -To be documented. Defaults are defined in the `build_state.py` file, and can be passed to the build process by creating a `config.yaml` in the build working directory. +This is primarily for the use of package developers. The defaults are suitable for the standard Census build, and are defined in the `build_state.py` file. + +If you need to override a default, create `config.yaml` in the build working directory and specify the overrides. An example `config.yaml` might look like: + +``` +verbose: 2 # debug level logging +consolidate: false # disable TileDB consolidation +``` ### Commands to cleanup local Docker state on your ec2 instance (while building an image) From 756d6f7b72321ffae747c2948e98687c8292924c Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 20:52:38 +0000 Subject: [PATCH 27/34] PR feedback --- .../src/cell_census_builder/build_soma/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py index 1c7c4d266..d3768d675 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py @@ -68,7 +68,7 @@ def load_manifest_from_CxG() -> List[Dataset]: logging.info(f"Found {len(datasets)} datasets, in {len(collections)} collections") # load per-dataset schema version - with concurrent.futures.ThreadPoolExecutor(max_workers=32) as tp: + with concurrent.futures.ThreadPoolExecutor() as tp: dataset_metadata 
= tp.map( lambda d: fetch_json( f"{CXG_BASE_URI}curation/v1/collections/{d['collection_id']}/datasets/{d['dataset_id']}" From 151036992792db2e640a7b0aa2b227e6acae54dc Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 21:14:59 +0000 Subject: [PATCH 28/34] fix email address in metadata --- tools/cell_census_builder/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml index d5a1f4faa..869e89371 100644 --- a/tools/cell_census_builder/pyproject.toml +++ b/tools/cell_census_builder/pyproject.toml @@ -7,7 +7,7 @@ name = "cell_census_builder" dynamic = ["version"] description = "Build Cell Census" authors = [ - { name = "Chan Zuckerberg Initiative", email = "cellxgene@chanzuckerberg.com" } + { name = "Chan Zuckerberg Initiative", email = "soma@chanzuckerberg.com" } ] license = { text = "MIT" } readme = "README.md" From 0566b17386418f4c9cde6dfc64fbc64141183f72 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 01:14:58 +0000 Subject: [PATCH 29/34] add file size integrity check on downloads --- .../src/cell_census_builder/build_soma/source_assets.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py index 15810c2c5..da893d9d4 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py @@ -46,6 +46,11 @@ def _copy_file(n: int, dataset: Dataset, asset_dir: str, N: int) -> str: logging.info(f"Staging {dataset.dataset_id} ({n} of {N}) to {dataset_path}") fs.get_file(dataset.corpora_asset_h5ad_uri, dataset_path) + + # verify file size is as expected, if we know the size a priori + assert (dataset.asset_h5ad_filesize == -1) or (dataset.asset_h5ad_filesize == 
os.path.getsize(dataset_path)) + # TODO: add integrity checksum as well. Waiting on feature request chanzuckerberg/single-cell-data-portal#4392 + logging.info(f"Staging {dataset.dataset_id} ({n} of {N}) complete") return dataset_file_name From 84acafba5c5fcf0c01f3a7f7bf570b3f40475d97 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 02:33:05 +0000 Subject: [PATCH 30/34] add missing broken process pool logger --- .../src/cell_census_builder/build_soma/validate_soma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py index a322d8993..0c315673a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py @@ -213,6 +213,7 @@ def validate_axis_dataframes( for dataset in datasets ] for n, future in enumerate(concurrent.futures.as_completed(futures), start=1): + log_on_broken_process_pool(ppe) res = future.result() for eb_name, ebi in res.items(): eb_info[eb_name].update(ebi) From 4f50e11b61d3c9671dc75f6e33a27677d524658f Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 03:05:40 +0000 Subject: [PATCH 31/34] tweak developer Makefile for builder --- tools/cell_census_builder/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/Makefile b/tools/cell_census_builder/Makefile index 6a90caedc..c5f2dd697 100644 --- a/tools/cell_census_builder/Makefile +++ b/tools/cell_census_builder/Makefile @@ -1,7 +1,7 @@ # Build docker image. This Makefile is for developer convenience. .PHONY: image -image: +image: clean python3 -m build . docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . 
From e4f4c5704a813274afc65b6ec1a492e7d6f72a8a Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 03:31:32 +0000 Subject: [PATCH 32/34] clean up comments --- .../src/cell_census_builder/build_state.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 54e892191..6a05c16b8 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -36,10 +36,6 @@ "multi_process": True, "max_workers": 2 + int(psutil.virtual_memory().total / (96 * 1024**3)), # - # XXX TODO: this exposes a bug in the validation pass - # "multi_process": False, - # "max_workers": None, - # # Host minimum resource validation "host_validation_disable": False, # if True, host validation checks will be skipped "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB From 2eb588ecd7537e3c235c4a3981801f6097f29692 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 16:12:52 +0000 Subject: [PATCH 33/34] PR feedback --- tools/cell_census_builder/Makefile | 16 +++++++++++- tools/cell_census_builder/README.md | 26 +++++++++++++------ .../src/cell_census_builder/__init__.py | 7 +---- tools/scripts/requirements-dev.txt | 2 ++ 4 files changed, 36 insertions(+), 15 deletions(-) diff --git a/tools/cell_census_builder/Makefile b/tools/cell_census_builder/Makefile index c5f2dd697..abf3f1b06 100644 --- a/tools/cell_census_builder/Makefile +++ b/tools/cell_census_builder/Makefile @@ -1,10 +1,24 @@ -# Build docker image. This Makefile is for developer convenience. +# Build docker image. This Makefile is for convenience in development, +# and as a means to manually build in advance of pushing the image to +# a registry. +# +# COMING SOON: Docker builds for routine use are created by a GHA, and +# will be available in a Docker repository. 
+# Create the image .PHONY: image image: clean python3 -m build . docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . +# Clean Python build .PHONY: clean clean: rm -rf build dist + +# Prune docker cache +.PHONY: prune +prune: + docker system prune -f + if [ "$(docker ps -aq)" ]; then docker rm -f $(docker ps -aq) ; fi + if [ "$(docker images -q)" ]; then docker rmi -f $(docker images -q) ; fi diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 236060fa7..fb3c1f8b6 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -49,19 +49,29 @@ working_dir: ## Building and using the Docker container +### Prerequisites + +You will need: + +- Linux - known to work on Ubuntu 20 and 22, and should work fine on most other (modern) Linux distros +- Docker - [primary installation instructions](https://docs.docker.com/engine/install/ubuntu/#installation-methods) and [important post-install configuration](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user) +- Python 3.9+ + +### Build & run + The standard Census build is expected to be done via a Docker container. 
To build the required image, do a `git pull` to the version you want to use, and do the following to create a docker image called `cell-census-builder`: -``` -$ cd tools/cell_census_builder -$ make image +```shell +cd tools/cell_census_builder +make image ``` To use the container to build the _full_ census, with default options, pick a working directory (e.g., /tmp/census-build), and: -``` -$ mkdir /tmp/census-build -$ chmod ug+s /tmp/census-build # optional, but makes permissions handling simpler -$ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder +```shell +mkdir /tmp/census-build +chmod ug+s /tmp/census-build # optional, but makes permissions handling simpler +docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder ``` ### Build configuration options @@ -79,7 +89,7 @@ consolidate: false # disable TileDB consolidation Docker keeps around intermediate layers/images and if your machine doesn't have enough memory, you might run into issues. You can blow away these cached layers/images by running the following commands. 
-``` +```shell docker system prune docker rm -f $(docker ps -aq) docker rmi -f $(docker images -q) diff --git a/tools/cell_census_builder/src/cell_census_builder/__init__.py b/tools/cell_census_builder/src/cell_census_builder/__init__.py index 4cd7c916d..584b56c05 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__init__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__init__.py @@ -1,9 +1,4 @@ -try: - from importlib import metadata -except ImportError: - # for python <=3.7 - import importlib_metadata as metadata # type: ignore[no-redef] - +from importlib import metadata try: __version__ = metadata.version("cell_census_builder") diff --git a/tools/scripts/requirements-dev.txt b/tools/scripts/requirements-dev.txt index ddb07b83d..ec6b31fa1 100644 --- a/tools/scripts/requirements-dev.txt +++ b/tools/scripts/requirements-dev.txt @@ -1,3 +1,5 @@ pytest coverage requests-mock +setuptools +build From 509ffd20b1a5c6243269dee8656770c361b4d54f Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 16:20:00 +0000 Subject: [PATCH 34/34] fix typo --- tools/cell_census_builder/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index fb3c1f8b6..bdf8819b3 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -71,7 +71,7 @@ To use the container to build the _full_ census, with default options, pick a wo ```shell mkdir /tmp/census-build chmod ug+s /tmp/census-build # optional, but makes permissions handling simpler -docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder +docker run --mount type=bind,source="/tmp/census-build",target='/census-build' cell-census-builder ``` ### Build configuration options