From 10b30cd1a1b03f5eb7f8f96f3c8b7923a8ebcb38 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Mar 2023 20:56:48 +0000 Subject: [PATCH 01/34] reorganize census builder --- tools/cell_census_builder/README.md | 39 +++++++--- tools/cell_census_builder/pyproject.toml | 74 +++++++++++++++++++ .../src/cell_census_builder/__init__.py | 12 +++ .../build_soma}/__init__.py | 0 .../build_soma}/__main__.py | 0 .../build_soma}/anndata.py | 0 .../build_soma}/census_summary.py | 0 .../build_soma}/consolidate.py | 0 .../build_soma}/datasets.py | 0 .../build_soma}/experiment_builder.py | 0 .../build_soma}/globals.py | 0 .../build_soma}/manifest.py | 0 .../cell_census_builder/build_soma}/mp.py | 10 +-- .../build_soma}/source_assets.py | 0 .../build_soma}/summary_cell_counts.py | 0 .../build_soma}/tissue_mapper.py | 0 .../cell_census_builder/build_soma}/util.py | 0 .../build_soma}/validate.py | 0 .../cell_census_builder}/census_summary.py | 2 +- .../cell_census_builder/host_validation.py | 67 +++++++++++++++++ .../src/cell_census_builder/logging.py | 26 +++++++ .../tests/anndata/conftest.py | 4 +- .../tests/anndata/test_anndata.py | 16 ++-- tools/cell_census_builder/tests/conftest.py | 17 ++--- .../cell_census_builder/tests/test_builder.py | 25 +++---- tools/cell_census_builder/tests/test_main.py | 2 +- .../tests/test_manifest.py | 8 +- .../tests/test_source_assets.py | 6 +- tools/cell_census_builder/tests/test_util.py | 3 +- .../scripts/aws/mount_instance_storage.sh | 0 .../scripts/aws/swapon_instance_storage.sh | 0 tools/scripts/requirements.txt | 17 ----- 32 files changed, 247 insertions(+), 81 deletions(-) create mode 100644 tools/cell_census_builder/pyproject.toml create mode 100644 tools/cell_census_builder/src/cell_census_builder/__init__.py rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/__init__.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/__main__.py (100%) rename tools/cell_census_builder/{ => 
src/cell_census_builder/build_soma}/anndata.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/census_summary.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/consolidate.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/datasets.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/experiment_builder.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/globals.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/manifest.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/mp.py (86%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/source_assets.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/summary_cell_counts.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/tissue_mapper.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/util.py (100%) rename tools/cell_census_builder/{ => src/cell_census_builder/build_soma}/validate.py (100%) rename tools/cell_census_builder/{scripts/release => src/cell_census_builder}/census_summary.py (98%) create mode 100644 tools/cell_census_builder/src/cell_census_builder/host_validation.py create mode 100644 tools/cell_census_builder/src/cell_census_builder/logging.py rename tools/{cell_census_builder => }/scripts/aws/mount_instance_storage.sh (100%) rename tools/{cell_census_builder => }/scripts/aws/swapon_instance_storage.sh (100%) delete mode 100644 tools/scripts/requirements.txt diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 7afb948b4..4dbda235d 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -1,16 +1,31 @@ # README -This is a tool to build the SOMA instantiation of the Cell Census schema, as 
specified in this doc: +This package contains code to build and release the Cell Census in the SOMA format, as specified in the +[data schema](https://github.com/chanzuckerberg/cell-census/blob/main/docs/cell_census_schema.md). -https://docs.google.com/document/d/1GKndzCk9q_1SdYOq3BeCxWgp-o2NSQkEmSBaBPKnNI8/ +This tool is not intended for end-users - it is used by CZI to periodically create and release all +CELLxGENE data in the above format. The remainder of this document is intended for users of the +build package. -CAVEATS (READ THIS): +Please see the top-level [README](../../README.md) for more information on the Cell Census. -1. The code is written to the still-rapidly-evolving and **pre-release** Python SOMA API, _and will be subject to change_ as the SOMA API and `tiledbsoma` evolve and stabilize. -2. The schema implemented by this code is still evolving and subject to change. -3. The `cell_census_builder` package requires Python 3.9 or later. +# Overview -## Usage +This package contains sub-modules, each of which automate elements of the Cell Census build and release process. +The ultimate intention is to integrate these into an automated multi-step workflow. Until that occurs, individual steps +are provided as modules with their own `__main__`, to be manually invoked. + +## `host_validation` module + +Module which provides a set of checks that the current host machine has the requisite capabilities +to build the census (e.g., free disk space). Raises exception (non-zero process exit) if host is +unable to meet base requirements. + +Stand-alone usage: `python -m cell_census_builder.host_validation` + +## `build_soma` module + +Stand-alone use: `python -m cell_census_builder.build_soma ...` TL;DR: @@ -25,7 +40,7 @@ The build process: - Step 3: Write the axis dataframes for each experiment, filtering the datasets and cells to include (serialized iteration of dataset H5ADs). 
- Step 4: Write the X layers for each experiment (parallelized iteration of filtered dataset H5ADs). - Step 5: Write datasets manifest and summary info. -- (Optional) Consolidate TileDB data +- (Optional) Consolidate TileDB data - (Optional) Validate the entire Cell Census, re-reading from storage. Modes of operation: @@ -37,10 +52,10 @@ b) creating a smaller "cell census" from a user-provided list of files (a "manif - On a large-memory machine with _ample_ free (local) disk (eg, 3/4 TB or more) and swap (1 TB or more) - To create a cell census at ``, execute: > $ python -m cell_census_builder -mp --max-workers 12 build -- Tips: - - `-v` to view info-level logging during run, or `-v -v` for debug-level logging - - `--test-first-n <#>` to test build on a subset of datasets - - `--build-tag $(date +'%Y%m%d_%H%M%S')` to produce non-conflicting census build directories during testing +- Tips: + - `-v` to view info-level logging during run, or `-v -v` for debug-level logging + - `--test-first-n <#>` to test build on a subset of datasets + - `--build-tag $(date +'%Y%m%d_%H%M%S')` to produce non-conflicting census build directories during testing If you run out of memory, reduce `--max-workers`. You can also try a higher number if you have lots of CPU & memory. 
diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml new file mode 100644 index 000000000..a5bf541e0 --- /dev/null +++ b/tools/cell_census_builder/pyproject.toml @@ -0,0 +1,74 @@ +[build-system] +requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "cell_census_builder" +dynamic = ["version"] +description = "Build Cell Census" +authors = [ + { name = "Chan Zuckerberg Initiative", email = "cellxgene@chanzuckerberg.com" } +] +license = { text = "MIT" } +readme = "README.md" +requires-python = ">= 3.9, < 3.11" # Python 3.11 is pending numba support +classifiers = [ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS :: MacOS X", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +dependencies= [ + "pyarrow", + "pandas", + "anndata>=0.8", + "numpy", + # NOTE: The builder's version of tiledbsoma MUST be <= the API's tiledbsoma version, to ensure reader compatibility + # with TileDB on-disk storage format + "tiledbsoma==1.0.0", + "scipy", + "fsspec", + "s3fs", + "requests", + "aiohttp", + "Cython", # required by owlready2 + "wheel", # required by owlready2 + "owlready2", + "gitpython", + "attrs>=22.2.0", + "psutil", + "cell_census==0.10.0", + "typing_extensions", +] + +# [tool.setuptools.packages.find] +# where = ["src"] +# include = ["cell_census_builder*"] # package names should match these glob patterns (["*"] by default) +# exclude = ["tests*", "scripts*"] # exclude packages matching these glob patterns (empty by default) + +[tool.setuptools_scm] +root = "../.." 
+ +[tool.black] +line-length = 120 +target_version = ['py39'] + +[tool.mypy] +show_error_codes = true +ignore_missing_imports = true +warn_unreachable = true +strict = true +plugins = "numpy.typing.mypy_plugin" + +[tool.ruff] +select = ["E", "F", "B", "I"] +ignore = ["E501", "E402", "C408", ] +line-length = 120 +target-version = "py39" diff --git a/tools/cell_census_builder/src/cell_census_builder/__init__.py b/tools/cell_census_builder/src/cell_census_builder/__init__.py new file mode 100644 index 000000000..16e5282c0 --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/__init__.py @@ -0,0 +1,12 @@ +try: + from importlib import metadata +except ImportError: + # for python <=3.7 + import importlib_metadata as metadata # type: ignore[no-redef] + + +try: + __version__ = metadata.version("cell_census") +except metadata.PackageNotFoundError: + # package is not installed + __version__ = "0.0.0-unknown" diff --git a/tools/cell_census_builder/__init__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py similarity index 100% rename from tools/cell_census_builder/__init__.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py diff --git a/tools/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py similarity index 100% rename from tools/cell_census_builder/__main__.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py diff --git a/tools/cell_census_builder/anndata.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py similarity index 100% rename from tools/cell_census_builder/anndata.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py diff --git a/tools/cell_census_builder/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/census_summary.py similarity index 100% rename from tools/cell_census_builder/census_summary.py rename to 
tools/cell_census_builder/src/cell_census_builder/build_soma/census_summary.py diff --git a/tools/cell_census_builder/consolidate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py similarity index 100% rename from tools/cell_census_builder/consolidate.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py diff --git a/tools/cell_census_builder/datasets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/datasets.py similarity index 100% rename from tools/cell_census_builder/datasets.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/datasets.py diff --git a/tools/cell_census_builder/experiment_builder.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py similarity index 100% rename from tools/cell_census_builder/experiment_builder.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py diff --git a/tools/cell_census_builder/globals.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/globals.py similarity index 100% rename from tools/cell_census_builder/globals.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/globals.py diff --git a/tools/cell_census_builder/manifest.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py similarity index 100% rename from tools/cell_census_builder/manifest.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py diff --git a/tools/cell_census_builder/mp.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py similarity index 86% rename from tools/cell_census_builder/mp.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py index b78e57da0..bd5d9b580 100644 --- a/tools/cell_census_builder/mp.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py @@ -5,6 +5,8 @@ import os from typing import Optional, cast +from 
..logging import setup_logging + def cpu_count() -> int: """Sign, os.cpu_count() returns None if "undetermined" number of CPUs""" @@ -15,13 +17,7 @@ def cpu_count() -> int: def process_initializer(verbose: int = 0) -> None: - level = logging.DEBUG if verbose > 1 else logging.INFO if verbose == 1 else logging.WARNING - logging.basicConfig( - format="%(asctime)s %(process)-7s %(levelname)-8s %(message)s", - level=level, - datefmt="%Y-%m-%d %H:%M:%S", - ) - logging.captureWarnings(True) + setup_logging(verbose) def create_process_pool_executor( diff --git a/tools/cell_census_builder/source_assets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py similarity index 100% rename from tools/cell_census_builder/source_assets.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py diff --git a/tools/cell_census_builder/summary_cell_counts.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/summary_cell_counts.py similarity index 100% rename from tools/cell_census_builder/summary_cell_counts.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/summary_cell_counts.py diff --git a/tools/cell_census_builder/tissue_mapper.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/tissue_mapper.py similarity index 100% rename from tools/cell_census_builder/tissue_mapper.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/tissue_mapper.py diff --git a/tools/cell_census_builder/util.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py similarity index 100% rename from tools/cell_census_builder/util.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/util.py diff --git a/tools/cell_census_builder/validate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py similarity index 100% rename from tools/cell_census_builder/validate.py rename to 
tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py diff --git a/tools/cell_census_builder/scripts/release/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/census_summary.py similarity index 98% rename from tools/cell_census_builder/scripts/release/census_summary.py rename to tools/cell_census_builder/src/cell_census_builder/census_summary.py index 08cdbe31f..86fe202af 100644 --- a/tools/cell_census_builder/scripts/release/census_summary.py +++ b/tools/cell_census_builder/src/cell_census_builder/census_summary.py @@ -4,7 +4,7 @@ import cell_census import pandas as pd -from tools.cell_census_builder.globals import CENSUS_DATA_NAME, CENSUS_INFO_NAME +from .build.globals import CENSUS_DATA_NAME, CENSUS_INFO_NAME # Print all of the Pandas DataFrames, except the dimensions pd.options.display.max_columns = None # type: ignore[assignment] # None is legal per Pandas documentation. diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py b/tools/cell_census_builder/src/cell_census_builder/host_validation.py new file mode 100644 index 000000000..e0c407710 --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -0,0 +1,67 @@ +import logging +import os +import sys +from typing import Optional + +import psutil + +from cell_census_builder.logging import setup_logging, hr_multibyte_unit + +"""Minimum physical RAM""" +MIN_RAM = 512 * 1024**3 # 512GiB + +"""Minimum virtual memory/swap""" +MIN_SWAP = 2 * 1024**4 # 2TiB + +"""Minimum free disk space""" +MIN_FREE_DISK_SPACE = 1 * 1024**4 # 1 TiB + + +def check_os() -> None: + """ + Check that we run on Posix (Linux, MacOS), as we rely on + Posix semantics for a few things. + """ + assert psutil.POSIX + + +def check_memory() -> None: + """ + Check for sufficient physical and virtual memory. 
+ """ + svmem = psutil.virtual_memory() + logging.debug(f"Host: {hr_multibyte_unit(svmem.total)} memory found") + assert svmem.total >= MIN_RAM, f"Insufficient memory (found {svmem.total}, require {MIN_RAM})" + + svswap = psutil.swap_memory() + logging.debug(f"Host: {hr_multibyte_unit(svswap.total)} swap found") + assert svswap.total >= MIN_SWAP, f"Insufficient swap space (found {svswap.total}, require {MIN_SWAP})" + + +def check_free_disk(working_dir: Optional[str] = ".") -> None: + """ + Check for sufficient free disk space. + """ + skdiskusage = psutil.disk_usage(working_dir) + logging.debug(f"Host: {hr_multibyte_unit(skdiskusage.free)} free disk space found") + assert ( + skdiskusage.free >= MIN_FREE_DISK_SPACE + ), f"Insufficient free disk space (found {skdiskusage.free}, require {MIN_FREE_DISK_SPACE})" + + +def run_all_checks() -> int: + """ + Run all host validation checks. Returns zero or raises an exception. + """ + check_os() + check_memory() + check_free_disk(os.getcwd()) # assumed working directory is CWD + logging.info("Host validation success") + return 0 + + +# Process MUST return zero on success (all good) or non-zero on a +# host which does not validate. 
+if __name__ == "__main__": + setup_logging(verbose=1) + sys.exit(run_all_checks()) diff --git a/tools/cell_census_builder/src/cell_census_builder/logging.py b/tools/cell_census_builder/src/cell_census_builder/logging.py new file mode 100644 index 000000000..987046b6d --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/logging.py @@ -0,0 +1,26 @@ +import logging +import math + + +def setup_logging(verbose: int = 0) -> None: + """ + Configure the logger + """ + level = logging.DEBUG if verbose > 1 else logging.INFO if verbose == 1 else logging.WARNING + logging.basicConfig( + format="%(asctime)s %(process)-7s %(levelname)-8s %(message)s", + level=level, + datefmt="%Y-%m-%d %H:%M:%S", + ) + logging.captureWarnings(True) + + +def hr_multibyte_unit(n_bytes: int) -> str: + """Convert number of bytes into a human-readable binary (power of 1024) multi-byte unit string.""" + if n_bytes == 0: + return "0B" + + unit_size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB") + unit = int(math.floor(math.log(n_bytes, 1024))) + n_units = round(n_bytes / math.pow(1024, unit)) + return f"{n_units}{unit_size_name[unit]}" diff --git a/tools/cell_census_builder/tests/anndata/conftest.py b/tools/cell_census_builder/tests/anndata/conftest.py index 354538825..a2d0a78bf 100644 --- a/tools/cell_census_builder/tests/anndata/conftest.py +++ b/tools/cell_census_builder/tests/anndata/conftest.py @@ -2,9 +2,9 @@ import anndata as ad import pytest +from cell_census_builder.build_soma.datasets import Dataset -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.tests.conftest import ORGANISMS, get_h5ad +from ..conftest import ORGANISMS, get_h5ad @pytest.fixture diff --git a/tools/cell_census_builder/tests/anndata/test_anndata.py b/tools/cell_census_builder/tests/anndata/test_anndata.py index 8b2a37f03..adbc95b57 100644 --- a/tools/cell_census_builder/tests/anndata/test_anndata.py +++ 
b/tools/cell_census_builder/tests/anndata/test_anndata.py @@ -2,10 +2,10 @@ import anndata as ad import numpy as np +from cell_census_builder.build_soma.anndata import get_cellxgene_schema_version, make_anndata_cell_filter, open_anndata +from cell_census_builder.build_soma.datasets import Dataset -from tools.cell_census_builder.anndata import get_cellxgene_schema_version, make_anndata_cell_filter, open_anndata -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.tests.conftest import ORGANISMS +from ..conftest import ORGANISMS def test_open_anndata(datasets: List[Dataset]) -> None: @@ -76,7 +76,7 @@ def test_open_anndata_equalizes_raw_and_normalized(datasets_with_larger_raw_laye def test_make_anndata_cell_filter(h5ad_simple: ad.AnnData) -> None: - func = make_anndata_cell_filter({}) # type: ignore + func = make_anndata_cell_filter({}) filtered_h5ad = func(h5ad_simple) assert h5ad_simple.var.equals(filtered_h5ad.var) assert h5ad_simple.obs.equals(filtered_h5ad.obs) @@ -86,28 +86,28 @@ def test_make_anndata_cell_filter(h5ad_simple: ad.AnnData) -> None: def test_make_anndata_cell_filter_filters_out_organoids_cell_culture( h5ad_with_organoids_and_cell_culture: ad.AnnData, ) -> None: - func = make_anndata_cell_filter({}) # type: ignore + func = make_anndata_cell_filter({}) filtered_h5ad = func(h5ad_with_organoids_and_cell_culture) assert h5ad_with_organoids_and_cell_culture.var.equals(filtered_h5ad.var) assert filtered_h5ad.obs.shape[0] == 2 def test_make_anndata_cell_filter_organism(h5ad_with_organism: ad.AnnData) -> None: - func = make_anndata_cell_filter({"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id}) # type: ignore + func = make_anndata_cell_filter({"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id}) filtered_h5ad = func(h5ad_with_organism) assert h5ad_with_organism.var.equals(filtered_h5ad.var) assert filtered_h5ad.obs.shape[0] == 3 def 
test_make_anndata_cell_filter_feature_biotype_gene(h5ad_with_feature_biotype: ad.AnnData) -> None: - func = make_anndata_cell_filter({}) # type: ignore + func = make_anndata_cell_filter({}) filtered_h5ad = func(h5ad_with_feature_biotype) assert h5ad_with_feature_biotype.obs.equals(filtered_h5ad.obs) assert filtered_h5ad.var.shape[0] == 3 def test_make_anndata_cell_filter_assay(h5ad_with_assays: ad.AnnData) -> None: - func = make_anndata_cell_filter({"assay_ontology_term_ids": ["EFO:1234", "EFO:1235"]}) # type: ignore + func = make_anndata_cell_filter({"assay_ontology_term_ids": ["EFO:1234", "EFO:1235"]}) filtered_h5ad = func(h5ad_with_assays) assert filtered_h5ad.obs.shape[0] == 2 assert list(filtered_h5ad.obs.index) == ["1", "3"] diff --git a/tools/cell_census_builder/tests/conftest.py b/tools/cell_census_builder/tests/conftest.py index 6b3a83e18..3b8363289 100644 --- a/tools/cell_census_builder/tests/conftest.py +++ b/tools/cell_census_builder/tests/conftest.py @@ -9,13 +9,12 @@ import pandas as pd import pytest from _pytest.monkeypatch import MonkeyPatch -from scipy import sparse - -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.globals import ( +from cell_census_builder.build_soma.datasets import Dataset +from cell_census_builder.build_soma.globals import ( CENSUS_X_LAYERS_PLATFORM_CONFIG, ) -from tools.cell_census_builder.mp import process_initializer +from cell_census_builder.build_soma.mp import process_initializer +from scipy import sparse @attrs.define(frozen=True) @@ -168,9 +167,5 @@ def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> io.TextIOWrapper: @pytest.fixture() def setup(monkeypatch: MonkeyPatch) -> None: process_initializer() - monkeypatch.setitem( - CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_0"], "tile", 2 # type: ignore - ) - monkeypatch.setitem( - CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_1"], "tile", 2 # type: ignore - ) + 
monkeypatch.setitem(CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_0"], "tile", 2) + monkeypatch.setitem(CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_1"], "tile", 2) diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py index fe802f713..10b752346 100644 --- a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -10,17 +10,16 @@ import pyarrow as pa import tiledb import tiledbsoma as soma - -from tools.cell_census_builder.__main__ import build, build_step1_get_source_datasets, make_experiment_specs -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.experiment_builder import ExperimentBuilder -from tools.cell_census_builder.globals import ( +from cell_census_builder.build_soma.__main__ import build, build_step1_get_source_datasets, make_experiment_specs +from cell_census_builder.build_soma.datasets import Dataset +from cell_census_builder.build_soma.experiment_builder import ExperimentBuilder +from cell_census_builder.build_soma.globals import ( CENSUS_DATA_NAME, CENSUS_INFO_NAME, FEATURE_DATASET_PRESENCE_MATRIX_NAME, MEASUREMENT_RNA_NAME, ) -from tools.cell_census_builder.validate import validate +from cell_census_builder.build_soma.validate import validate def test_base_builder_creation( @@ -29,10 +28,10 @@ def test_base_builder_creation( """ Runs the builder, queries the census and performs a set of base assertions. 
""" - with patch("tools.cell_census_builder.__main__.prepare_file_system"), patch( - "tools.cell_census_builder.__main__.build_step1_get_source_datasets", return_value=datasets - ), patch("tools.cell_census_builder.consolidate._run"), patch( - "tools.cell_census_builder.validate.validate_consolidation", return_value=True + with patch("cell_census_builder.build_soma.__main__.prepare_file_system"), patch( + "cell_census_builder.build_soma.__main__.build_step1_get_source_datasets", return_value=datasets + ), patch("cell_census_builder.build_soma.consolidate._run"), patch( + "cell_census_builder.build_soma.validate.validate_consolidation", return_value=True ): # Patching consolidate_tiledb_object, becuase is uses to much memory to run in github actions. experiment_specifications = make_experiment_specs() @@ -41,13 +40,13 @@ def test_base_builder_creation( from types import SimpleNamespace args = SimpleNamespace(multi_process=False, consolidate=True, build_tag="test_tag", verbose=True) - return_value = build(args, soma_path, assets_path, experiment_builders) # type: ignore[arg-type] + return_value = build(args, soma_path, assets_path, experiment_builders) # return_value = 0 means that the build succeeded assert return_value == 0 # validate the cell_census - return_value = validate(args, soma_path, assets_path, experiment_specifications) # type: ignore[arg-type] + return_value = validate(args, soma_path, assets_path, experiment_specifications) assert return_value is True # Query the census and do assertions @@ -137,7 +136,7 @@ def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, manifest_csv: i args = SimpleNamespace(manifest=manifest_csv, test_first_n=None, verbose=2, multi_process=True) # Call the function - datasets = build_step1_get_source_datasets(args, f"{tmp_path}/dest") # type: ignore + datasets = build_step1_get_source_datasets(args, f"{tmp_path}/dest") # Verify that 2 datasets are returned assert len(datasets) == 2 diff --git 
a/tools/cell_census_builder/tests/test_main.py b/tools/cell_census_builder/tests/test_main.py index 83985fd06..ff0cbf0a9 100644 --- a/tools/cell_census_builder/tests/test_main.py +++ b/tools/cell_census_builder/tests/test_main.py @@ -1,4 +1,4 @@ -from tools.cell_census_builder.__main__ import create_args_parser +from cell_census_builder.build_soma.__main__ import create_args_parser def test_create_args_parser_default_build() -> None: diff --git a/tools/cell_census_builder/tests/test_manifest.py b/tools/cell_census_builder/tests/test_manifest.py index 348acdb2a..89f6077dc 100644 --- a/tools/cell_census_builder/tests/test_manifest.py +++ b/tools/cell_census_builder/tests/test_manifest.py @@ -3,7 +3,7 @@ import re from unittest.mock import patch -from tools.cell_census_builder.manifest import CXG_BASE_URI, load_manifest +from cell_census_builder.build_soma.manifest import CXG_BASE_URI, load_manifest def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: io.TextIOWrapper) -> None: @@ -30,7 +30,7 @@ def test_load_manifest_from_cxg() -> None: """ If no parameters are specified, `load_manifest` should load the dataset list from Discover API. """ - with patch("tools.cell_census_builder.manifest.fetch_json") as m: + with patch("cell_census_builder.build_soma.manifest.fetch_json") as m: def mock_call_fn(uri): # type: ignore if uri == f"{CXG_BASE_URI}curation/v1/collections": @@ -61,7 +61,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_old_schema() -> None: """ `load_manifest` should exclude datasets that do not have a current schema version. 
""" - with patch("tools.cell_census_builder.manifest.fetch_json") as m: + with patch("cell_census_builder.build_soma.manifest.fetch_json") as m: def mock_call_fn(uri): # type: ignore if uri == f"{CXG_BASE_URI}curation/v1/collections": @@ -94,7 +94,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets() -> None: """ `load_manifest` should exclude datasets that do not have assets """ - with patch("tools.cell_census_builder.manifest.fetch_json") as m: + with patch("cell_census_builder.build_soma.manifest.fetch_json") as m: def mock_call_fn(uri): # type: ignore if uri == f"{CXG_BASE_URI}curation/v1/collections": diff --git a/tools/cell_census_builder/tests/test_source_assets.py b/tools/cell_census_builder/tests/test_source_assets.py index c6f0d4ab9..0b5f5707e 100644 --- a/tools/cell_census_builder/tests/test_source_assets.py +++ b/tools/cell_census_builder/tests/test_source_assets.py @@ -1,8 +1,8 @@ import pathlib from types import ModuleType, SimpleNamespace -from tools.cell_census_builder.datasets import Dataset -from tools.cell_census_builder.source_assets import stage_source_assets +from cell_census_builder.build_soma.datasets import Dataset +from cell_census_builder.build_soma.source_assets import stage_source_assets def test_source_assets(tmp_path: pathlib.Path) -> None: @@ -18,7 +18,7 @@ def test_source_assets(tmp_path: pathlib.Path) -> None: datasets.append(dataset) # Call the function - stage_source_assets(datasets, SimpleNamespace(verbose=True), tmp_path / "dest") # type: ignore + stage_source_assets(datasets, SimpleNamespace(verbose=True), tmp_path / "dest") # Verify that the files exist for i in range(10): diff --git a/tools/cell_census_builder/tests/test_util.py b/tools/cell_census_builder/tests/test_util.py index c1ed55d13..77aaf8dbf 100644 --- a/tools/cell_census_builder/tests/test_util.py +++ b/tools/cell_census_builder/tests/test_util.py @@ -1,9 +1,8 @@ import numpy as np import pytest +from cell_census_builder.build_soma.util import 
array_chunker, is_nonnegative_integral, uricat from scipy.sparse import coo_matrix, csr_matrix, triu -from tools.cell_census_builder.util import array_chunker, is_nonnegative_integral, uricat - def test_is_nonnegative_integral() -> None: X = np.array([1, 2, 3], dtype=np.float32) diff --git a/tools/cell_census_builder/scripts/aws/mount_instance_storage.sh b/tools/scripts/aws/mount_instance_storage.sh similarity index 100% rename from tools/cell_census_builder/scripts/aws/mount_instance_storage.sh rename to tools/scripts/aws/mount_instance_storage.sh diff --git a/tools/cell_census_builder/scripts/aws/swapon_instance_storage.sh b/tools/scripts/aws/swapon_instance_storage.sh similarity index 100% rename from tools/cell_census_builder/scripts/aws/swapon_instance_storage.sh rename to tools/scripts/aws/swapon_instance_storage.sh diff --git a/tools/scripts/requirements.txt b/tools/scripts/requirements.txt deleted file mode 100644 index 605ba7ab4..000000000 --- a/tools/scripts/requirements.txt +++ /dev/null @@ -1,17 +0,0 @@ -pyarrow -pandas -anndata -numpy -# NOTE: The builder's version of tiledbsoma MUST be <= the API's tiledbsoma version, to ensure reader compatibility -# with TileDB on-disk storage format -tiledbsoma==1.0.0 -scipy -fsspec -s3fs -requests -aiohttp -Cython # required by owlready2 -wheel # required by owlready2 -owlready2 -gitpython -attrs>=22.2.0 From 2399c97530a2565d380166be69c858947220ee06 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Mar 2023 21:42:39 +0000 Subject: [PATCH 02/34] refactor files in build_soma --- .../build_soma/__main__.py | 291 +----------------- .../cell_census_builder/build_soma/build.py | 256 +++++++++++++++ .../build_soma/experiment_specs.py | 34 ++ .../cell_census_builder/host_validation.py | 2 +- .../cell_census_builder/tests/test_builder.py | 7 +- 5 files changed, 300 insertions(+), 290 deletions(-) create mode 100644 tools/cell_census_builder/src/cell_census_builder/build_soma/build.py create mode 100644 
tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py index 5b15d2829..f2f5606e6 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py @@ -1,70 +1,16 @@ import argparse -import gc -import logging import multiprocessing -import os.path import sys -from datetime import datetime, timezone -from typing import List +from datetime import datetime -import tiledbsoma as soma - -from .anndata import open_anndata -from .census_summary import create_census_summary -from .consolidate import consolidate -from .datasets import Dataset, assign_dataset_soma_joinids, create_dataset_manifest -from .experiment_builder import ( - ExperimentBuilder, - ExperimentSpecification, - populate_X_layers, - reopen_experiment_builders, -) -from .globals import ( - CENSUS_DATA_NAME, - CENSUS_INFO_NAME, - CENSUS_SCHEMA_VERSION, - CXG_SCHEMA_VERSION, - RNA_SEQ, - SOMA_TileDB_Context, -) -from .manifest import load_manifest +from .build import build +from .experiment_builder import ExperimentBuilder +from .experiment_specs import make_experiment_specs from .mp import process_initializer -from .source_assets import stage_source_assets -from .summary_cell_counts import create_census_summary_cell_counts -from .util import get_git_commit_sha, is_git_repo_dirty, uricat +from .util import uricat from .validate import validate -def make_experiment_specs() -> List[ExperimentSpecification]: - """ - Define all soma.Experiments to build in the census. - - Functionally, this defines per-experiment name, anndata filter, etc. - It also loads any required per-Experiment assets. 
- """ - GENE_LENGTH_BASE_URI = ( - "https://raw.githubusercontent.com/chanzuckerberg/single-cell-curation/" - "100f935eac932e1f5f5dadac0627204da3790f6f/cellxgene_schema_cli/cellxgene_schema/ontology_files/" - ) - GENE_LENGTH_URIS = [ - GENE_LENGTH_BASE_URI + "genes_homo_sapiens.csv.gz", - GENE_LENGTH_BASE_URI + "genes_mus_musculus.csv.gz", - GENE_LENGTH_BASE_URI + "genes_sars_cov_2.csv.gz", - ] - return [ # The soma.Experiments we want to build - ExperimentSpecification.create( - name="homo_sapiens", - anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ), - gene_feature_length_uris=GENE_LENGTH_URIS, - ), - ExperimentSpecification.create( - name="mus_musculus", - anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ), - gene_feature_length_uris=GENE_LENGTH_URIS, - ), - ] - - def main() -> int: parser = create_args_parser() args = parser.parse_args() @@ -90,233 +36,6 @@ def main() -> int: return cc -def prepare_file_system(soma_path: str, assets_path: str, args: argparse.Namespace) -> None: - """ - Prepares the file system for the builder run - """ - # Don't clobber an existing census build - if os.path.exists(soma_path) or os.path.exists(assets_path): - raise Exception("Census build path already exists - aborting build") - - # Ensure that the git tree is clean - if not args.test_disable_dirty_git_check and is_git_repo_dirty(): - raise Exception("The git repo has uncommitted changes - aborting build") - - # Create top-level build directories - os.makedirs(soma_path, exist_ok=False) - os.makedirs(assets_path, exist_ok=False) - - -def build( - args: argparse.Namespace, soma_path: str, assets_path: str, experiment_builders: List[ExperimentBuilder] -) -> int: - """ - Approximately, build steps are: - 1. Download manifest and copy/stage all source assets - 2. 
Read all H5AD and create axis dataframe (serial) - * write obs/var dataframes - * accumulate overall shape of X - 3. Read all H5AD assets again, write X layer (parallel) - 4. Optional: validate - - Returns - ------- - int - Process completion code, 0 on success, non-zero indicating error, - suitable for providing to sys.exit() - """ - - try: - prepare_file_system(soma_path, assets_path, args) - except Exception as e: - logging.error(e) - return 1 - - # Step 1 - get all source datasets - datasets = build_step1_get_source_datasets(args, assets_path) - - # Step 2 - create root collection, and all child objects, but do not populate any dataframes or matrices - root_collection = build_step2_create_root_collection(soma_path, experiment_builders) - gc.collect() - - # Step 3 - populate axes - filtered_datasets = build_step3_populate_obs_and_var_axes(assets_path, datasets, experiment_builders) - - # Step 4 - populate X layers - build_step4_populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) - gc.collect() - - # Step 5- write out dataset manifest and summary information - build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.build_tag) - - # consolidate TileDB data - if args.consolidate: - consolidate(args, root_collection.uri) - - return 0 - - -def populate_root_collection(root_collection: soma.Collection) -> soma.Collection: - """ - Create the root SOMA collection for the Census. - - Returns the root collection. - """ - - # Set root metadata for the experiment - root_collection.metadata["created_on"] = datetime.now(tz=timezone.utc).isoformat(timespec="seconds") - root_collection.metadata["cxg_schema_version"] = CXG_SCHEMA_VERSION - root_collection.metadata["census_schema_version"] = CENSUS_SCHEMA_VERSION - - sha = get_git_commit_sha() - root_collection.metadata["git_commit_sha"] = sha - - # Create sub-collections for experiments, etc. 
- for n in [CENSUS_INFO_NAME, CENSUS_DATA_NAME]: - root_collection.add_new_collection(n) - - return root_collection - - -def build_step1_get_source_datasets(args: argparse.Namespace, assets_path: str) -> List[Dataset]: - logging.info("Build step 1 - get source assets - started") - - # Load manifest defining the datasets - datasets = load_manifest(args.manifest) - if len(datasets) == 0: - logging.error("No H5AD files in the manifest (or we can't find the files)") - raise AssertionError("No H5AD files in the manifest (or we can't find the files)") - - # Testing/debugging hook - hidden option - if args.test_first_n is not None and args.test_first_n > 0: - # Process the N smallest datasets - datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.test_first_n] - - # Stage all files - stage_source_assets(datasets, args, assets_path) - - logging.info("Build step 1 - get source assets - finished") - return datasets - - -def populate_obs_axis( - assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder] -) -> List[Dataset]: - filtered_datasets = [] - N = len(datasets) * len(experiment_builders) - n = 0 - - for dataset, ad in open_anndata(assets_path, datasets, backed="r"): - dataset_total_cell_count = 0 - - for eb in reopen_experiment_builders(experiment_builders): - n += 1 - logging.info(f"{eb.name}: filtering dataset '{dataset.dataset_id}' ({n} of {N})") - ad_filtered = eb.filter_anndata_cells(ad) - - if len(ad_filtered.obs) == 0: # type:ignore - logging.info(f"{eb.name} - H5AD has no data after filtering, skipping {dataset.dataset_h5ad_path}") - continue - - # append to `obs`; accumulate `var` data - dataset_total_cell_count += eb.accumulate_axes(dataset, ad_filtered) - - # dataset passes filter if either experiment includes cells from the dataset - if dataset_total_cell_count > 0: - filtered_datasets.append(dataset) - dataset.dataset_total_cell_count = dataset_total_cell_count - - for eb in experiment_builders: - 
logging.info(f"Experiment {eb.name} will contain {eb.n_obs} cells from {eb.n_datasets} datasets") - - return filtered_datasets - - -def populate_var_axis_and_presence(experiment_builders: List[ExperimentBuilder]) -> None: - for eb in reopen_experiment_builders(experiment_builders): - # populate `var`; create empty `presence` now that we have its dimensions - eb.populate_var_axis() - - -def build_step2_create_root_collection(soma_path: str, experiment_builders: List[ExperimentBuilder]) -> soma.Collection: - """ - Create all objects - - Returns: the root collection. - """ - logging.info("Build step 2 - Create root collection - started") - - with soma.Collection.create(soma_path, context=SOMA_TileDB_Context()) as root_collection: - populate_root_collection(root_collection) - - for e in experiment_builders: - e.create(census_data=root_collection[CENSUS_DATA_NAME]) - - logging.info("Build step 2 - Create root collection - finished") - return root_collection - - -def build_step3_populate_obs_and_var_axes( - assets_path: str, - datasets: List[Dataset], - experiment_builders: List[ExperimentBuilder], -) -> List[Dataset]: - """ - Populate obs and var axes. Filter cells from datasets for each experiment, as obs is built. - """ - logging.info("Build step 3 - Populate obs and var axes - started") - - filtered_datasets = populate_obs_axis(assets_path, datasets, experiment_builders) - logging.info(f"({len(filtered_datasets)} of {len(datasets)}) datasets suitable for processing.") - - populate_var_axis_and_presence(experiment_builders) - - assign_dataset_soma_joinids(filtered_datasets) - - logging.info("Build step 3 - Populate obs and var axes - finished") - - return filtered_datasets - - -def build_step4_populate_X_layers( - assets_path: str, - filtered_datasets: List[Dataset], - experiment_builders: List[ExperimentBuilder], - args: argparse.Namespace, -) -> None: - """ - Populate X layers. 
- """ - logging.info("Build step 4 - Populate X layers - started") - - # Process all X data - for eb in reopen_experiment_builders(experiment_builders): - eb.create_X_with_layers() - - populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) - - for eb in reopen_experiment_builders(experiment_builders): - eb.populate_presence_matrix(filtered_datasets) - - logging.info("Build step 4 - Populate X layers - finished") - - -def build_step5_populate_summary_info( - root_collection: soma.Collection, - experiment_builders: List[ExperimentBuilder], - filtered_datasets: List[Dataset], - build_tag: str, -) -> None: - logging.info("Build step 5 - Populate summary info - started") - - with soma.Collection.open(root_collection[CENSUS_INFO_NAME].uri, "w", context=SOMA_TileDB_Context()) as census_info: - create_dataset_manifest(census_info, filtered_datasets) - create_census_summary_cell_counts(census_info, [e.census_summary_cell_counts for e in experiment_builders]) - create_census_summary(census_info, experiment_builders, build_tag) - - logging.info("Build step 5 - Populate summary info - finished") - - def create_args_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(prog="cell_census_builder") parser.add_argument("uri", type=str, help="Census top-level URI") diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py new file mode 100644 index 000000000..e3c126751 --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py @@ -0,0 +1,256 @@ +import argparse +import gc +import logging +import os.path +from datetime import datetime, timezone +from typing import List + +import tiledbsoma as soma + +from .anndata import open_anndata +from .census_summary import create_census_summary +from .consolidate import consolidate +from .datasets import Dataset, assign_dataset_soma_joinids, create_dataset_manifest +from 
.experiment_builder import ( + ExperimentBuilder, + populate_X_layers, + reopen_experiment_builders, +) +from .globals import ( + CENSUS_DATA_NAME, + CENSUS_INFO_NAME, + CENSUS_SCHEMA_VERSION, + CXG_SCHEMA_VERSION, + SOMA_TileDB_Context, +) +from .manifest import load_manifest +from .source_assets import stage_source_assets +from .summary_cell_counts import create_census_summary_cell_counts +from .util import get_git_commit_sha, is_git_repo_dirty + + +def prepare_file_system(soma_path: str, assets_path: str, args: argparse.Namespace) -> None: + """ + Prepares the file system for the builder run + """ + # Don't clobber an existing census build + if os.path.exists(soma_path) or os.path.exists(assets_path): + raise Exception("Census build path already exists - aborting build") + + # Ensure that the git tree is clean + if not args.test_disable_dirty_git_check and is_git_repo_dirty(): + raise Exception("The git repo has uncommitted changes - aborting build") + + # Create top-level build directories + os.makedirs(soma_path, exist_ok=False) + os.makedirs(assets_path, exist_ok=False) + + +def build( + args: argparse.Namespace, soma_path: str, assets_path: str, experiment_builders: List[ExperimentBuilder] +) -> int: + """ + Approximately, build steps are: + 1. Download manifest and copy/stage all source assets + 2. Read all H5AD and create axis dataframe (serial) + * write obs/var dataframes + * accumulate overall shape of X + 3. Read all H5AD assets again, write X layer (parallel) + 4. 
Optional: validate + + Returns + ------- + int + Process completion code, 0 on success, non-zero indicating error, + suitable for providing to sys.exit() + """ + + try: + prepare_file_system(soma_path, assets_path, args) + except Exception as e: + logging.error(e) + return 1 + + # Step 1 - get all source datasets + datasets = build_step1_get_source_datasets(args, assets_path) + + # Step 2 - create root collection, and all child objects, but do not populate any dataframes or matrices + root_collection = build_step2_create_root_collection(soma_path, experiment_builders) + gc.collect() + + # Step 3 - populate axes + filtered_datasets = build_step3_populate_obs_and_var_axes(assets_path, datasets, experiment_builders) + + # Step 4 - populate X layers + build_step4_populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) + gc.collect() + + # Step 5- write out dataset manifest and summary information + build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.build_tag) + + # consolidate TileDB data + if args.consolidate: + consolidate(args, root_collection.uri) + + return 0 + + +def populate_root_collection(root_collection: soma.Collection) -> soma.Collection: + """ + Create the root SOMA collection for the Census. + + Returns the root collection. + """ + + # Set root metadata for the experiment + root_collection.metadata["created_on"] = datetime.now(tz=timezone.utc).isoformat(timespec="seconds") + root_collection.metadata["cxg_schema_version"] = CXG_SCHEMA_VERSION + root_collection.metadata["census_schema_version"] = CENSUS_SCHEMA_VERSION + + sha = get_git_commit_sha() + root_collection.metadata["git_commit_sha"] = sha + + # Create sub-collections for experiments, etc. 
+ for n in [CENSUS_INFO_NAME, CENSUS_DATA_NAME]: + root_collection.add_new_collection(n) + + return root_collection + + +def build_step1_get_source_datasets(args: argparse.Namespace, assets_path: str) -> List[Dataset]: + logging.info("Build step 1 - get source assets - started") + + # Load manifest defining the datasets + datasets = load_manifest(args.manifest) + if len(datasets) == 0: + logging.error("No H5AD files in the manifest (or we can't find the files)") + raise AssertionError("No H5AD files in the manifest (or we can't find the files)") + + # Testing/debugging hook - hidden option + if args.test_first_n is not None and args.test_first_n > 0: + # Process the N smallest datasets + datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.test_first_n] + + # Stage all files + stage_source_assets(datasets, args, assets_path) + + logging.info("Build step 1 - get source assets - finished") + return datasets + + +def populate_obs_axis( + assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder] +) -> List[Dataset]: + filtered_datasets = [] + N = len(datasets) * len(experiment_builders) + n = 0 + + for dataset, ad in open_anndata(assets_path, datasets, backed="r"): + dataset_total_cell_count = 0 + + for eb in reopen_experiment_builders(experiment_builders): + n += 1 + logging.info(f"{eb.name}: filtering dataset '{dataset.dataset_id}' ({n} of {N})") + ad_filtered = eb.filter_anndata_cells(ad) + + if len(ad_filtered.obs) == 0: # type:ignore + logging.info(f"{eb.name} - H5AD has no data after filtering, skipping {dataset.dataset_h5ad_path}") + continue + + # append to `obs`; accumulate `var` data + dataset_total_cell_count += eb.accumulate_axes(dataset, ad_filtered) + + # dataset passes filter if either experiment includes cells from the dataset + if dataset_total_cell_count > 0: + filtered_datasets.append(dataset) + dataset.dataset_total_cell_count = dataset_total_cell_count + + for eb in experiment_builders: + 
logging.info(f"Experiment {eb.name} will contain {eb.n_obs} cells from {eb.n_datasets} datasets") + + return filtered_datasets + + +def populate_var_axis_and_presence(experiment_builders: List[ExperimentBuilder]) -> None: + for eb in reopen_experiment_builders(experiment_builders): + # populate `var`; create empty `presence` now that we have its dimensions + eb.populate_var_axis() + + +def build_step2_create_root_collection(soma_path: str, experiment_builders: List[ExperimentBuilder]) -> soma.Collection: + """ + Create all objects + + Returns: the root collection. + """ + logging.info("Build step 2 - Create root collection - started") + + with soma.Collection.create(soma_path, context=SOMA_TileDB_Context()) as root_collection: + populate_root_collection(root_collection) + + for e in experiment_builders: + e.create(census_data=root_collection[CENSUS_DATA_NAME]) + + logging.info("Build step 2 - Create root collection - finished") + return root_collection + + +def build_step3_populate_obs_and_var_axes( + assets_path: str, + datasets: List[Dataset], + experiment_builders: List[ExperimentBuilder], +) -> List[Dataset]: + """ + Populate obs and var axes. Filter cells from datasets for each experiment, as obs is built. + """ + logging.info("Build step 3 - Populate obs and var axes - started") + + filtered_datasets = populate_obs_axis(assets_path, datasets, experiment_builders) + logging.info(f"({len(filtered_datasets)} of {len(datasets)}) datasets suitable for processing.") + + populate_var_axis_and_presence(experiment_builders) + + assign_dataset_soma_joinids(filtered_datasets) + + logging.info("Build step 3 - Populate obs and var axes - finished") + + return filtered_datasets + + +def build_step4_populate_X_layers( + assets_path: str, + filtered_datasets: List[Dataset], + experiment_builders: List[ExperimentBuilder], + args: argparse.Namespace, +) -> None: + """ + Populate X layers. 
+ """ + logging.info("Build step 4 - Populate X layers - started") + + # Process all X data + for eb in reopen_experiment_builders(experiment_builders): + eb.create_X_with_layers() + + populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) + + for eb in reopen_experiment_builders(experiment_builders): + eb.populate_presence_matrix(filtered_datasets) + + logging.info("Build step 4 - Populate X layers - finished") + + +def build_step5_populate_summary_info( + root_collection: soma.Collection, + experiment_builders: List[ExperimentBuilder], + filtered_datasets: List[Dataset], + build_tag: str, +) -> None: + logging.info("Build step 5 - Populate summary info - started") + + with soma.Collection.open(root_collection[CENSUS_INFO_NAME].uri, "w", context=SOMA_TileDB_Context()) as census_info: + create_dataset_manifest(census_info, filtered_datasets) + create_census_summary_cell_counts(census_info, [e.census_summary_cell_counts for e in experiment_builders]) + create_census_summary(census_info, experiment_builders, build_tag) + + logging.info("Build step 5 - Populate summary info - finished") diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py new file mode 100644 index 000000000..bd2c815a3 --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py @@ -0,0 +1,34 @@ +from typing import List + +from .experiment_builder import ExperimentSpecification +from .globals import RNA_SEQ + + +def make_experiment_specs() -> List[ExperimentSpecification]: + """ + Define all soma.Experiments to build in the census. + + Functionally, this defines per-experiment name, anndata filter, etc. + It also loads any required per-Experiment assets. 
+ """ + GENE_LENGTH_BASE_URI = ( + "https://raw.githubusercontent.com/chanzuckerberg/single-cell-curation/" + "100f935eac932e1f5f5dadac0627204da3790f6f/cellxgene_schema_cli/cellxgene_schema/ontology_files/" + ) + GENE_LENGTH_URIS = [ + GENE_LENGTH_BASE_URI + "genes_homo_sapiens.csv.gz", + GENE_LENGTH_BASE_URI + "genes_mus_musculus.csv.gz", + GENE_LENGTH_BASE_URI + "genes_sars_cov_2.csv.gz", + ] + return [ # The soma.Experiments we want to build + ExperimentSpecification.create( + name="homo_sapiens", + anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ), + gene_feature_length_uris=GENE_LENGTH_URIS, + ), + ExperimentSpecification.create( + name="mus_musculus", + anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ), + gene_feature_length_uris=GENE_LENGTH_URIS, + ), + ] diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py b/tools/cell_census_builder/src/cell_census_builder/host_validation.py index e0c407710..69a096056 100644 --- a/tools/cell_census_builder/src/cell_census_builder/host_validation.py +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -5,7 +5,7 @@ import psutil -from cell_census_builder.logging import setup_logging, hr_multibyte_unit +from cell_census_builder.logging import hr_multibyte_unit, setup_logging """Minimum physical RAM""" MIN_RAM = 512 * 1024**3 # 512GiB diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py index 10b752346..90b233300 100644 --- a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -10,9 +10,10 @@ import pyarrow as pa import tiledb import tiledbsoma as soma -from cell_census_builder.build_soma.__main__ import build, build_step1_get_source_datasets, make_experiment_specs +from cell_census_builder.build_soma.build import build, build_step1_get_source_datasets 
from cell_census_builder.build_soma.datasets import Dataset from cell_census_builder.build_soma.experiment_builder import ExperimentBuilder +from cell_census_builder.build_soma.experiment_specs import make_experiment_specs from cell_census_builder.build_soma.globals import ( CENSUS_DATA_NAME, CENSUS_INFO_NAME, @@ -28,8 +29,8 @@ def test_base_builder_creation( """ Runs the builder, queries the census and performs a set of base assertions. """ - with patch("cell_census_builder.build_soma.__main__.prepare_file_system"), patch( - "cell_census_builder.build_soma.__main__.build_step1_get_source_datasets", return_value=datasets + with patch("cell_census_builder.build_soma.build.prepare_file_system"), patch( + "cell_census_builder.build_soma.build.build_step1_get_source_datasets", return_value=datasets ), patch("cell_census_builder.build_soma.consolidate._run"), patch( "cell_census_builder.build_soma.validate.validate_consolidation", return_value=True ): From c30d38b5ee3ccf007d544139d6299572b0393a1a Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Mar 2023 21:55:52 +0000 Subject: [PATCH 03/34] fix GHA unit test --- .github/workflows/py-unittests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index 74a606ce5..8d1acc618 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -53,8 +53,8 @@ jobs: - name: Install dependencies run: | python -m pip install -U pip setuptools wheel - pip install -r ./tools/scripts/requirements.txt -r ./tools/scripts/requirements-dev.txt - pip install -e ./tools/ + pip install ./tools/cell_census_builder/ + pip install -r ./tools/scripts/requirements-dev.txt - name: Test with pytest (builder) run: | PYTHONPATH=. 
coverage run --parallel-mode -m pytest ./tools/cell_census_builder/tests/ From d3cb2725bd23231bd1b38acd854c5ef78cbbeb63 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Mar 2023 21:56:38 +0000 Subject: [PATCH 04/34] fix GHA unit test --- .github/workflows/py-unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index 8d1acc618..026e04c8b 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -53,7 +53,7 @@ jobs: - name: Install dependencies run: | python -m pip install -U pip setuptools wheel - pip install ./tools/cell_census_builder/ + pip install -e ./tools/cell_census_builder/ pip install -r ./tools/scripts/requirements-dev.txt - name: Test with pytest (builder) run: | From 613cc46121d101bc9b822989edff7b3d7736e4a2 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 20 Mar 2023 19:04:15 +0000 Subject: [PATCH 05/34] additional refactoring for top-level workflow --- .pre-commit-config.yaml | 1 + tools/cell_census_builder/pyproject.toml | 7 +- .../src/cell_census_builder/__init__.py | 2 +- .../src/cell_census_builder/__main__.py | 133 ++++++++++++++ .../build_soma/__main__.py | 38 ++-- .../cell_census_builder/build_soma/anndata.py | 4 +- .../cell_census_builder/build_soma/build.py | 50 +++--- .../build_soma/consolidate.py | 6 +- .../build_soma/experiment_builder.py | 10 +- .../build_soma/experiment_specs.py | 9 +- .../src/cell_census_builder/build_soma/mp.py | 16 +- .../build_soma/source_assets.py | 8 +- .../cell_census_builder/build_soma/util.py | 16 -- .../build_soma/validate.py | 29 ++-- .../src/cell_census_builder/build_state.py | 162 ++++++++++++++++++ .../cell_census_builder/host_validation.py | 101 +++++++---- .../src/cell_census_builder/logging.py | 41 ++++- .../src/cell_census_builder/util.py | 46 +++++ .../tests/anndata/conftest.py | 16 +- tools/cell_census_builder/tests/conftest.py | 31 ++-- 
.../cell_census_builder/tests/test_builder.py | 46 ++--- .../tests/test_source_assets.py | 15 +- tools/cell_census_builder/tests/test_util.py | 44 +++-- 23 files changed, 616 insertions(+), 215 deletions(-) create mode 100644 tools/cell_census_builder/src/cell_census_builder/__main__.py create mode 100644 tools/cell_census_builder/src/cell_census_builder/build_state.py create mode 100644 tools/cell_census_builder/src/cell_census_builder/util.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ba322a2d3..a313ea4da 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -54,3 +54,4 @@ repos: - numpy - typing_extensions - types-setuptools + - types-PyYAML diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml index a5bf541e0..6d49850ec 100644 --- a/tools/cell_census_builder/pyproject.toml +++ b/tools/cell_census_builder/pyproject.toml @@ -26,13 +26,12 @@ classifiers = [ "Programming Language :: Python :: 3.10", ] dependencies= [ + "typing_extensions", "pyarrow", "pandas", "anndata>=0.8", "numpy", - # NOTE: The builder's version of tiledbsoma MUST be <= the API's tiledbsoma version, to ensure reader compatibility - # with TileDB on-disk storage format - "tiledbsoma==1.0.0", + "cell_census==0.10.0", "scipy", "fsspec", "s3fs", @@ -44,8 +43,6 @@ dependencies= [ "gitpython", "attrs>=22.2.0", "psutil", - "cell_census==0.10.0", - "typing_extensions", ] # [tool.setuptools.packages.find] diff --git a/tools/cell_census_builder/src/cell_census_builder/__init__.py b/tools/cell_census_builder/src/cell_census_builder/__init__.py index 16e5282c0..4cd7c916d 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__init__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__init__.py @@ -6,7 +6,7 @@ try: - __version__ = metadata.version("cell_census") + __version__ = metadata.version("cell_census_builder") except metadata.PackageNotFoundError: # package is not installed __version__ = 
"0.0.0-unknown" diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py new file mode 100644 index 000000000..01192ff7b --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -0,0 +1,133 @@ +import argparse +import logging +import pathlib +import sys +from typing import Callable, List + +import s3fs + +from . import __version__ +from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig +from .host_validation import check_host +from .util import process_init, urlcat + +""" +File tree for the build. + +working_dir: + | + +-- config.yaml # build config (user provided, read-only) + +-- state.yaml # build runtime state (eg., census version tag, etc) + +-- soma + +-- h5ads + +-- logs # log files from various stages + | +-- build.log + | +-- ... + +-- reports + +-- census-summary-VERSION.txt + +-- census-diff-VERSION.txt + +""" + + +def main() -> int: + cli_parser = create_args_parser() + cli_args = cli_parser.parse_args() + + working_dir = pathlib.PosixPath(cli_args.working_dir) + if not working_dir.is_dir(): + logging.critical("Census builder: unable to find working directory - exiting.") + return 1 + if not (working_dir / CENSUS_BUILD_CONFIG).is_file(): + logging.critical("Census builder: unable to find config.yaml in working directory - exiting.") + return 1 + if (working_dir / CENSUS_BUILD_STATE).exists(): + logging.critical("Found pre-existing census build in working directory - aborting census build.") + return 1 + + build_config = CensusBuildConfig.load(working_dir / CENSUS_BUILD_CONFIG) + build_args = CensusBuildArgs(working_dir=working_dir, config=build_config) + + # Process initialization/setup must be done early + process_init(build_args) + + # Return process exit code (or raise, which exits with a code of `1`) + return do_build(build_args) + + +def do_build(args: CensusBuildArgs) -> int: + """ + Top-level 
build sequence. + + Built steps will be executed in order. Build will stop if a build step returns non-zero + exit code or raises. + """ + logging.info(f"Census build: start [version={__version__}]") + build_steps: List[Callable[[CensusBuildArgs], int]] = [ + do_prebuild_set_defaults, + do_prebuild_checks, + do_build_soma, + do_create_reports, + ] + try: + for n, build_step in enumerate(build_steps, start=1): + logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: start") + cc = build_step(args) + args.state.commit(args.working_dir / CENSUS_BUILD_STATE) + if cc != 0: + logging.critical(f"Build step {build_step.__name__} returned error code {cc}: aborting build.") + return cc + logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: complete") + + except Exception as e: + logging.critical(f"Caught exception, exiting: {str(e)}") + return 1 + + logging.info("Census build: completed") + return 0 + + +def do_prebuild_set_defaults(args: CensusBuildArgs) -> int: + """Set any default state required by build steps.""" + args.state["do_prebuild_set_defaults"] = True + return 0 + + +def do_prebuild_checks(args: CensusBuildArgs) -> int: + """Pre-build checks for host, config, etc. 
All pre-conditions should go here.""" + + # check host configuration, e.g., free disk space + if not check_host(args): + return 1 + + # verify the build tag is not already published/in use + build_tag = args.config.build_tag + assert build_tag is not None + s3path = urlcat(args.config.cell_census_S3_path, build_tag) + if s3fs.S3FileSystem(anon=True).exists(s3path): + logging.error(f"Build tag {build_tag} already exists at {s3path}.") + return 1 + + args.state["do_prebuild_checks"] = True + return 0 + + +def do_build_soma(args: CensusBuildArgs) -> int: + # WIP + # args.state["do_build_soma"] = True + return 0 + + +def do_create_reports(args: CensusBuildArgs) -> int: + # WIP + # args.state["do_create_reports"] = True + return 0 + + +def create_args_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="cell_census_builder") + parser.add_argument("working_dir", type=str, help="Working directory for the build") + return parser + + +sys.exit(main()) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py index f2f5606e6..d1951b1b1 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py @@ -1,37 +1,29 @@ import argparse -import multiprocessing +import pathlib import sys from datetime import datetime +from ..build_state import CensusBuildArgs, CensusBuildConfig +from ..util import process_init from .build import build -from .experiment_builder import ExperimentBuilder -from .experiment_specs import make_experiment_specs -from .mp import process_initializer -from .util import uricat from .validate import validate def main() -> int: - parser = create_args_parser() - args = parser.parse_args() - assert args.subcommand in ["build", "validate"] + cli_parser = create_args_parser() + cli_args = cli_parser.parse_args() + assert cli_args.subcommand 
in ["build", "validate"] - process_initializer(args.verbose) - - # normalize our base URI - must include trailing slash - soma_path = uricat(args.uri, args.build_tag, "soma") - assets_path = uricat(args.uri, args.build_tag, "h5ads") - - # create the experiment specifications and builders - experiment_specifications = make_experiment_specs() - experiment_builders = [ExperimentBuilder(spec) for spec in experiment_specifications] + config = CensusBuildConfig(**cli_args.__dict__) + args = CensusBuildArgs(working_dir=pathlib.PosixPath(cli_args.uri), config=config) + process_init(args) cc = 0 - if args.subcommand == "build": - cc = build(args, soma_path, assets_path, experiment_builders) + if cli_args.subcommand == "build": + cc = build(args) - if cc == 0 and (args.subcommand == "validate" or args.validate): - validate(args, soma_path, assets_path, experiment_specifications) + if cc == 0 and (cli_args.subcommand == "validate" or cli_args.validate): + validate(args) return cc @@ -85,8 +77,4 @@ def create_args_parser() -> argparse.ArgumentParser: if __name__ == "__main__": - # this is very important to do early, before any use of `concurrent.futures` - if multiprocessing.get_start_method(True) != "spawn": - multiprocessing.set_start_method("spawn", True) - sys.exit(main()) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py index b373db20c..c9289d4e6 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/anndata.py @@ -5,9 +5,9 @@ import numpy as np import pandas as pd +from ..util import urlcat from .datasets import Dataset from .globals import CXG_SCHEMA_VERSION, CXG_SCHEMA_VERSION_IMPORT, FEATURE_REFERENCE_IGNORE -from .util import uricat AnnDataFilterSpec = TypedDict( "AnnDataFilterSpec", @@ -34,7 +34,7 @@ def open_anndata( datasets = [datasets] for h5ad in datasets: - 
path = uricat(base_path, h5ad.dataset_h5ad_path) + path = urlcat(base_path, h5ad.dataset_h5ad_path) logging.debug(f"open_anndata: {path}") ad = anndata.read_h5ad(path, *args, **kwargs) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py index e3c126751..ab3693b75 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py @@ -1,12 +1,11 @@ -import argparse import gc import logging -import os.path from datetime import datetime, timezone from typing import List import tiledbsoma as soma +from ..build_state import CensusBuildArgs from .anndata import open_anndata from .census_summary import create_census_summary from .consolidate import consolidate @@ -16,6 +15,7 @@ populate_X_layers, reopen_experiment_builders, ) +from .experiment_specs import make_experiment_builders from .globals import ( CENSUS_DATA_NAME, CENSUS_INFO_NAME, @@ -29,26 +29,24 @@ from .util import get_git_commit_sha, is_git_repo_dirty -def prepare_file_system(soma_path: str, assets_path: str, args: argparse.Namespace) -> None: +def prepare_file_system(args: CensusBuildArgs) -> None: """ Prepares the file system for the builder run """ # Don't clobber an existing census build - if os.path.exists(soma_path) or os.path.exists(assets_path): + if args.soma_path.exists() or args.h5ads_path.exists(): raise Exception("Census build path already exists - aborting build") # Ensure that the git tree is clean - if not args.test_disable_dirty_git_check and is_git_repo_dirty(): + if not args.config.test_disable_dirty_git_check and is_git_repo_dirty(): raise Exception("The git repo has uncommitted changes - aborting build") # Create top-level build directories - os.makedirs(soma_path, exist_ok=False) - os.makedirs(assets_path, exist_ok=False) + args.soma_path.mkdir(parents=True, exist_ok=False) + 
args.h5ads_path.mkdir(parents=True, exist_ok=False) -def build( - args: argparse.Namespace, soma_path: str, assets_path: str, experiment_builders: List[ExperimentBuilder] -) -> int: +def build(args: CensusBuildArgs) -> int: """ Approximately, build steps are: 1. Download manifest and copy/stage all source assets @@ -65,31 +63,29 @@ def build( suitable for providing to sys.exit() """ - try: - prepare_file_system(soma_path, assets_path, args) - except Exception as e: - logging.error(e) - return 1 + experiment_builders = make_experiment_builders() + + prepare_file_system(args) # Step 1 - get all source datasets - datasets = build_step1_get_source_datasets(args, assets_path) + datasets = build_step1_get_source_datasets(args) # Step 2 - create root collection, and all child objects, but do not populate any dataframes or matrices - root_collection = build_step2_create_root_collection(soma_path, experiment_builders) + root_collection = build_step2_create_root_collection(args.soma_path.as_posix(), experiment_builders) gc.collect() # Step 3 - populate axes - filtered_datasets = build_step3_populate_obs_and_var_axes(assets_path, datasets, experiment_builders) + filtered_datasets = build_step3_populate_obs_and_var_axes(args.h5ads_path.as_posix(), datasets, experiment_builders) # Step 4 - populate X layers - build_step4_populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) + build_step4_populate_X_layers(args.h5ads_path.as_posix(), filtered_datasets, experiment_builders, args) gc.collect() # Step 5- write out dataset manifest and summary information - build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.build_tag) + build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.config.build_tag) # consolidate TileDB data - if args.consolidate: + if args.config.consolidate: consolidate(args, root_collection.uri) return 0 @@ -117,22 +113,22 @@ def 
populate_root_collection(root_collection: soma.Collection) -> soma.Collectio return root_collection -def build_step1_get_source_datasets(args: argparse.Namespace, assets_path: str) -> List[Dataset]: +def build_step1_get_source_datasets(args: CensusBuildArgs) -> List[Dataset]: logging.info("Build step 1 - get source assets - started") # Load manifest defining the datasets - datasets = load_manifest(args.manifest) + datasets = load_manifest(args.config.manifest) if len(datasets) == 0: logging.error("No H5AD files in the manifest (or we can't find the files)") raise AssertionError("No H5AD files in the manifest (or we can't find the files)") # Testing/debugging hook - hidden option - if args.test_first_n is not None and args.test_first_n > 0: + if args.config.test_first_n is not None and args.config.test_first_n > 0: # Process the N smallest datasets - datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.test_first_n] + datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.config.test_first_n] # Stage all files - stage_source_assets(datasets, args, assets_path) + stage_source_assets(datasets, args) logging.info("Build step 1 - get source assets - finished") return datasets @@ -221,7 +217,7 @@ def build_step4_populate_X_layers( assets_path: str, filtered_datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], - args: argparse.Namespace, + args: CensusBuildArgs, ) -> None: """ Populate X layers. 
diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py index f8048a62a..8d6d2272a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/consolidate.py @@ -1,15 +1,15 @@ -import argparse import concurrent.futures import logging from typing import List import tiledbsoma as soma +from ..build_state import CensusBuildArgs from .globals import DEFAULT_TILEDB_CONFIG, SOMA_TileDB_Context from .mp import create_process_pool_executor, log_on_broken_process_pool -def consolidate(args: argparse.Namespace, uri: str) -> None: +def consolidate(args: CensusBuildArgs, uri: str) -> None: """ This is a non-portable, TileDB-specific consolidation routine. """ @@ -30,7 +30,7 @@ def _gather(uri: str) -> List[str]: return uris_to_consolidate -def _run(args: argparse.Namespace, uris_to_consolidate: List[str]) -> None: +def _run(args: CensusBuildArgs, uris_to_consolidate: List[str]) -> None: # Queue consolidator for each array with create_process_pool_executor(args) as ppe: futures = [ppe.submit(consolidate_tiledb_object, uri) for uri in uris_to_consolidate] diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py index 8307b89b5..34a8b017a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_builder.py @@ -1,4 +1,3 @@ -import argparse import concurrent.futures import gc import io @@ -18,6 +17,8 @@ from somacore.options import OpenMode from typing_extensions import Self +from ..build_state import CensusBuildArgs +from ..util import urlcat from .anndata import AnnDataFilterSpec, make_anndata_cell_filter, open_anndata from .datasets import 
Dataset from .globals import ( @@ -41,7 +42,6 @@ anndata_ordered_bool_issue_853_workaround, array_chunker, is_nonnegative_integral, - uricat, ) # Contents: @@ -149,7 +149,7 @@ def gene_feature_length(self) -> pd.DataFrame: def create(self, census_data: soma.Collection) -> None: """Create experiment within the specified Collection with a single Measurement.""" - logging.info(f"{self.name}: create experiment at {uricat(census_data.uri, self.name)}") + logging.info(f"{self.name}: create experiment at {urlcat(census_data.uri, self.name)}") self.experiment = census_data.add_new_collection(self.name, soma.Experiment) self.experiment_uri = self.experiment.uri @@ -463,14 +463,14 @@ def _accumulate_X( def populate_X_layers( - assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], args: argparse.Namespace + assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], args: CensusBuildArgs ) -> None: """ Do all X layer processing for all Experiments. Also accumulate presence matrix data for later writing. 
""" # populate X layers presence: List[PresenceResult] = [] - if args.multi_process: + if args.config.multi_process: with create_process_pool_executor(args) as pe: futures = { _accumulate_X( diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py index bd2c815a3..3e2a9ec3f 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/experiment_specs.py @@ -1,9 +1,11 @@ +import functools from typing import List -from .experiment_builder import ExperimentSpecification +from .experiment_builder import ExperimentBuilder, ExperimentSpecification from .globals import RNA_SEQ +@functools.cache def make_experiment_specs() -> List[ExperimentSpecification]: """ Define all soma.Experiments to build in the census. @@ -32,3 +34,8 @@ def make_experiment_specs() -> List[ExperimentSpecification]: gene_feature_length_uris=GENE_LENGTH_URIS, ), ] + + +@functools.cache +def make_experiment_builders() -> List[ExperimentBuilder]: + return [ExperimentBuilder(spec) for spec in make_experiment_specs()] diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py index bd5d9b580..056efce44 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/mp.py @@ -1,11 +1,11 @@ -import argparse import concurrent.futures import logging import multiprocessing import os from typing import Optional, cast -from ..logging import setup_logging +from ..build_state import CensusBuildArgs +from ..util import process_init def cpu_count() -> int: @@ -16,12 +16,8 @@ def cpu_count() -> int: return cast(int, cpu_count) -def process_initializer(verbose: int = 0) -> None: - setup_logging(verbose) - - def 
create_process_pool_executor( - args: argparse.Namespace, max_workers: Optional[int] = None + args: CensusBuildArgs, max_workers: Optional[int] = None ) -> concurrent.futures.ProcessPoolExecutor: # We rely on the pool configuration being correct. Failure to do this will # lead to strange errors on some OS (eg., Linux defaults to fork). Rather @@ -29,9 +25,9 @@ def create_process_pool_executor( assert multiprocessing.get_start_method(True) == "spawn" return concurrent.futures.ProcessPoolExecutor( - max_workers=args.max_workers if max_workers is None else max_workers, - initializer=process_initializer, - initargs=(args.verbose,), + max_workers=args.config.max_workers if max_workers is None else max_workers, + initializer=process_init, + initargs=(args,), ) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py index 1e996acb9..dd2f0041a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py @@ -1,4 +1,3 @@ -import argparse import logging import os import urllib.parse @@ -7,11 +6,14 @@ import aiohttp import fsspec +from ..build_state import CensusBuildArgs from .datasets import Dataset from .mp import cpu_count, create_process_pool_executor -def stage_source_assets(datasets: List[Dataset], args: argparse.Namespace, assets_dir: str) -> None: +def stage_source_assets(datasets: List[Dataset], args: CensusBuildArgs) -> None: + assets_dir = args.h5ads_path.as_posix() + logging.info(f"Starting asset staging to {assets_dir}") assert os.path.isdir(assets_dir) @@ -19,7 +21,7 @@ def stage_source_assets(datasets: List[Dataset], args: argparse.Namespace, asset datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize, reverse=True) N = len(datasets) - if getattr(args, "multi_process", False): + if args.config.multi_process: n_workers =
max(min(8, cpu_count()), 64) with create_process_pool_executor(args, n_workers) as pe: paths = list( diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py index 3e5496210..e2adf1c01 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py @@ -1,6 +1,5 @@ import os import time -import urllib.parse from typing import Any, Iterator, Optional, Union import numpy as np @@ -64,21 +63,6 @@ def array_chunker( raise NotImplementedError("array_chunker: unsupported array type") -def uricat(container_uri: str, *paths: str) -> str: - """ - Concat one or more paths, separated with '/' - - Similar to urllib.parse.urljoin except it takes an iterator, and - assumes the container_uri is a 'directory'/container, ie, ends in '/'. - """ - - uri = container_uri - for p in paths: - uri = uri if uri.endswith("/") else uri + "/" - uri = urllib.parse.urljoin(uri, p) - return uri - - def fetch_json(url: str, delay_secs: float = 0.0) -> object: response = requests.get(url) response.raise_for_status() diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py index 0d27f35d4..a322d8993 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py @@ -1,4 +1,3 @@ -import argparse import concurrent.futures import dataclasses import logging @@ -16,10 +15,13 @@ from scipy import sparse from typing_extensions import Self +from ..build_state import CensusBuildArgs +from ..util import urlcat from .anndata import make_anndata_cell_filter, open_anndata from .consolidate import list_uris_to_consolidate from .datasets import Dataset from .experiment_builder import ExperimentSpecification +from 
.experiment_specs import make_experiment_specs from .globals import ( CENSUS_DATA_NAME, CENSUS_DATASETS_COLUMNS, @@ -39,7 +41,6 @@ SOMA_TileDB_Context, ) from .mp import create_process_pool_executor, log_on_broken_process_pool -from .util import uricat @dataclass # TODO: use attrs @@ -63,7 +64,7 @@ def n_vars(self) -> int: def open_experiment(base_uri: str, eb: ExperimentSpecification) -> soma.Experiment: """Helper function that knows the Census schema path conventions.""" - return soma.Experiment.open(uricat(base_uri, CENSUS_DATA_NAME, eb.name), mode="r") + return soma.Experiment.open(urlcat(base_uri, CENSUS_DATA_NAME, eb.name), mode="r") def validate_all_soma_objects_exist(soma_path: str, experiment_specifications: List[ExperimentSpecification]) -> bool: @@ -179,7 +180,7 @@ def validate_axis_dataframes( soma_path: str, datasets: List[Dataset], experiment_specifications: List[ExperimentSpecification], - args: argparse.Namespace, + args: CensusBuildArgs, ) -> Dict[str, EbInfo]: """ " Validate axis dataframes: schema, shape, contents @@ -205,7 +206,7 @@ def validate_axis_dataframes( # check shapes & perform weak test of contents eb_info = {eb.name: EbInfo() for eb in experiment_specifications} - if args.multi_process: + if args.config.multi_process: with create_process_pool_executor(args) as ppe: futures = [ ppe.submit(_validate_axis_dataframes, (assets_path, soma_path, dataset, experiment_specifications)) @@ -397,7 +398,7 @@ def validate_X_layers( datasets: List[Dataset], experiment_specifications: List[ExperimentSpecification], eb_info: Dict[str, EbInfo], - args: argparse.Namespace, + args: CensusBuildArgs, ) -> bool: """ " Validate all X layers: schema, shape, contents @@ -429,7 +430,7 @@ def validate_X_layers( assert X.schema.field("soma_data").type == CENSUS_X_LAYERS[lyr] assert X.shape == (n_obs, n_vars) - if args.multi_process: + if args.config.multi_process: with create_process_pool_executor(args) as ppe: ROWS_PER_PROCESS = 1_000_000 dup_coord_futures = [ @@ 
-479,7 +480,7 @@ def load_datasets_from_census(assets_path: str, soma_path: str) -> List[Dataset] # census against the snapshot assets. with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census: df = census[CENSUS_INFO_NAME][CENSUS_DATASETS_NAME].read().concat().to_pandas() - df["corpora_asset_h5ad_uri"] = df.dataset_h5ad_path.map(lambda p: uricat(assets_path, p)) + df["corpora_asset_h5ad_uri"] = df.dataset_h5ad_path.map(lambda p: urlcat(assets_path, p)) datasets = Dataset.from_dataframe(df) return datasets @@ -487,7 +488,7 @@ def load_datasets_from_census(assets_path: str, soma_path: str) -> List[Dataset] def validate_manifest_contents(assets_path: str, datasets: List[Dataset]) -> bool: """Confirm contents of manifest are correct.""" for d in datasets: - p = pathlib.Path(uricat(assets_path, d.dataset_h5ad_path)) + p = pathlib.Path(urlcat(assets_path, d.dataset_h5ad_path)) assert p.exists() and p.is_file(), f"{d.dataset_h5ad_path} is missing from the census" assert str(p).endswith(".h5ad"), "Expected only H5AD assets" @@ -543,15 +544,19 @@ def _walk_tree(name: str, parent: Any) -> None: return True -def validate( - args: argparse.Namespace, soma_path: str, assets_path: str, experiment_specifications: List[ExperimentSpecification] -) -> bool: +def validate(args: CensusBuildArgs) -> bool: """ Validate that the "census" matches the datasets and experiment builder spec. Will raise if validation fails. Returns True on success. 
""" logging.info("Validation start") + + experiment_specifications = make_experiment_specs() + + soma_path = args.soma_path.as_posix() + assets_path = args.h5ads_path.as_posix() + assert validate_directory_structure(soma_path, assets_path) assert validate_all_soma_objects_exist(soma_path, experiment_specifications) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py new file mode 100644 index 000000000..dce19ddde --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -0,0 +1,162 @@ +""" +build state and config +""" +import functools +import io +import os +import pathlib +from datetime import datetime +from typing import Any, Iterator, Mapping, Union + +import attrs +import yaml +from typing_extensions import Self + +CENSUS_BUILD_CONFIG = "config.yaml" +CENSUS_BUILD_STATE = "state.yaml" +CONFIG_DEFAULTS = { + "build_tag": datetime.now().astimezone().date().isoformat(), + "verbose": 1, + "log_dir": "logs", + "log_file": "build.log", + "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", + # XXX TODO add host requirements, etc. 
+ "consolidate": True, + "validate": True, + "multi_process": False, + "max_workers": None, + "manifest": None, + "test_first_n": None, + "test_disable_dirty_git_check": False, +} + + +class Namespace(Mapping[str, Any]): + """Readonly namespace""" + + def __init__(self, **kwargs: Any): + self._state = dict(kwargs) + + def __eq__(self, other: object) -> bool: + if isinstance(other, Namespace): + return self._state == other._state + return NotImplemented + + def __contains__(self, key: Any) -> bool: + return key in self._state + + def __repr__(self) -> str: + items = (f"{k}={v!r}" for k, v in self.items()) + return "{}({})".format(type(self).__name__, ", ".join(items)) + + def __getitem__(self, key: str) -> Any: + return self._state[key] + + def __getattr__(self, key: str) -> Any: + return self._state[key] + + def __iter__(self) -> Iterator[str]: + return iter(self._state) + + def __len__(self) -> int: + return len(self._state) + + def __getstate__(self) -> dict[str, Any]: + return self.__dict__.copy() + + def __setstate__(self, state: dict[str, Any]) -> None: + self.__dict__.update(state) + + +class MutableNamespace(Namespace): + """Mutable namespace""" + + def __setitem__(self, key: str, value: Any) -> None: + if not isinstance(key, str): + raise TypeError + self._state[key] = value + + # Do not implement __delitem__. Log format has no deletion marker, so delete + semantics can't be supported until that is implemented.
+ + +class CensusBuildConfig(Namespace): + defaults = CONFIG_DEFAULTS + + def __init__(self, **kwargs: Any): + config = self.defaults.copy() + config.update(kwargs) + super().__init__(**config) + + @classmethod + def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: + if isinstance(file, (str, os.PathLike)): + with open(file) as f: + user_config = yaml.safe_load(f) + else: + user_config = yaml.safe_load(file) + + # Empty YAML config file is legal + if user_config is None: + user_config = {} + + # But we only understand a top-level dictionary (e.g., no lists, etc.) + if not isinstance(user_config, dict): + raise TypeError("YAML config file malformed - expected top-level dictionary") + + return cls(**user_config) + + +class CensusBuildState(MutableNamespace): + def __init__(self, **kwargs: Any): + self.__dirty_keys = set(kwargs) + super().__init__(**kwargs) + + def __setitem__(self, key: str, value: Any) -> None: + if self._state.get(key) == value: + return + super().__setitem__(key, value) + self.__dirty_keys.add(key) + + @classmethod + def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: + if isinstance(file, (str, os.PathLike)): + with open(file) as state_log: + documents = yaml.safe_load_all(state_log) + else: + documents = yaml.safe_load_all(file) + + return cls(**functools.reduce(lambda acc, r: acc.update(r) or acc, documents, {})) + + def commit(self, file: Union[str, os.PathLike[str]]) -> None: + # append dirty elements (atomic on Posix) + if self.__dirty_keys: + dirty = {k: self[k] for k in self.__dirty_keys} + self.__dirty_keys.clear() + with open(file, mode="a") as state_log: + record = f"--- # {datetime.now().isoformat()}\n" + yaml.dump(dirty) + state_log.write(record) + + +@attrs.define(frozen=True) +class CensusBuildArgs: + working_dir: pathlib.PosixPath = attrs.field(validator=attrs.validators.instance_of(pathlib.PosixPath)) + config: CensusBuildConfig = 
attrs.field(validator=attrs.validators.instance_of(CensusBuildConfig)) + state: CensusBuildState = attrs.field( + factory=CensusBuildState, validator=attrs.validators.instance_of(CensusBuildState) # default: empty state + ) + + @property + def soma_path(self) -> pathlib.PosixPath: + return self.working_dir / self.build_tag / "soma" + + @property + def h5ads_path(self) -> pathlib.PosixPath: + return self.working_dir / self.build_tag / "h5ads" + + @property + def build_tag(self) -> str: + build_tag = self.config.build_tag + if not isinstance(build_tag, str): + raise TypeError("Configuration contains non-string build_tag.") + return build_tag diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py b/tools/cell_census_builder/src/cell_census_builder/host_validation.py index 69a096056..4de94e9c2 100644 --- a/tools/cell_census_builder/src/cell_census_builder/host_validation.py +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -1,67 +1,96 @@ import logging import os import sys -from typing import Optional +from typing import Union import psutil -from cell_census_builder.logging import hr_multibyte_unit, setup_logging +from .build_state import CensusBuildArgs +from .logging import hr_binary_unit, hr_decimal_unit -"""Minimum physical RAM""" -MIN_RAM = 512 * 1024**3 # 512GiB +"""Defaults""" +MIN_PHYSICAL_MEMORY = 512 * 1024**3 # 512GiB +MIN_SWAP_MEMORY = 2 * 1024**4 # 2TiB +MIN_FREE_DISK_SPACE = 1 * 1024**4 # 1 TiB -"""Minimum virtual memory/swap""" -MIN_SWAP = 2 * 1024**4 # 2TiB -"""Minimum free disk space""" -MIN_FREE_DISK_SPACE = 1 * 1024**4 # 1 TiB +def _check(condition: bool, message: str) -> bool: + """Like assert, but logs""" + if not condition: + logging.critical(message) + return condition -def check_os() -> None: +def check_os() -> bool: """ Check that we run on Posix (Linux, MacOS), as we rely on Posix semantics for a few things. 
""" - assert psutil.POSIX + return _check(os.name == "posix" and psutil.POSIX, "Census builder requires Posix OS") -def check_memory() -> None: +def check_physical_memory(min_physical_memory: int) -> bool: """ Check for sufficient physical and virtual memory. """ svmem = psutil.virtual_memory() - logging.debug(f"Host: {hr_multibyte_unit(svmem.total)} memory found") - assert svmem.total >= MIN_RAM, f"Insufficient memory (found {svmem.total}, require {MIN_RAM})" + logging.debug(f"Host: {hr_binary_unit(svmem.total)} memory found") + return _check( + svmem.total >= min_physical_memory, + f"Insufficient memory (found {hr_binary_unit(svmem.total)}, " f"require {hr_binary_unit(min_physical_memory)})", + ) - svswap = psutil.swap_memory() - logging.debug(f"Host: {hr_multibyte_unit(svswap.total)} swap found") - assert svswap.total >= MIN_SWAP, f"Insufficient swap space (found {svswap.total}, require {MIN_SWAP})" - -def check_free_disk(working_dir: Optional[str] = ".") -> None: +def check_swap_memory(min_swap_memory: int) -> bool: """ - Check for sufficient free disk space. + Check for sufficient physical and virtual memory. """ - skdiskusage = psutil.disk_usage(working_dir) - logging.debug(f"Host: {hr_multibyte_unit(skdiskusage.free)} free disk space found") - assert ( - skdiskusage.free >= MIN_FREE_DISK_SPACE - ), f"Insufficient free disk space (found {skdiskusage.free}, require {MIN_FREE_DISK_SPACE})" + svswap = psutil.swap_memory() + logging.debug(f"Host: {hr_binary_unit(svswap.total)} swap found") + return _check( + svswap.total >= min_swap_memory, + f"Insufficient swap space (found {hr_binary_unit(svswap.total)}, " + f"require {hr_binary_unit(min_swap_memory)})", + ) -def run_all_checks() -> int: +def check_free_disk(working_dir: Union[str, os.PathLike[str]], min_free_disk_space: int) -> bool: """ - Run all host validation checks. Returns zero or raises an exception. + Check for sufficient free disk space. 
""" - check_os() - check_memory() - check_free_disk(os.getcwd()) # assumed working directory is CWD - logging.info("Host validation success") - return 0 - - -# Process MUST return zero on success (all good) or non-zero on a + working_dir_fspath = working_dir.__fspath__() if isinstance(working_dir, os.PathLike) else working_dir + skdiskusage = psutil.disk_usage(working_dir_fspath) + logging.debug(f"Host: {hr_decimal_unit(skdiskusage.free)} free disk space found") + return _check( + skdiskusage.free >= min_free_disk_space, + f"Insufficient free disk space (found {hr_decimal_unit(skdiskusage.free)}, " + f"require {hr_decimal_unit(min_free_disk_space)})", + ) + + +def check_host(args: CensusBuildArgs) -> bool: + """Verify all host requirments. Return True if OK, False if conditions not met""" + return ( + check_os() + and check_physical_memory(args.config.get("min_physical_memory", MIN_PHYSICAL_MEMORY)) + and check_swap_memory(args.config.get("min_swap_memory", MIN_SWAP_MEMORY)) + and check_free_disk(args.working_dir, args.config.get("min_free_disk_space", MIN_FREE_DISK_SPACE)) + ) + + +# Return zero on success (all good) or non-zero on a # host which does not validate. 
if __name__ == "__main__": - setup_logging(verbose=1) - sys.exit(run_all_checks()) + """For CLI testing""" + + def main() -> int: + assert ( + check_os() + and check_physical_memory(MIN_PHYSICAL_MEMORY) + and check_swap_memory(MIN_SWAP_MEMORY) + and check_free_disk(os.getcwd(), MIN_FREE_DISK_SPACE) + ) # assumed working directory is CWD + print("Host validation success") + return 0 + + sys.exit(main()) diff --git a/tools/cell_census_builder/src/cell_census_builder/logging.py b/tools/cell_census_builder/src/cell_census_builder/logging.py index 987046b6d..7d587cdc9 100644 --- a/tools/cell_census_builder/src/cell_census_builder/logging.py +++ b/tools/cell_census_builder/src/cell_census_builder/logging.py @@ -1,26 +1,49 @@ import logging import math +import pathlib +import sys +from typing import List, Tuple +from .build_state import CensusBuildArgs -def setup_logging(verbose: int = 0) -> None: + +def logging_init(args: CensusBuildArgs) -> None: """ - Configure the logger + Configure the logger. 
""" - level = logging.DEBUG if verbose > 1 else logging.INFO if verbose == 1 else logging.WARNING + level = logging.DEBUG if args.config.verbose > 1 else logging.INFO if args.config.verbose == 1 else logging.WARNING + handlers: List[logging.Handler] = [logging.StreamHandler(sys.stderr)] + + # Create logging directory if configured appropriately + if args.config.log_dir and args.config.log_file: + logs_dir = pathlib.PosixPath(args.working_dir) / pathlib.PosixPath(args.config.log_dir) + logs_dir.mkdir(parents=True, exist_ok=True) + logs_file = logs_dir / args.config.log_file + handlers.insert(0, logging.FileHandler(logs_file)) + logging.basicConfig( format="%(asctime)s %(process)-7s %(levelname)-8s %(message)s", level=level, datefmt="%Y-%m-%d %H:%M:%S", + handlers=handlers, ) logging.captureWarnings(True) -def hr_multibyte_unit(n_bytes: int) -> str: - """Convert number of bytes into a human-readable binary (power of 1024) multi-byte unit string.""" +def _hr_multibyte_unit(n_bytes: int, unit_base: int, unit_size_names: Tuple[str, ...]) -> str: + """Private. 
Convert number of bytes into a human-readable multi-byte unit string.""" if n_bytes == 0: return "0B" - unit_size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB") - unit = int(math.floor(math.log(n_bytes, 1024))) - n_units = round(n_bytes / math.pow(1024, unit)) - return f"{n_units}{unit_size_name[unit]}" + unit = int(math.floor(math.log(n_bytes, unit_base))) + n_units = round(n_bytes / math.pow(unit_base, unit)) + return f"{n_units}{unit_size_names[unit]}" + + +def hr_binary_unit(n_bytes: int) -> str: + return _hr_multibyte_unit(n_bytes, 1024, ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")) + + +def hr_decimal_unit(n_bytes: int) -> str: + """Convert number of bytes into a human-readable decimal (power of 1000) multi-byte unit string.""" + return _hr_multibyte_unit(n_bytes, 1000, ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")) diff --git a/tools/cell_census_builder/src/cell_census_builder/util.py b/tools/cell_census_builder/src/cell_census_builder/util.py new file mode 100644 index 000000000..683662deb --- /dev/null +++ b/tools/cell_census_builder/src/cell_census_builder/util.py @@ -0,0 +1,46 @@ +import multiprocessing +import urllib.parse + +from .build_state import CensusBuildArgs +from .logging import logging_init + + +def urljoin(base: str, url: str) -> str: + """ + like urllib.parse.urljoin, but doesn't get confused by S3:// + """ + p_url = urllib.parse.urlparse(url) + if p_url.netloc: + return url + + p_base = urllib.parse.urlparse(base) + path = urllib.parse.urljoin(p_base.path, p_url.path) + parts = [p_base.scheme, p_base.netloc, path, p_url.params, p_url.query, p_url.fragment] + return urllib.parse.urlunparse(parts) + + +def urlcat(base: str, *paths: str) -> str: + """ + Concat one or more paths, separated with '/'. 
Similar to urllib.parse.urljoin, + but doesn't get confused by S3:// and other "non-standard" protocols (treats + them as if they are same as http: or file:) + + Similar to urllib.parse.urljoin except it takes an iterator, and + assumes the container_uri is a 'directory'/container, ie, ends in '/'. + """ + + url = base + for p in paths: + url = url if url.endswith("/") else url + "/" + url = urljoin(url, p) + return url + + +def process_init(args: CensusBuildArgs) -> None: + """ + Called on every process start to configure global package/module behavior. + """ + if multiprocessing.get_start_method(True) != "spawn": + multiprocessing.set_start_method("spawn", True) + + logging_init(args) diff --git a/tools/cell_census_builder/tests/anndata/conftest.py b/tools/cell_census_builder/tests/anndata/conftest.py index a2d0a78bf..52064fe89 100644 --- a/tools/cell_census_builder/tests/anndata/conftest.py +++ b/tools/cell_census_builder/tests/anndata/conftest.py @@ -3,12 +3,16 @@ import anndata as ad import pytest from cell_census_builder.build_soma.datasets import Dataset +from cell_census_builder.build_state import CensusBuildArgs from ..conftest import ORGANISMS, get_h5ad @pytest.fixture -def datasets_with_mixed_feature_reference(assets_path: str) -> List[Dataset]: +def datasets_with_mixed_feature_reference(census_build_args: CensusBuildArgs) -> List[Dataset]: + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) + assets_path = census_build_args.h5ads_path.as_posix() + organism = ORGANISMS[0] dataset_id = "an_id" datasets = [] @@ -31,7 +35,10 @@ def datasets_with_mixed_feature_reference(assets_path: str) -> List[Dataset]: @pytest.fixture -def datasets_with_larger_raw_layer(assets_path: str) -> List[Dataset]: +def datasets_with_larger_raw_layer(census_build_args: CensusBuildArgs) -> List[Dataset]: + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) + assets_path = census_build_args.h5ads_path.as_posix() + organism = ORGANISMS[0] dataset_id = 
"an_id" datasets = [] @@ -56,7 +63,10 @@ def datasets_with_larger_raw_layer(assets_path: str) -> List[Dataset]: @pytest.fixture -def datasets_with_incorrect_schema_version(assets_path: str) -> List[Dataset]: +def datasets_with_incorrect_schema_version(census_build_args: CensusBuildArgs) -> List[Dataset]: + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) + assets_path = census_build_args.h5ads_path.as_posix() + organism = ORGANISMS[0] dataset_id = "an_id" datasets = [] diff --git a/tools/cell_census_builder/tests/conftest.py b/tools/cell_census_builder/tests/conftest.py index 3b8363289..663949807 100644 --- a/tools/cell_census_builder/tests/conftest.py +++ b/tools/cell_census_builder/tests/conftest.py @@ -1,5 +1,4 @@ import io -import os import pathlib from typing import List, Optional @@ -13,7 +12,8 @@ from cell_census_builder.build_soma.globals import ( CENSUS_X_LAYERS_PLATFORM_CONFIG, ) -from cell_census_builder.build_soma.mp import process_initializer +from cell_census_builder.build_state import CensusBuildArgs, CensusBuildConfig +from cell_census_builder.util import process_init from scipy import sparse @@ -94,21 +94,22 @@ def get_h5ad(organism: Organism, gene_ids: Optional[List[str]] = None) -> anndat @pytest.fixture -def assets_path(tmp_path: pathlib.Path) -> str: - assets_path = f"{tmp_path}/h5ads" - os.mkdir(assets_path) - return assets_path +def census_build_args(request: pytest.FixtureRequest, tmp_path: pathlib.Path) -> CensusBuildArgs: + # parameterization is optional + try: + config = request.param + except AttributeError: + config = {} - -@pytest.fixture -def soma_path(tmp_path: pathlib.Path) -> str: - soma_path = f"{tmp_path}/soma" - os.mkdir(soma_path) - return soma_path + if config.get("manifest") is True: # if bool True, replace with an IOstream + config["manifest"] = request.getfixturevalue("manifest_csv") + return CensusBuildArgs(working_dir=tmp_path, config=CensusBuildConfig(**config)) @pytest.fixture -def datasets(assets_path: 
str) -> List[Dataset]: +def datasets(census_build_args: CensusBuildArgs) -> List[Dataset]: + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) + assets_path = census_build_args.h5ads_path.as_posix() datasets = [] for organism in ORGANISMS: for i in range(NUM_DATASET): @@ -165,7 +166,7 @@ def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> io.TextIOWrapper: @pytest.fixture() -def setup(monkeypatch: MonkeyPatch) -> None: - process_initializer() +def setup(monkeypatch: MonkeyPatch, census_build_args: CensusBuildArgs) -> None: + process_init(census_build_args) monkeypatch.setitem(CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_0"], "tile", 2) monkeypatch.setitem(CENSUS_X_LAYERS_PLATFORM_CONFIG["raw"]["tiledb"]["create"]["dims"]["soma_dim_1"], "tile", 2) diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py index 90b233300..3a626b3da 100644 --- a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -1,19 +1,17 @@ -import io import os import pathlib -from types import ModuleType, SimpleNamespace +from types import ModuleType from typing import List from unittest.mock import patch import numpy as np import pandas as pd import pyarrow as pa +import pytest import tiledb import tiledbsoma as soma from cell_census_builder.build_soma.build import build, build_step1_get_source_datasets from cell_census_builder.build_soma.datasets import Dataset -from cell_census_builder.build_soma.experiment_builder import ExperimentBuilder -from cell_census_builder.build_soma.experiment_specs import make_experiment_specs from cell_census_builder.build_soma.globals import ( CENSUS_DATA_NAME, CENSUS_INFO_NAME, @@ -21,10 +19,16 @@ MEASUREMENT_RNA_NAME, ) from cell_census_builder.build_soma.validate import validate +from cell_census_builder.build_state import CensusBuildArgs +@pytest.mark.parametrize( + "census_build_args", 
[dict(multi_process=False, consolidate=True, build_tag="test_tag", verbose=1)], indirect=True +) def test_base_builder_creation( - datasets: List[Dataset], assets_path: str, soma_path: str, tmp_path: pathlib.Path, setup: None + datasets: List[Dataset], + census_build_args: CensusBuildArgs, + setup: None, ) -> None: """ Runs the builder, queries the census and performs a set of base assertions. @@ -34,25 +38,18 @@ def test_base_builder_creation( ), patch("cell_census_builder.build_soma.consolidate._run"), patch( "cell_census_builder.build_soma.validate.validate_consolidation", return_value=True ): - # Patching consolidate_tiledb_object, becuase is uses to much memory to run in github actions. - experiment_specifications = make_experiment_specs() - experiment_builders = [ExperimentBuilder(spec) for spec in experiment_specifications] - - from types import SimpleNamespace - - args = SimpleNamespace(multi_process=False, consolidate=True, build_tag="test_tag", verbose=True) - return_value = build(args, soma_path, assets_path, experiment_builders) + return_value = build(census_build_args) # return_value = 0 means that the build succeeded assert return_value == 0 # validate the cell_census - return_value = validate(args, soma_path, assets_path, experiment_specifications) + return_value = validate(census_build_args) assert return_value is True # Query the census and do assertions with soma.Collection.open( - uri=soma_path, + uri=census_build_args.soma_path.as_posix(), context=soma.options.SOMATileDBContext(tiledb_ctx=tiledb.Ctx({"vfs.s3.region": "us-west-2"})), ) as census: # There are 8 cells in total (4 from the first and 4 from the second datasets). 
They all belong to homo_sapiens @@ -130,21 +127,24 @@ def test_unicode_support(tmp_path: pathlib.Path) -> None: assert pd_df_in.read().concat().to_pandas()["value"].to_list() == ["Ünicode", "S̈upport"] -def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, manifest_csv: io.TextIOWrapper) -> None: - import pathlib - - pathlib.Path(tmp_path / "dest").mkdir() - args = SimpleNamespace(manifest=manifest_csv, test_first_n=None, verbose=2, multi_process=True) +@pytest.mark.parametrize( + "census_build_args", + [dict(manifest=True, test_first_n=None, verbose=2, build_tag="build_tag", multi_process=True)], + indirect=True, +) +def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, census_build_args: CensusBuildArgs) -> None: + # prereq for build step 1 + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) # Call the function - datasets = build_step1_get_source_datasets(args, f"{tmp_path}/dest") + datasets = build_step1_get_source_datasets(census_build_args) # Verify that 2 datasets are returned assert len(datasets) == 2 # Verify that the datasets have been staged - assert pathlib.Path(tmp_path / "dest" / "dataset_id_1.h5ad").exists() - assert pathlib.Path(tmp_path / "dest" / "dataset_id_2.h5ad").exists() + assert pathlib.Path(tmp_path / "build_tag" / "h5ads" / "dataset_id_1.h5ad").exists() + assert pathlib.Path(tmp_path / "build_tag" / "h5ads" / "dataset_id_2.h5ad").exists() def setup_module(module: ModuleType) -> None: diff --git a/tools/cell_census_builder/tests/test_source_assets.py b/tools/cell_census_builder/tests/test_source_assets.py index 0b5f5707e..6ffa9179f 100644 --- a/tools/cell_census_builder/tests/test_source_assets.py +++ b/tools/cell_census_builder/tests/test_source_assets.py @@ -1,28 +1,29 @@ import pathlib -from types import ModuleType, SimpleNamespace +from types import ModuleType from cell_census_builder.build_soma.datasets import Dataset from cell_census_builder.build_soma.source_assets import stage_source_assets +from 
cell_census_builder.build_state import CensusBuildArgs -def test_source_assets(tmp_path: pathlib.Path) -> None: +def test_source_assets(tmp_path: pathlib.Path, census_build_args: CensusBuildArgs) -> None: """ `source_assets` should copy the datasets from their `corpora_asset_h5ad_uri` to the specified `assets_dir` """ datasets = [] - pathlib.Path(tmp_path / "source").mkdir() - pathlib.Path(tmp_path / "dest").mkdir() + (tmp_path / "source").mkdir() + census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) for i in range(10): dataset = Dataset(f"dataset_{i}", corpora_asset_h5ad_uri=f"file://{tmp_path}/source/dataset_{i}.h5ad") - pathlib.Path(tmp_path / "source" / f"dataset_{i}.h5ad").touch() + (tmp_path / "source" / f"dataset_{i}.h5ad").touch() datasets.append(dataset) # Call the function - stage_source_assets(datasets, SimpleNamespace(verbose=True), tmp_path / "dest") + stage_source_assets(datasets, census_build_args) # Verify that the files exist for i in range(10): - assert pathlib.Path(tmp_path / "dest" / f"dataset_{i}.h5ad").exists() + assert (census_build_args.h5ads_path / f"dataset_{i}.h5ad").exists() def setup_module(module: ModuleType) -> None: diff --git a/tools/cell_census_builder/tests/test_util.py b/tools/cell_census_builder/tests/test_util.py index 77aaf8dbf..7e14f2706 100644 --- a/tools/cell_census_builder/tests/test_util.py +++ b/tools/cell_census_builder/tests/test_util.py @@ -1,6 +1,7 @@ import numpy as np import pytest -from cell_census_builder.build_soma.util import array_chunker, is_nonnegative_integral, uricat +from cell_census_builder.build_soma.util import array_chunker, is_nonnegative_integral +from cell_census_builder.util import urlcat, urljoin from scipy.sparse import coo_matrix, csr_matrix, triu @@ -119,14 +120,33 @@ def test_array_chunker() -> None: list(array_chunker(X)) -def test_uricat() -> None: - assert uricat("path", "to", "somewhere") == "path/to/somewhere" - assert uricat("path/", "to/", "somewhere") == 
"path/to/somewhere" - assert uricat("path/", "to/", "somewhere/") == "path/to/somewhere/" - assert uricat("file:///path/to", "somewhere") == "file:///path/to/somewhere" - assert uricat("file:///path/to/", "somewhere") == "file:///path/to/somewhere" - assert uricat("file:///path/to", "somewhere") == "file:///path/to/somewhere" - assert uricat("file:///path/to/", "/absolute") == "file:///absolute" - assert uricat("file://path/to", "file://somewhere") == "file://somewhere" - assert uricat("file:///path/to", "file://somewhere") == "file://somewhere" - assert uricat("file:///path/to", "file:///somewhere") == "file:///somewhere" +def test_urljoin() -> None: + assert urljoin("path", "to") == "to" + assert urljoin("path/", "to") == "path/to" + assert urljoin("path/", "to/") == "path/to/" + assert urljoin("file:///path/to", "somewhere") == "file:///path/somewhere" + assert urljoin("file:///path/to/", "somewhere") == "file:///path/to/somewhere" + assert urljoin("file:///path/to", "somewhere") == "file:///path/somewhere" + assert urljoin("file:///path/to/", "/absolute") == "file:///absolute" + assert urljoin("file://path/to", "file://somewhere") == "file://somewhere" + assert urljoin("file:///path/to", "file://somewhere") == "file://somewhere" + assert urljoin("file:///path/to", "file:///somewhere") == "file:///somewhere" + assert urljoin("s3://foo", "bar") == "s3://foo/bar" + assert urljoin("s3://foo/", "bar") == "s3://foo/bar" + assert urljoin("s3://foo", "bar/") == "s3://foo/bar/" + + +def test_urlcat() -> None: + assert urlcat("path", "to", "somewhere") == "path/to/somewhere" + assert urlcat("path/", "to/", "somewhere") == "path/to/somewhere" + assert urlcat("path/", "to/", "somewhere/") == "path/to/somewhere/" + assert urlcat("file:///path/to", "somewhere") == "file:///path/to/somewhere" + assert urlcat("file:///path/to/", "somewhere") == "file:///path/to/somewhere" + assert urlcat("file:///path/to", "somewhere") == "file:///path/to/somewhere" + assert 
urlcat("file:///path/to/", "/absolute") == "file:///absolute" + assert urlcat("file://path/to", "file://somewhere") == "file://somewhere" + assert urlcat("file:///path/to", "file://somewhere") == "file://somewhere" + assert urlcat("file:///path/to", "file:///somewhere") == "file:///somewhere" + assert urlcat("s3://foo", "bar", "baz") == "s3://foo/bar/baz" + assert urlcat("s3://foo", "bar/", "baz") == "s3://foo/bar/baz" + assert urlcat("s3://foo", "bar/", "baz/") == "s3://foo/bar/baz/" From 4ae07c1576d13044d1d2b9cf6357feb77dc4a82c Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 20 Mar 2023 19:08:45 +0000 Subject: [PATCH 06/34] add missing package to dependency list --- tools/cell_census_builder/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml index 6d49850ec..3d10e4b6d 100644 --- a/tools/cell_census_builder/pyproject.toml +++ b/tools/cell_census_builder/pyproject.toml @@ -43,6 +43,7 @@ dependencies= [ "gitpython", "attrs>=22.2.0", "psutil", + "pyyaml", ] # [tool.setuptools.packages.find] From a8963eef5fc47c9fc9df6e5464ad46381f2949c5 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 20 Mar 2023 20:43:17 +0000 Subject: [PATCH 07/34] cleanup host validation config --- .../src/cell_census_builder/build_state.py | 16 +++++++++--- .../cell_census_builder/host_validation.py | 25 +++++++++++-------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index dce19ddde..871dfcc1b 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -1,5 +1,6 @@ """ -build state and config +Manage the configuration and dynamic build state for the Census build. 
+ """ import functools import io @@ -12,15 +13,18 @@ import yaml from typing_extensions import Self +""" +Defaults for Census configuration. +""" + CENSUS_BUILD_CONFIG = "config.yaml" CENSUS_BUILD_STATE = "state.yaml" -CONFIG_DEFAULTS = { +CENSUS_CONFIG_DEFAULTS = { "build_tag": datetime.now().astimezone().date().isoformat(), "verbose": 1, "log_dir": "logs", "log_file": "build.log", "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", - # XXX TODO add host requirements, etc. "consolidate": True, "validate": True, "multi_process": False, @@ -28,6 +32,10 @@ "manifest": None, "test_first_n": None, "test_disable_dirty_get_check": False, + "host_validation_disable": False, # if True, host validation checks will be skipped + "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB + "host_validation_min_swap_space": 2 * 1024**4, # 2TiB + "host_validation_min_free_disk_space": 1 * 1024**4, # 1 TiB } @@ -81,7 +89,7 @@ def __setitem__(self, key: str, value: Any) -> None: class CensusBuildConfig(Namespace): - defaults = CONFIG_DEFAULTS + defaults = CENSUS_CONFIG_DEFAULTS def __init__(self, **kwargs: Any): config = self.defaults.copy() diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py b/tools/cell_census_builder/src/cell_census_builder/host_validation.py index 4de94e9c2..bca5b30eb 100644 --- a/tools/cell_census_builder/src/cell_census_builder/host_validation.py +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -8,11 +8,6 @@ from .build_state import CensusBuildArgs from .logging import hr_binary_unit, hr_decimal_unit -"""Defaults""" -MIN_PHYSICAL_MEMORY = 512 * 1024**3 # 512GiB -MIN_SWAP_MEMORY = 2 * 1024**4 # 2TiB -MIN_FREE_DISK_SPACE = 1 * 1024**4 # 1 TiB - def _check(condition: bool, message: str) -> bool: """Like assert, but logs""" @@ -70,11 +65,18 @@ def check_free_disk(working_dir: Union[str, os.PathLike[str]], min_free_disk_spa def check_host(args: CensusBuildArgs) -> bool: """Verify all host 
requirments. Return True if OK, False if conditions not met""" + if args.config.host_validation_disable: + return True + return ( check_os() - and check_physical_memory(args.config.get("min_physical_memory", MIN_PHYSICAL_MEMORY)) - and check_swap_memory(args.config.get("min_swap_memory", MIN_SWAP_MEMORY)) - and check_free_disk(args.working_dir, args.config.get("min_free_disk_space", MIN_FREE_DISK_SPACE)) + and check_physical_memory( + args.config.get("min_physical_memory", args.config.host_validation_min_physical_memory) + ) + and check_swap_memory(args.config.get("min_swap_memory", args.config.host_validation_min_swap_memory)) + and check_free_disk( + args.working_dir, args.config.get("min_free_disk_space", args.config.host_validation_min_free_disk_space) + ) ) @@ -82,13 +84,14 @@ def check_host(args: CensusBuildArgs) -> bool: # host which does not validate. if __name__ == "__main__": """For CLI testing""" + from .build_state import CENSUS_CONFIG_DEFAULTS def main() -> int: assert ( check_os() - and check_physical_memory(MIN_PHYSICAL_MEMORY) - and check_swap_memory(MIN_SWAP_MEMORY) - and check_free_disk(os.getcwd(), MIN_FREE_DISK_SPACE) + and check_physical_memory(CENSUS_CONFIG_DEFAULTS["host_validation_min_physical_memory"]) # type: ignore[arg-type] + and check_swap_memory(CENSUS_CONFIG_DEFAULTS["host_validation_min_swap_memory"]) # type: ignore[arg-type] + and check_free_disk(os.getcwd(), CENSUS_CONFIG_DEFAULTS["host_validation_min_free_disk_space"]) # type: ignore[arg-type] ) # assumed working directory is CWD print("Host validation success") return 0 From 76c6bbd5489aeca4e74d9d0786412af5f70e37ab Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 20 Mar 2023 20:45:10 +0000 Subject: [PATCH 08/34] update test CLI for host validation --- .../src/cell_census_builder/host_validation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/host_validation.py 
b/tools/cell_census_builder/src/cell_census_builder/host_validation.py index bca5b30eb..fd60db8d4 100644 --- a/tools/cell_census_builder/src/cell_census_builder/host_validation.py +++ b/tools/cell_census_builder/src/cell_census_builder/host_validation.py @@ -87,12 +87,15 @@ def check_host(args: CensusBuildArgs) -> bool: from .build_state import CENSUS_CONFIG_DEFAULTS def main() -> int: - assert ( + if not ( check_os() and check_physical_memory(CENSUS_CONFIG_DEFAULTS["host_validation_min_physical_memory"]) # type: ignore[arg-type] and check_swap_memory(CENSUS_CONFIG_DEFAULTS["host_validation_min_swap_memory"]) # type: ignore[arg-type] and check_free_disk(os.getcwd(), CENSUS_CONFIG_DEFAULTS["host_validation_min_free_disk_space"]) # type: ignore[arg-type] - ) # assumed working directory is CWD + ): # assumed working directory is CWD + print("Host validation FAILURE") + return 1 + print("Host validation success") return 0 From bccd4f1bffe63a0f224869025b815b2b885a1cf5 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 00:35:41 +0000 Subject: [PATCH 09/34] more namespace refactoring --- .../src/cell_census_builder/__main__.py | 19 +++++++++++++++---- .../build_soma/__init__.py | 7 +++++++ .../build_soma/__main__.py | 4 ++-- .../build_soma/{build.py => build_soma.py} | 0 .../{validate.py => validate_soma.py} | 0 .../src/cell_census_builder/build_state.py | 2 +- .../cell_census_builder/tests/test_builder.py | 10 +++++----- 7 files changed, 30 insertions(+), 12 deletions(-) rename tools/cell_census_builder/src/cell_census_builder/build_soma/{build.py => build_soma.py} (100%) rename tools/cell_census_builder/src/cell_census_builder/build_soma/{validate.py => validate_soma.py} (100%) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index 01192ff7b..36636efd2 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ 
b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -7,6 +7,8 @@ import s3fs from . import __version__ +from .build_soma import build as build_a_soma +from .build_soma import validate as validate_a_soma from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig from .host_validation import check_host from .util import process_init, urlcat @@ -67,6 +69,7 @@ def do_build(args: CensusBuildArgs) -> int: do_prebuild_set_defaults, do_prebuild_checks, do_build_soma, + do_validate_soma, do_create_reports, ] try: @@ -79,8 +82,8 @@ def do_build(args: CensusBuildArgs) -> int: return cc logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: complete") - except Exception as e: - logging.critical(f"Caught exception, exiting: {str(e)}") + except Exception: + logging.critical("Caught exception, exiting", exc_info=True) return 1 logging.info("Census build: completed") @@ -113,8 +116,16 @@ def do_prebuild_checks(args: CensusBuildArgs) -> int: def do_build_soma(args: CensusBuildArgs) -> int: - # WIP - # args.state["do_build_soma"] = True + if not build_a_soma(args): + return 1 + args.state["do_build_soma"] = True + return 0 + + +def do_validate_soma(args: CensusBuildArgs) -> int: + if not validate_a_soma(args): + return 1 + args.state["do_validate_soma"] = True return 0 diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py index e69de29bb..1d58b873a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__init__.py @@ -0,0 +1,7 @@ +from .build_soma import build +from .validate_soma import validate + +__all__ = [ + "build", + "validate", +] diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py index 
d1951b1b1..f88cfbe36 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py @@ -5,8 +5,8 @@ from ..build_state import CensusBuildArgs, CensusBuildConfig from ..util import process_init -from .build import build -from .validate import validate +from .build_soma import build +from .validate_soma import validate def main() -> int: diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/build.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py similarity index 100% rename from tools/cell_census_builder/src/cell_census_builder/build_soma/build.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py similarity index 100% rename from tools/cell_census_builder/src/cell_census_builder/build_soma/validate.py rename to tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 871dfcc1b..66d7e39b2 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -31,7 +31,7 @@ "max_workers": None, "manifest": None, "test_first_n": None, - "test_disable_dirty_get_check": False, + "test_disable_dirty_git_check": False, "host_validation_disable": False, # if True, host validation checks will be skipped "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB "host_validation_min_swap_space": 2 * 1024**4, # 2TiB diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py index 3a626b3da..61e54ae1b 100644 --- 
a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -10,7 +10,8 @@ import pytest import tiledb import tiledbsoma as soma -from cell_census_builder.build_soma.build import build, build_step1_get_source_datasets +from cell_census_builder.build_soma import build, validate +from cell_census_builder.build_soma.build_soma import build_step1_get_source_datasets from cell_census_builder.build_soma.datasets import Dataset from cell_census_builder.build_soma.globals import ( CENSUS_DATA_NAME, @@ -18,7 +19,6 @@ FEATURE_DATASET_PRESENCE_MATRIX_NAME, MEASUREMENT_RNA_NAME, ) -from cell_census_builder.build_soma.validate import validate from cell_census_builder.build_state import CensusBuildArgs @@ -33,10 +33,10 @@ def test_base_builder_creation( """ Runs the builder, queries the census and performs a set of base assertions. """ - with patch("cell_census_builder.build_soma.build.prepare_file_system"), patch( - "cell_census_builder.build_soma.build.build_step1_get_source_datasets", return_value=datasets + with patch("cell_census_builder.build_soma.build_soma.prepare_file_system"), patch( + "cell_census_builder.build_soma.build_soma.build_step1_get_source_datasets", return_value=datasets ), patch("cell_census_builder.build_soma.consolidate._run"), patch( - "cell_census_builder.build_soma.validate.validate_consolidation", return_value=True + "cell_census_builder.build_soma.validate_soma.validate_consolidation", return_value=True ): return_value = build(census_build_args) From 6dcb088c41cd4ecc10e1238cd9b96eb482b2a645 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 02:10:57 +0000 Subject: [PATCH 10/34] add reports to workflow --- tools/cell_census_builder/README.md | 38 +++++++- .../src/cell_census_builder/__main__.py | 49 +++++------ .../src/cell_census_builder/build_state.py | 1 - .../src/cell_census_builder/census_summary.py | 88 +++++++++++-------- 4 files changed, 111 insertions(+), 65 deletions(-) diff --git 
a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 4dbda235d..99c89c337 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -7,13 +7,45 @@ This tool is not intended for end-users - it is used by CZI to periodically crea CELLxGENE data in the above format. The remainder of this document is intended for users of the build package. -Please see the top-level [README](../../README.md) for more information on the Cell Census. +Please see the top-level [README](../../README.md) for more information on the Cell Census and +using the Cell Census data. # Overview This package contains sub-modules, each of which automate elements of the Cell Census build and release process. -The ultimate intention is to integrate these into an automated multi-step workflow. Until that occurs, individual steps -are provided as modules with their own `__main__`, to be manually invoked. +They are wrapped at the package top-level by a `__main__` which implements the Cell Census build process, +with standard defaults. + +The top-level build can be invoked as follows: + +- Create a working directory, e.g., `census-build` or equivalent. +- If any configuration defaults need to be overridden, create a `config.yaml` in the working directory containing the default overrides. +- Run the build as `python -m cell_census_builder your-working_dir` + +This will perform four steps (more will be added in the future): + +- host validation +- build soma +- validate soma +- build reports (e.g., summary) + +This will result in the following file tree: + +``` +working_dir: + | + +-- config.yaml # build config (user provided, read-only) + +-- state.yaml # build runtime state (e.g., census version tag, etc.) + +-- build-version # defaults to current date, e.g., 2023-01-20 + | +-- soma + | +-- h5ads + +-- logs # log files from various stages + | +-- build.log + | +-- ...
+ +-- reports + +-- census-summary-VERSION.txt + +-- census-diff-VERSION.txt +``` ## `host_validation` module diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index 36636efd2..dde5a9925 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -7,30 +7,9 @@ import s3fs from . import __version__ -from .build_soma import build as build_a_soma -from .build_soma import validate as validate_a_soma from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig -from .host_validation import check_host from .util import process_init, urlcat -""" -File tree for the build. - -working_dir: - | - +-- config.yaml # build config (user provided, read-only) - +-- state.yaml # build runtime state (eg., census version tag, etc) - +-- soma - +-- h5ads - +-- logs # log files from various stages - | +-- build.log - | +-- ... - +-- reports - +-- census-summary-VERSION.txt - +-- census-diff-VERSION.txt - -""" - def main() -> int: cli_parser = create_args_parser() @@ -98,6 +77,7 @@ def do_prebuild_set_defaults(args: CensusBuildArgs) -> int: def do_prebuild_checks(args: CensusBuildArgs) -> int: """Pre-build checks for host, config, etc. 
All pre-conditions should go here.""" + from .host_validation import check_host # check host configuration, e.g., free disk space if not check_host(args): @@ -116,22 +96,41 @@ def do_prebuild_checks(args: CensusBuildArgs) -> int: def do_build_soma(args: CensusBuildArgs) -> int: - if not build_a_soma(args): - return 1 + from .build_soma import build as build_a_soma + + if (cc := build_a_soma(args)) != 0: + return cc + args.state["do_build_soma"] = True return 0 def do_validate_soma(args: CensusBuildArgs) -> int: + from .build_soma import validate as validate_a_soma + if not validate_a_soma(args): + logging.critical("Validation of the census build has failed.") return 1 + args.state["do_validate_soma"] = True return 0 def do_create_reports(args: CensusBuildArgs) -> int: - # WIP - # args.state["do_create_reports"] = True + from .census_summary import display_summary, display_diff + + reports_dir = args.working_dir / "reports" + reports_dir.mkdir(parents=True, exist_ok=True) + + logging.info("Creating summary report") + with open(reports_dir / f"census-summary-{args.build_tag}.txt", mode="w") as f: + display_summary(uri=args.soma_path.as_posix(), file=f) + + logging.info("Creating diff report (new build vs 'latest')") + with open(reports_dir / f"census-diff-{args.build_tag}.txt", mode="w") as f: + display_diff(uri=args.soma_path.as_posix(), previous_census_version="latest", file=f) + + args.state["do_create_reports"] = True return 0 diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 66d7e39b2..21036b38b 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -26,7 +26,6 @@ "log_file": "build.log", "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", "consolidate": True, - "validate": True, "multi_process": False, "max_workers": None, "manifest": None, diff --git 
a/tools/cell_census_builder/src/cell_census_builder/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/census_summary.py index 86fe202af..0c016550b 100644 --- a/tools/cell_census_builder/src/cell_census_builder/census_summary.py +++ b/tools/cell_census_builder/src/cell_census_builder/census_summary.py @@ -1,10 +1,11 @@ import argparse import sys +from typing import TextIO, Optional import cell_census import pandas as pd -from .build.globals import CENSUS_DATA_NAME, CENSUS_INFO_NAME +from .build_soma.globals import CENSUS_DATA_NAME, CENSUS_INFO_NAME # Print all of the Pandas DataFrames, except the dimensions pd.options.display.max_columns = None # type: ignore[assignment] # None is legal per Pandas documentation. @@ -13,8 +14,13 @@ pd.options.display.show_dimensions = False # type: ignore[assignment] # boolean is legal per Pandas documentation. -def display_summary(census_version: str) -> int: - census = cell_census.open_soma(census_version=census_version) +def display_summary( + *, + census_version: Optional[str] = "latest", + uri: Optional[str] = None, + file: Optional[TextIO] = None, +) -> int: + census = cell_census.open_soma(census_version=census_version, uri=uri) COLS_TO_QUERY = [ ("soma_joinid", "cells"), @@ -31,18 +37,27 @@ def display_summary(census_version: str) -> int: # Use Pandas to summarize and display stats = [(organism, col[1], df[col[0]].nunique()) for organism, df in obs_df.items() for col in COLS_TO_QUERY] - print(census["census_info"]["summary"].read().concat().to_pandas()[["label", "value"]].to_string(index=False)) + print( + census["census_info"]["summary"].read().concat().to_pandas()[["label", "value"]].to_string(index=False), + file=file, + ) stats_df = pd.DataFrame(stats, columns=["organism", "attribute", "unique count"]) display_stats_df = pd.pivot(stats_df, index=["organism"], columns=["attribute"], values=["unique count"]) - print(display_stats_df) - print() + print(display_stats_df, file=file) + print(file=file) 
return 0 -def display_diff(census_version: str, previous_census_version: str) -> int: - census = cell_census.open_soma(census_version=census_version) - previous_census = cell_census.open_soma(census_version=previous_census_version) +def display_diff( + census_version: Optional[str] = "latest", + uri: Optional[str] = None, + previous_census_version: Optional[str] = None, + previous_uri: Optional[str] = None, + file: Optional[TextIO] = None, +) -> int: + census = cell_census.open_soma(census_version=census_version, uri=uri) + previous_census = cell_census.open_soma(census_version=previous_census_version, uri=previous_uri) # Total cell count deltas by experiment (mouse, human) @@ -50,9 +65,10 @@ def display_diff(census_version: str, previous_census_version: str) -> int: curr_count = census[CENSUS_DATA_NAME][organism].obs.count prev_count = previous_census[CENSUS_DATA_NAME][organism].obs.count print( - f"Previous {organism} cell count: {prev_count}, current {organism} cell count: {curr_count}, delta {curr_count - prev_count}" + f"Previous {organism} cell count: {prev_count}, current {organism} cell count: {curr_count}, delta {curr_count - prev_count}", + file=file, ) - print() + print(file=file) prev_datasets = previous_census[CENSUS_INFO_NAME]["datasets"].read().concat().to_pandas() curr_datasets = census[CENSUS_INFO_NAME]["datasets"].read().concat().to_pandas() @@ -64,20 +80,20 @@ def display_diff(census_version: str, previous_census_version: str) -> int: added_datasets = curr_datasets_ids - prev_dataset_ids removed_datasets = prev_dataset_ids - curr_datasets_ids if added_datasets: - print(f"Datasets that were added ({len(added_datasets)})") + print(f"Datasets that were added ({len(added_datasets)})", file=file) added_datasets_df = curr_datasets[curr_datasets["dataset_id"].isin(added_datasets)] - print(added_datasets_df[["dataset_id", "dataset_title", "collection_name"]]) + print(added_datasets_df[["dataset_id", "dataset_title", "collection_name"]], file=file) else: 
- print("No datasets were added") - print() + print("No datasets were added", file=file) + print(file=file) if removed_datasets: - print(f"Datasets that were removed ({len(removed_datasets)}") + print(f"Datasets that were removed ({len(removed_datasets)})", file=file) removed_datasets_df = prev_datasets[prev_datasets["dataset_id"].isin(removed_datasets)] - print(removed_datasets_df[["dataset_id", "dataset_title", "collection_name"]]) + print(removed_datasets_df[["dataset_id", "dataset_title", "collection_name"]], file=file) else: - print("No datasets were removed") - print() + print("No datasets were removed", file=file) + print(file=file) # Datasets in both versions but that have differing cell counts joined = prev_datasets.join( @@ -88,9 +104,9 @@ def display_diff(census_version: str, previous_census_version: str) -> int: ][["dataset_id", "dataset_total_cell_count_prev", "dataset_total_cell_count_curr"]] if not datasets_with_different_cell_counts.empty: - print("Datasets that have a different cell count") - print(datasets_with_different_cell_counts) - print() + print("Datasets that have a different cell count", file=file) + print(datasets_with_different_cell_counts, file=file) + print(file=file) # Deltas between summary_cell_counts dataframes y = census["census_info"]["summary_cell_counts"].read().concat().to_pandas() @@ -104,17 +120,17 @@ def display_diff(census_version: str, previous_census_version: str) -> int: ["total_cell_count_prev", "total_cell_count_curr"] ].reset_index() if not delta.empty: - print("Summary delta - total cell counts") - print(delta) - print() + print("Summary delta - total cell counts", file=file) + print(delta, file=file) + print(file=file) delta = w.loc[w["unique_cell_count_prev"] != w["unique_cell_count_curr"]][ ["unique_cell_count_prev", "unique_cell_count_curr"] ].reset_index() if not delta.empty: - print("Summary delta - unique cell counts") - print(delta) - print() + print("Summary delta - unique cell counts", file=file) + 
print(delta, file=file) + print(file=file) # Genes removed, added for organism in census[CENSUS_DATA_NAME]: @@ -123,19 +139,19 @@ def display_diff(census_version: str, previous_census_version: str) -> int: new_genes = set(curr_genes["feature_id"]) - set(prev_genes["feature_id"]) if new_genes: - print("Genes added") - print(new_genes) + print("Genes added", file=file) + print(new_genes, file=file) else: "No genes were added." - print() + print(file=file) removed_genes = set(prev_genes["feature_id"]) - set(curr_genes["feature_id"]) if removed_genes: - print("Genes removed") - print(removed_genes) + print("Genes removed", file=file) + print(removed_genes, file=file) else: "No genes were removed." - print() + print(file=file) return 0 @@ -161,9 +177,9 @@ def main() -> int: assert args.subcommand in ["summarize", "diff"] if args.subcommand == "summarize": - return display_summary(args.census_version) + return display_summary(census_version=args.census_version) elif args.subcommand == "diff": - return display_diff(args.census_version, args.previous_version) + return display_diff(census_version=args.census_version, previous_census_version=args.previous_version) return 0 From e8829e0aa0cc4f1c6f09d577e55b1ab425eb0474 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 02:11:24 +0000 Subject: [PATCH 11/34] lint --- tools/cell_census_builder/src/cell_census_builder/__main__.py | 2 +- .../src/cell_census_builder/census_summary.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index dde5a9925..e613967d6 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -117,7 +117,7 @@ def do_validate_soma(args: CensusBuildArgs) -> int: def do_create_reports(args: CensusBuildArgs) -> int: - from .census_summary import display_summary, 
display_diff + from .census_summary import display_diff, display_summary reports_dir = args.working_dir / "reports" reports_dir.mkdir(parents=True, exist_ok=True) diff --git a/tools/cell_census_builder/src/cell_census_builder/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/census_summary.py index 0c016550b..e8b33f791 100644 --- a/tools/cell_census_builder/src/cell_census_builder/census_summary.py +++ b/tools/cell_census_builder/src/cell_census_builder/census_summary.py @@ -1,6 +1,6 @@ import argparse import sys -from typing import TextIO, Optional +from typing import Optional, TextIO import cell_census import pandas as pd From 2c2dba95597ef09723593bcf42c93ebe326096bc Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 02:25:52 +0000 Subject: [PATCH 12/34] handle default config correctly --- .../src/cell_census_builder/__main__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index e613967d6..c1948757b 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -19,14 +19,15 @@ def main() -> int: if not working_dir.is_dir(): logging.critical("Census builder: unable to find working directory - exiting.") return 1 - if not (working_dir / CENSUS_BUILD_CONFIG).is_file(): - logging.critical("Census builder: unable to find config.yaml in working directory - exiting.") - return 1 if (working_dir / CENSUS_BUILD_STATE).exists(): logging.critical("Found pre-existing census build in working directory - aborting census build.") return 1 - build_config = CensusBuildConfig.load(working_dir / CENSUS_BUILD_CONFIG) + if (working_dir / CENSUS_BUILD_CONFIG).is_file(): + build_config = CensusBuildConfig.load(working_dir / CENSUS_BUILD_CONFIG) + else: + build_config = CensusBuildConfig() + build_args = 
CensusBuildArgs(working_dir=working_dir, config=build_config) # Process initialization/setup must be done early From 4680c87c07ff85923f3d599cc92b2c5b4541422f Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 02:26:20 +0000 Subject: [PATCH 13/34] fix typo in defaults --- .../cell_census_builder/src/cell_census_builder/build_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 21036b38b..efafdfea7 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -33,7 +33,7 @@ "test_disable_dirty_git_check": False, "host_validation_disable": False, # if True, host validation checks will be skipped "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB - "host_validation_min_swap_space": 2 * 1024**4, # 2TiB + "host_validation_min_swap_memory": 2 * 1024**4, # 2TiB "host_validation_min_free_disk_space": 1 * 1024**4, # 1 TiB } From cb0d3f7525f0772d5f65b265192002a3ca0b6321 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 23:21:07 +0000 Subject: [PATCH 14/34] fix report typo --- .../src/cell_census_builder/census_summary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/census_summary.py b/tools/cell_census_builder/src/cell_census_builder/census_summary.py index e8b33f791..dcf472e74 100644 --- a/tools/cell_census_builder/src/cell_census_builder/census_summary.py +++ b/tools/cell_census_builder/src/cell_census_builder/census_summary.py @@ -142,7 +142,7 @@ def display_diff( print("Genes added", file=file) print(new_genes, file=file) else: - "No genes were added." 
+ print("No genes were added.", file=file) print(file=file) removed_genes = set(prev_genes["feature_id"]) - set(curr_genes["feature_id"]) @@ -150,7 +150,7 @@ def display_diff( print("Genes removed", file=file) print(removed_genes, file=file) else: - "No genes were removed." + print("No genes were removed.", file=file) print(file=file) return 0 From d3076160cf4d121d81dd436ae9385eb2f60f84b7 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 23:21:55 +0000 Subject: [PATCH 15/34] fix state load issue; enable multi-process by default --- .../src/cell_census_builder/build_state.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index efafdfea7..91a4a8069 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -10,6 +10,7 @@ from typing import Any, Iterator, Mapping, Union import attrs +import psutil import yaml from typing_extensions import Self @@ -20,21 +21,33 @@ CENSUS_BUILD_CONFIG = "config.yaml" CENSUS_BUILD_STATE = "state.yaml" CENSUS_CONFIG_DEFAULTS = { - "build_tag": datetime.now().astimezone().date().isoformat(), + # General config "verbose": 1, "log_dir": "logs", "log_file": "build.log", - "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", "consolidate": True, - "multi_process": False, - "max_workers": None, - "manifest": None, - "test_first_n": None, - "test_disable_dirty_git_check": False, + # + # Paths and census version name determined by spec. + "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", + "build_tag": datetime.now().astimezone().date().isoformat(), + # + # Default multi-process. Memory scaling based on empirical tests. 
+ "multi_process": True, + "max_workers": 2 + int(psutil.virtual_memory().total / (96 * 1024**3)), + # + # XXX TODO: this exposes a bug in the validation pass + # "multi_process": False, + # "max_workers": None, + # + # Host minimum resource validation "host_validation_disable": False, # if True, host validation checks will be skipped "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB "host_validation_min_swap_memory": 2 * 1024**4, # 2TiB "host_validation_min_free_disk_space": 1 * 1024**4, # 1 TiB + # For testing convenience only + "manifest": None, + "test_first_n": None, + "test_disable_dirty_git_check": False, } @@ -129,11 +142,13 @@ def __setitem__(self, key: str, value: Any) -> None: def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: if isinstance(file, (str, os.PathLike)): with open(file) as state_log: - documents = yaml.safe_load_all(state_log) + documents = list(yaml.safe_load_all(state_log)) else: - documents = yaml.safe_load_all(file) + documents = list(yaml.safe_load_all(file)) - return cls(**functools.reduce(lambda acc, r: acc.update(r) or acc, documents, {})) + state = cls(**functools.reduce(lambda acc, r: acc.update(r) or acc, documents, {})) + state.__dirty_keys.clear() + return state def commit(self, file: Union[str, os.PathLike[str]]) -> None: # append dirty elements (atomic on Posix) From 1f70ab91f6d7b261a68fec253095fa31b18877ea Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 23:22:30 +0000 Subject: [PATCH 16/34] fix typo in program name --- .../src/cell_census_builder/build_soma/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py index f88cfbe36..128f423d3 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/__main__.py @@ 
-29,7 +29,7 @@ def main() -> int: def create_args_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(prog="cell_census_builder") + parser = argparse.ArgumentParser(prog="cell_census_builder.build_soma") parser.add_argument("uri", type=str, help="Census top-level URI") parser.add_argument("-v", "--verbose", action="count", default=0, help="Increase logging verbosity") parser.add_argument( From e40653a9846fc43122554b006b3a8a9432db2770 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 21 Mar 2023 23:22:50 +0000 Subject: [PATCH 17/34] add build resumption --- .../src/cell_census_builder/__main__.py | 83 +++++++++++-------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index c1948757b..ea68e31c3 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -7,7 +7,7 @@ import s3fs from . 
import __version__ -from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig +from .build_state import CENSUS_BUILD_CONFIG, CENSUS_BUILD_STATE, CensusBuildArgs, CensusBuildConfig, CensusBuildState from .util import process_init, urlcat @@ -19,25 +19,30 @@ def main() -> int: if not working_dir.is_dir(): logging.critical("Census builder: unable to find working directory - exiting.") return 1 - if (working_dir / CENSUS_BUILD_STATE).exists(): - logging.critical("Found pre-existing census build in working directory - aborting census build.") - return 1 if (working_dir / CENSUS_BUILD_CONFIG).is_file(): build_config = CensusBuildConfig.load(working_dir / CENSUS_BUILD_CONFIG) else: build_config = CensusBuildConfig() - build_args = CensusBuildArgs(working_dir=working_dir, config=build_config) + if not cli_args.test_resume: + if (working_dir / CENSUS_BUILD_STATE).exists(): + logging.critical("Found pre-existing census build in working directory - aborting census build.") + return 1 + build_state = CensusBuildState() + else: + build_state = CensusBuildState.load(working_dir / CENSUS_BUILD_STATE) + + build_args = CensusBuildArgs(working_dir=working_dir, config=build_config, state=build_state) # Process initialization/setup must be done early process_init(build_args) # Return process exit code (or raise, which exits with a code of `1`) - return do_build(build_args) + return do_build(build_args, skip_completed_steps=cli_args.test_resume) -def do_build(args: CensusBuildArgs) -> int: +def do_build(args: CensusBuildArgs, skip_completed_steps: bool = False) -> int: """ Top-level build sequence. @@ -45,7 +50,7 @@ def do_build(args: CensusBuildArgs) -> int: exit code or raises. 
""" logging.info(f"Census build: start [version={__version__}]") - build_steps: List[Callable[[CensusBuildArgs], int]] = [ + build_steps: List[Callable[[CensusBuildArgs], bool]] = [ do_prebuild_set_defaults, do_prebuild_checks, do_build_soma, @@ -54,13 +59,19 @@ def do_build(args: CensusBuildArgs) -> int: ] try: for n, build_step in enumerate(build_steps, start=1): - logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: start") - cc = build_step(args) + step_n_of = f"Build step {build_step.__name__} [{n} of {len(build_steps)}]" + if skip_completed_steps and args.state.get(build_step.__name__): + logging.info(f"{step_n_of}: already complete, skipping.") + continue + + logging.info(f"{step_n_of}: start") + if not build_step(args): + logging.critical(f"{step_n_of}: failed, aborting build.") + return 1 + + args.state[build_step.__name__] = True args.state.commit(args.working_dir / CENSUS_BUILD_STATE) - if cc != 0: - logging.critical(f"Build step {build_step.__name__} returned error code {cc}: aborting build.") - return cc - logging.info(f"Build step {build_step.__name__} [{n} of {len(build_steps)}]: complete") + logging.info(f"{step_n_of}: complete") except Exception: logging.critical("Caught exception, exiting", exc_info=True) @@ -70,19 +81,18 @@ def do_build(args: CensusBuildArgs) -> int: return 0 -def do_prebuild_set_defaults(args: CensusBuildArgs) -> int: - """Set any default state required by build steps.""" - args.state["do_prebuild_set_defaults"] = True - return 0 +def do_prebuild_set_defaults(args: CensusBuildArgs) -> bool: + """Set any defaults required by build steps.""" + return True -def do_prebuild_checks(args: CensusBuildArgs) -> int: +def do_prebuild_checks(args: CensusBuildArgs) -> bool: """Pre-build checks for host, config, etc. 
All pre-conditions should go here.""" from .host_validation import check_host # check host configuration, e.g., free disk space if not check_host(args): - return 1 + return False # verify the build tag is not already published/in use build_tag = args.config.build_tag @@ -90,34 +100,32 @@ def do_prebuild_checks(args: CensusBuildArgs) -> int: s3path = urlcat(args.config.cell_census_S3_path, build_tag) if s3fs.S3FileSystem(anon=True).exists(s3path): logging.error(f"Build tag {build_tag} already exists at {s3path}.") - return 1 + return False - args.state["do_prebuild_checks"] = True - return 0 + return True -def do_build_soma(args: CensusBuildArgs) -> int: +def do_build_soma(args: CensusBuildArgs) -> bool: from .build_soma import build as build_a_soma if (cc := build_a_soma(args)) != 0: - return cc + logging.critical(f"Build of census failed with code {cc}.") + return False - args.state["do_build_soma"] = True - return 0 + return True -def do_validate_soma(args: CensusBuildArgs) -> int: +def do_validate_soma(args: CensusBuildArgs) -> bool: from .build_soma import validate as validate_a_soma if not validate_a_soma(args): logging.critical("Validation of the census build has failed.") - return 1 + return False - args.state["do_validate_soma"] = True - return 0 + return True -def do_create_reports(args: CensusBuildArgs) -> int: +def do_create_reports(args: CensusBuildArgs) -> bool: from .census_summary import display_diff, display_summary reports_dir = args.working_dir / "reports" @@ -131,13 +139,18 @@ def do_create_reports(args: CensusBuildArgs) -> int: with open(reports_dir / f"census-diff-{args.build_tag}.txt", mode="w") as f: display_diff(uri=args.soma_path.as_posix(), previous_census_version="latest", file=f) - args.state["do_create_reports"] = True - return 0 + return True def create_args_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(prog="cell_census_builder") + parser = argparse.ArgumentParser(prog="cell_census_builder", 
description="Build the official cell census.") parser.add_argument("working_dir", type=str, help="Working directory for the build") + parser.add_argument( + "--test-resume", + action=argparse.BooleanOptionalAction, + default=False, + help="Attempt to resume the build by skipping completed workflow steps. CAUTION: TEST OPTION ONLY.", + ) return parser From 18e1b671ea7f854d04535cdedca78da0095aae81 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 14:23:03 +0000 Subject: [PATCH 18/34] dockerfile update --- tools/cell_census_builder/Dockerfile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 tools/cell_census_builder/Dockerfile diff --git a/tools/cell_census_builder/Dockerfile b/tools/cell_census_builder/Dockerfile new file mode 100644 index 000000000..67a7a3d52 --- /dev/null +++ b/tools/cell_census_builder/Dockerfile @@ -0,0 +1,17 @@ +FROM ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +ARG COMMIT_SHA +ENV COMMIT_SHA=${COMMIT_SHA} + +RUN apt update && apt -y full-upgrade && apt -y install python3.10-venv python3-pip awscli + +ADD dist/ /tools/cell_census_builder + +RUN python3 -m pip install -U pip +RUN python3 -m pip -v install /tools/cell_census_builder/*.whl + +WORKDIR /census-build + +ENTRYPOINT ["python3", "-m", "cell_census_builder", "."] From c35554aaa69e56d042be6ef71ab9fd5e0f54c325 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 16:19:26 +0000 Subject: [PATCH 19/34] docker build refinement --- tools/cell_census_builder/Makefile | 10 ++++++ tools/cell_census_builder/README.md | 35 ++++++++++++------- .../src/cell_census_builder/__main__.py | 4 ++- .../build_soma/build_soma.py | 2 +- .../build_soma/manifest.py | 10 ++++-- .../build_soma/source_assets.py | 2 +- .../cell_census_builder/build_soma/util.py | 1 + .../src/cell_census_builder/build_state.py | 3 +- tools/cell_census_builder/tests/conftest.py | 9 +++-- .../cell_census_builder/tests/test_builder.py | 2 +- .../tests/test_manifest.py | 17 
+++++++-- 11 files changed, 66 insertions(+), 29 deletions(-) create mode 100644 tools/cell_census_builder/Makefile diff --git a/tools/cell_census_builder/Makefile b/tools/cell_census_builder/Makefile new file mode 100644 index 000000000..c82691a75 --- /dev/null +++ b/tools/cell_census_builder/Makefile @@ -0,0 +1,10 @@ +# Build docker container + +.PHONY: container +container: + python3 -m build . + docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . + +.PHONY: clean +clean: + rm -rf build dist diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 99c89c337..40b2ce6f1 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -13,7 +13,7 @@ using the Cell Census data. # Overview This package contains sub-modules, each of which automate elements of the Cell Census build and release process. -They are wrapped at the package top-leveby by a `__main__` which implements the Cell Census build process, +They are wrapped at the package top-level by by a `__main__` which implements the Cell Census build process, with standard defaults. The top-level build can be invoked as follows: @@ -47,6 +47,27 @@ working_dir: +-- census-diff-VERSION.txt ``` +# Building and using the Docker container + +The standard Census build is expected to be done via a Docker container. 
+ +To build the container, do a `git pull` to the version you want to use, and do the following to create a container called `cell-census-builder`: + +``` +$ cd tools/cell_census_builder +$ make container +``` + +To use the container to build the _full_ census, with default options, pick a working directory (e.g., /tmp/census-build), and: + +``` +$ mkdir /tmp/census-build +$ chmod ug+s /tmp/census-build # optional, but makes permissions handling simpler +$ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder +``` + +# Module-specific notes + ## `host_validation` module Module which provides a set of checks that the current host machine has the requisite capabilities @@ -102,15 +123,3 @@ If you run out of memory, reduce `--max-workers`. You can also try a higher numb You can specify a file system path or a URI in the second field - To create a cell census at ``, execute: > $ python -m cell_census_builder build --manifest - -### Other info - -There are more options discoverable via the `--help` command line option. - -Note on required host resources: - -- all H5AD files not on the local disk will be downloaded/cached locally. There must be - sufficient local file system space. Location of cache can be controlled with the - environment variable `FSSPEC_CACHE_DIR` -- each H5AD will be read into memory, in its entirety. 
Sufficient RAM must be present to - allow for this (and to do so for multiple H5ADs concurrently if you use the `--multi-process` option) diff --git a/tools/cell_census_builder/src/cell_census_builder/__main__.py b/tools/cell_census_builder/src/cell_census_builder/__main__.py index ea68e31c3..618234ff2 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__main__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__main__.py @@ -35,7 +35,7 @@ def main() -> int: build_args = CensusBuildArgs(working_dir=working_dir, config=build_config, state=build_state) - # Process initialization/setup must be done early + # Process initialization/setup must be done early. NOTE: do NOT log before this line! process_init(build_args) # Return process exit code (or raise, which exits with a code of `1`) @@ -50,6 +50,8 @@ def do_build(args: CensusBuildArgs, skip_completed_steps: bool = False) -> int: exit code or raises. """ logging.info(f"Census build: start [version={__version__}]") + logging.info(args) + build_steps: List[Callable[[CensusBuildArgs], bool]] = [ do_prebuild_set_defaults, do_prebuild_checks, diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py index ab3693b75..612a0fc38 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/build_soma.py @@ -38,7 +38,7 @@ def prepare_file_system(args: CensusBuildArgs) -> None: raise Exception("Census build path already exists - aborting build") # Ensure that the git tree is clean - if not args.config.test_disable_dirty_git_check and is_git_repo_dirty(): + if not args.config.disable_dirty_git_check and is_git_repo_dirty(): raise Exception("The git repo has uncommitted changes - aborting build") # Create top-level build directories diff --git 
a/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py index 53a6de916..1c7c4d266 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py @@ -68,7 +68,7 @@ def load_manifest_from_CxG() -> List[Dataset]: logging.info(f"Found {len(datasets)} datasets, in {len(collections)} collections") # load per-dataset schema version - with concurrent.futures.ThreadPoolExecutor(max_workers=16) as tp: + with concurrent.futures.ThreadPoolExecutor(max_workers=32) as tp: dataset_metadata = tp.map( lambda d: fetch_json( f"{CXG_BASE_URI}curation/v1/collections/{d['collection_id']}/datasets/{d['dataset_id']}" @@ -130,13 +130,17 @@ def load_manifest_from_CxG() -> List[Dataset]: return [Dataset(**d) for d in datasets.values()] -def load_manifest(manifest_fp: Optional[io.TextIOBase] = None) -> List[Dataset]: +def load_manifest(manifest_fp: Optional[Union[str, io.TextIOBase]] = None) -> List[Dataset]: """ Load dataset manifest from the file pointer if provided, else bootstrap the load rom the CELLxGENE REST API. 
""" if manifest_fp is not None: - datasets = load_manifest_from_fp(manifest_fp) + if isinstance(manifest_fp, str): + with open(manifest_fp) as f: + datasets = load_manifest_from_fp(f) + else: + datasets = load_manifest_from_fp(manifest_fp) else: datasets = load_manifest_from_CxG() diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py index dd2f0041a..15810c2c5 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py @@ -21,7 +21,7 @@ def stage_source_assets(datasets: List[Dataset], args: CensusBuildArgs) -> None: datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize, reverse=True) N = len(datasets) - if not args.config.multi_process: + if args.config.multi_process: n_workers = max(min(8, cpu_count()), 64) with create_process_pool_executor(args, n_workers) as pe: paths = list( diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py index e2adf1c01..f59d9e740 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/util.py @@ -122,6 +122,7 @@ def get_git_commit_sha() -> str: commit_sha_var = os.getenv("COMMIT_SHA") if commit_sha_var is not None: return commit_sha_var + import git # Scoped import - this requires the git executable to exist on the machine repo = git.Repo(search_parent_directories=True) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 91a4a8069..54e892191 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -26,6 +26,7 @@ 
"log_dir": "logs", "log_file": "build.log", "consolidate": True, + "disable_dirty_git_check": True, # # Paths and census version name determined by spec. "cell_census_S3_path": "s3://cellxgene-data-public/cell-census", @@ -44,10 +45,10 @@ "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB "host_validation_min_swap_memory": 2 * 1024**4, # 2TiB "host_validation_min_free_disk_space": 1 * 1024**4, # 1 TiB + # # For testing convenience only "manifest": None, "test_first_n": None, - "test_disable_dirty_git_check": False, } diff --git a/tools/cell_census_builder/tests/conftest.py b/tools/cell_census_builder/tests/conftest.py index 663949807..2f1c525e3 100644 --- a/tools/cell_census_builder/tests/conftest.py +++ b/tools/cell_census_builder/tests/conftest.py @@ -1,4 +1,3 @@ -import io import pathlib from typing import List, Optional @@ -131,7 +130,7 @@ def datasets(census_build_args: CensusBuildArgs) -> List[Dataset]: @pytest.fixture -def manifest_csv(tmp_path: pathlib.Path) -> io.TextIOWrapper: +def manifest_csv(tmp_path: pathlib.Path) -> str: manifest_content = f""" dataset_id_1, {tmp_path}/data/h5ads/dataset_id_1.h5ad dataset_id_2, {tmp_path}/data/h5ads/dataset_id_2.h5ad @@ -144,11 +143,11 @@ def manifest_csv(tmp_path: pathlib.Path) -> io.TextIOWrapper: with open(path, "w+") as f: f.writelines(manifest_content.strip()) - return open(path) + return path @pytest.fixture -def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> io.TextIOWrapper: +def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> str: manifest_content = f""" dataset_id_1, {tmp_path}/data/h5ads/dataset_id_1.h5ad dataset_id_2, {tmp_path}/data/h5ads/dataset_id_2.h5ad @@ -162,7 +161,7 @@ def manifest_csv_with_duplicates(tmp_path: pathlib.Path) -> io.TextIOWrapper: with open(path, "w+") as f: f.writelines(manifest_content.strip()) - return open(path) + return path @pytest.fixture() diff --git a/tools/cell_census_builder/tests/test_builder.py 
b/tools/cell_census_builder/tests/test_builder.py index 61e54ae1b..968f80032 100644 --- a/tools/cell_census_builder/tests/test_builder.py +++ b/tools/cell_census_builder/tests/test_builder.py @@ -129,7 +129,7 @@ def test_unicode_support(tmp_path: pathlib.Path) -> None: @pytest.mark.parametrize( "census_build_args", - [dict(manifest=True, test_first_n=None, verbose=2, build_tag="build_tag", multi_process=True)], + [dict(manifest=True, test_first_n=None, verbose=2, build_tag="build_tag", multi_process=True, max_workers=2)], indirect=True, ) def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, census_build_args: CensusBuildArgs) -> None: diff --git a/tools/cell_census_builder/tests/test_manifest.py b/tools/cell_census_builder/tests/test_manifest.py index 89f6077dc..cbf8773c5 100644 --- a/tools/cell_census_builder/tests/test_manifest.py +++ b/tools/cell_census_builder/tests/test_manifest.py @@ -1,4 +1,3 @@ -import io import pathlib import re from unittest.mock import patch @@ -6,7 +5,7 @@ from cell_census_builder.build_soma.manifest import CXG_BASE_URI, load_manifest -def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: io.TextIOWrapper) -> None: +def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: str) -> None: """ If specified a parameter, `load_manifest` should load the dataset manifest from such file. 
""" @@ -17,14 +16,26 @@ def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: io.TextIO assert manifest[0].corpora_asset_h5ad_uri == f"{tmp_path}/data/h5ads/dataset_id_1.h5ad" assert manifest[1].corpora_asset_h5ad_uri == f"{tmp_path}/data/h5ads/dataset_id_2.h5ad" + with open(manifest_csv) as fp: + manifest = load_manifest(fp) + assert len(manifest) == 2 + assert manifest[0].dataset_id == "dataset_id_1" + assert manifest[1].dataset_id == "dataset_id_2" + assert manifest[0].corpora_asset_h5ad_uri == f"{tmp_path}/data/h5ads/dataset_id_1.h5ad" + assert manifest[1].corpora_asset_h5ad_uri == f"{tmp_path}/data/h5ads/dataset_id_2.h5ad" -def test_load_manifest_does_dedup(manifest_csv_with_duplicates: io.TextIOWrapper) -> None: + +def test_load_manifest_does_dedup(manifest_csv_with_duplicates: str) -> None: """ `load_manifest` should not include duplicate datasets from the manifest """ manifest = load_manifest(manifest_csv_with_duplicates) assert len(manifest) == 2 + with open(manifest_csv_with_duplicates) as fp: + manifest = load_manifest(fp) + assert len(manifest) == 2 + def test_load_manifest_from_cxg() -> None: """ From fe93cb3fd3b29057c7070981046e9792db79931d Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 17:55:07 +0000 Subject: [PATCH 20/34] refine builder build process --- tools/cell_census_builder/Dockerfile | 3 ++- tools/cell_census_builder/Makefile | 6 +++--- tools/cell_census_builder/README.md | 24 +++++++++++++++++------- tools/cell_census_builder/entrypoint.sh | 4 ++++ tools/cell_census_builder/pyproject.toml | 8 ++++---- 5 files changed, 30 insertions(+), 15 deletions(-) create mode 100644 tools/cell_census_builder/entrypoint.sh diff --git a/tools/cell_census_builder/Dockerfile b/tools/cell_census_builder/Dockerfile index 67a7a3d52..0b09650f5 100644 --- a/tools/cell_census_builder/Dockerfile +++ b/tools/cell_census_builder/Dockerfile @@ -7,6 +7,7 @@ ENV COMMIT_SHA=${COMMIT_SHA} RUN apt update && apt -y full-upgrade && apt -y 
install python3.10-venv python3-pip awscli +ADD entrypoint.sh / ADD dist/ /tools/cell_census_builder RUN python3 -m pip install -U pip @@ -14,4 +15,4 @@ RUN python3 -m pip -v install /tools/cell_census_builder/*.whl WORKDIR /census-build -ENTRYPOINT ["python3", "-m", "cell_census_builder", "."] +ENTRYPOINT ["/bin/bash", "/entrypoint.sh"] diff --git a/tools/cell_census_builder/Makefile b/tools/cell_census_builder/Makefile index c82691a75..6a90caedc 100644 --- a/tools/cell_census_builder/Makefile +++ b/tools/cell_census_builder/Makefile @@ -1,7 +1,7 @@ -# Build docker container +# Build docker image. This Makefile is for developer convenience. -.PHONY: container -container: +.PHONY: image +image: python3 -m build . docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 40b2ce6f1..d38016a22 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -10,7 +10,7 @@ build package. Please see the top-level [README](../../README.md) for more information on the Cell Census and using the Cell Census data. -# Overview +## Overview This package contains sub-modules, each of which automate elements of the Cell Census build and release process. They are wrapped at the package top-level by a `__main__` which implements the Cell Census build process, @@ -47,7 +47,7 @@ working_dir: +-- census-diff-VERSION.txt ``` -# Building and using the Docker container +## Building and using the Docker container The standard Census build is expected to be done via a Docker container. 
@@ -66,9 +66,19 @@ $ chmod ug+s /tmp/census-build # optional, but makes permissions handling simp $ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder ``` -# Module-specific notes +### Commands to cleanup local Docker state on your ec2 instance (while building an image) -## `host_validation` module +Docker keeps around intermediate layers/images and if your machine doesn't have enough memory, you might run into issues. You can blow away these cached layers/images by running the following commands. + +``` +docker system prune +docker rm -f $(docker ps -aq) +docker rmi -f $(docker images -q) +``` + +## Module-specific notes + +### `host_validation` module Module which provides a set of checks that the current host machine has the requisite capabilities to build the census (e.g., free disk space). Raises exception (non-zero process exit) if host is @@ -76,7 +86,7 @@ unable to meet base requirements. Stand-alone usage: `python -m cell_census_builder.host_validation` -## `build_soma` module +### `build_soma` module Stand-alone use: `python -m cell_census_builder.build_soma ...` @@ -100,7 +110,7 @@ Modes of operation: a) (default) creating the entire "cell census" using all files currently in the CELLxGENE repository. b) creating a smaller "cell census" from a user-provided list of files (a "manifest") -### Mode (a) - creating the full cell census from the entire CELLxGENE (public) corpus: +#### Mode (a) - creating the full cell census from the entire CELLxGENE (public) corpus: - On a large-memory machine with _ample_ free (local) disk (eg, 3/4 TB or more) and swap (1 TB or more) - To create a cell census at ``, execute: @@ -112,7 +122,7 @@ b) creating a smaller "cell census" from a user-provided list of files (a "manif If you run out of memory, reduce `--max-workers`. You can also try a higher number if you have lots of CPU & memory. 
-### Mode (b) - creating a cell census from a user-provided list of H5AD files: +#### Mode (b) - creating a cell census from a user-provided list of H5AD files: - Create a manifest file, in CSV format, containing two columns: dataset_id, h5ad_uri. Example: ```csv diff --git a/tools/cell_census_builder/entrypoint.sh b/tools/cell_census_builder/entrypoint.sh new file mode 100644 index 000000000..cf3f21f10 --- /dev/null +++ b/tools/cell_census_builder/entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +python3 -m cell_census_builder . --help + diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml index 3d10e4b6d..d5a1f4faa 100644 --- a/tools/cell_census_builder/pyproject.toml +++ b/tools/cell_census_builder/pyproject.toml @@ -46,10 +46,10 @@ dependencies= [ "pyyaml", ] -# [tool.setuptools.packages.find] -# where = ["src"] -# include = ["cell_census_builder*"] # package names should match these glob patterns (["*"] by default) -# exclude = ["tests*", "scripts*"] # exclude packages matching these glob patterns (empty by default) +[tool.setuptools.packages.find] +where = ["src"] +include = ["cell_census_builder*"] # package names should match these glob patterns (["*"] by default) +exclude = ["tests*"] # exclude packages matching these glob patterns (empty by default) [tool.setuptools_scm] root = "../.." 
From 5da72eca913167d9c1e066394b60451cbc1c42b8 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 17:55:34 +0000 Subject: [PATCH 21/34] add GHA for docker image build --- .github/workflows/py-build.yml | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/.github/workflows/py-build.yml b/.github/workflows/py-build.yml index 6c1790c03..e1268660f 100644 --- a/.github/workflows/py-build.yml +++ b/.github/workflows/py-build.yml @@ -2,8 +2,9 @@ name: Python cell_census build on: pull_request: - paths-ignore: - - "api/r/**" + paths: + - "api/python/**" + - "tools/cell_census_builder/**" push: branches: [main] workflow_dispatch: @@ -34,3 +35,28 @@ jobs: uses: actions/upload-artifact@v3 with: path: api/python/cell_census/dist/* + + build_docker_container: + name: Build Docker image for Census Builder + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install deps + run: | + python -m pip install -U pip setuptools build + + - name: Build package + run: python -m build + working-directory: tools/cell_census_builder/ + + - name: Build image + run: docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . 
+ working-directory: tools/cell_census_builder/ From accb53b51708f5c24d346eb6d732f9f451a92e0d Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 18:14:36 +0000 Subject: [PATCH 22/34] update readme --- tools/cell_census_builder/README.md | 10 ++++- tools/census-builder-workflow/Dockerfile | 17 ------- tools/census-builder-workflow/README.md | 45 ------------------- .../census-builder-workflow/build-census.yaml | 11 ----- tools/census-builder-workflow/entrypoint.py | 3 -- 5 files changed, 9 insertions(+), 77 deletions(-) delete mode 100644 tools/census-builder-workflow/Dockerfile delete mode 100644 tools/census-builder-workflow/README.md delete mode 100644 tools/census-builder-workflow/build-census.yaml delete mode 100755 tools/census-builder-workflow/entrypoint.py diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index d38016a22..4a73fef1e 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -3,7 +3,7 @@ This package contains code to build and release the Cell Census in the SOMA format, as specified in the [data schema](https://github.com/chanzuckerberg/cell-census/blob/main/docs/cell_census_schema.md). -This tool is not intended for end-users - it is used by CZI to periodically create and release all +This tool is not intended for end-users - it is used by the CELLxGENE team to periodically create and release all CELLxGENE data in the above format. The remainder of this document is intended for users of the build package. @@ -66,6 +66,14 @@ $ chmod ug+s /tmp/census-build # optional, but makes permissions handling simp $ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder ``` +### Build configuration options + +To be documented. They are all present in the `build_state.py` file. + +### Building the docker image + +The image is built by a GHA workflow. 
For developer builds, there is a target present in `tools/cell_census_builder/Makefile`. + ### Commands to cleanup local Docker state on your ec2 instance (while building an image) Docker keeps around intermediate layers/images and if your machine doesn't have enough memory, you might run into issues. You can blow away these cached layers/images by running the following commands. diff --git a/tools/census-builder-workflow/Dockerfile b/tools/census-builder-workflow/Dockerfile deleted file mode 100644 index 899445ff3..000000000 --- a/tools/census-builder-workflow/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM ubuntu:22.04 - -ENV DEBIAN_FRONTEND=noninteractive - -ARG COMMIT_SHA -ENV COMMIT_SHA=${COMMIT_SHA} - -RUN apt update && apt -y install python3.10-venv python3-pip awscli - -ADD cell_census_builder/ /tools/cell_census_builder -ADD scripts/requirements.txt . -ADD entrypoint.py . -ADD build-census.yaml . - -RUN python3 -m pip install -r requirements.txt - -ENTRYPOINT ["./entrypoint.py"] diff --git a/tools/census-builder-workflow/README.md b/tools/census-builder-workflow/README.md deleted file mode 100644 index 37512c232..000000000 --- a/tools/census-builder-workflow/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Cell Census Builder Workflow - -This subproject can be used to run a cell-census build using a Docker container and a custom workflow file. - -## Instructions - -### Build - -To build the docker container, `cd` into the parent folder (`tools/`) and run: - -```docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) . -t census-builder``` - -This will build a Docker container named `census-builder`. - -### Prepare - -Before running the workflow, make sure that a `data` directory exists on the machine. This can contain any inputs for the builder (e.g. a manifest file and local `h5ad`s), and will also be used to output the built cell census. This folder will also need to contain a `build-census.yaml` file as defined in the next step. 
- - -### Create workflow file - -In the `data` folder, create a `build-census.yaml` file that contain a workflow that will be executed by the builder. This should also contain all the parameters for the workflow. - -Here is an example workflow that runs the builder using a manifest file: - -``` -census-builder: - uri: - /data/cell-census-small/ - verbose: - true - commands: - build: - manifest: - /data/manifest-small.csv - test-disable-dirty-git-check: - true -``` - - -### Run - -Run the builder workflow with: - -```docker run --mount type=bind,source="path/to/data",target=/data census-builder``` \ No newline at end of file diff --git a/tools/census-builder-workflow/build-census.yaml b/tools/census-builder-workflow/build-census.yaml deleted file mode 100644 index 17678f15e..000000000 --- a/tools/census-builder-workflow/build-census.yaml +++ /dev/null @@ -1,11 +0,0 @@ -census-builder: - uri: - /data/cell-census-small/ - verbose: - true - commands: - build: - manifest: - /data/manifest-small.csv - test-disable-dirty-git-check: - true diff --git a/tools/census-builder-workflow/entrypoint.py b/tools/census-builder-workflow/entrypoint.py deleted file mode 100755 index 4e5e2cc8a..000000000 --- a/tools/census-builder-workflow/entrypoint.py +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/python3 - -print("Calling the builder...") From 40b1c9052ee25da79fbf173d1527fa57e8ef69f6 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 18:20:35 +0000 Subject: [PATCH 23/34] fix entry point --- tools/cell_census_builder/entrypoint.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/cell_census_builder/entrypoint.sh b/tools/cell_census_builder/entrypoint.sh index cf3f21f10..31e2437bc 100644 --- a/tools/cell_census_builder/entrypoint.sh +++ b/tools/cell_census_builder/entrypoint.sh @@ -1,4 +1,3 @@ #!/bin/bash -python3 -m cell_census_builder . --help - +python3 -m cell_census_builder . 
From 86993eed2a1b8531f9b0f897f2dfb4c06fb75d7a Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 18:38:05 +0000 Subject: [PATCH 24/34] more readme edits --- tools/cell_census_builder/README.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 4a73fef1e..98771b371 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -49,13 +49,11 @@ working_dir: ## Building and using the Docker container -The standard Census build is expected to be done via a Docker container. - -To build the container, do a `git pull` to the version you want to use, and do the following to create a container called `cell-census-builder`: +The standard Census build is expected to be done via a Docker container. To build the required image, do a `git pull` to the version you want to use, and do the following to create a docker image called `cell-census-builder`: ``` $ cd tools/cell_census_builder -$ make container +$ make image ``` To use the container to build the _full_ census, with default options, pick a working directory (e.g., /tmp/census-build), and: @@ -68,11 +66,7 @@ $ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-b ### Build configuration options -To be documented. They are all present in the `build_state.py` file. - -### Building the docker image - -The image is built by a GHA workflow. For developer builds, there is a target present in `tools/cell_census_builder/Makefile`. +To be documented. Defaults are defined in the `build_state.py` file, and can be passed to the build process by creating a `config.yaml` in the build working directory. 
### Commands to cleanup local Docker state on your ec2 instance (while building an image) From 2c8e2b083dc2c50b6717afce35f0437902d3dd68 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 19:24:03 +0000 Subject: [PATCH 25/34] fix owlready2 installation in docker image --- tools/cell_census_builder/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/Dockerfile b/tools/cell_census_builder/Dockerfile index 0b09650f5..5073ba55a 100644 --- a/tools/cell_census_builder/Dockerfile +++ b/tools/cell_census_builder/Dockerfile @@ -10,8 +10,8 @@ RUN apt update && apt -y full-upgrade && apt -y install python3.10-venv python3- ADD entrypoint.sh / ADD dist/ /tools/cell_census_builder -RUN python3 -m pip install -U pip -RUN python3 -m pip -v install /tools/cell_census_builder/*.whl +RUN python3 -m pip install -U pip Cython wheel build +RUN python3 -m pip install /tools/cell_census_builder/*.whl WORKDIR /census-build From f103b5cd10155b380ae8485699d522de627fb539 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 20:45:19 +0000 Subject: [PATCH 26/34] PR feedback --- tools/cell_census_builder/README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 98771b371..236060fa7 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -19,7 +19,7 @@ with standard defaults. The top-level build can be invoked as follows: - Create a working directory, e.g., `census-build` or equivalent. -- If any configuration defaults need to be overridden, create a `config.yaml` in the working directory containing the default overrides. +- If any configuration defaults need to be overridden, create a `config.yaml` in the working directory containing the default overrides. 
_NOTE:_ by default you do not need to create a `config.yaml` file -- the defaults are appropriate to build the full Census. - Run the build as `python -m cell_census_builder your-working_dir` This will perform four steps (more will be added in the future): @@ -66,7 +66,7 @@ ### Build configuration options -To be documented. Defaults are defined in the `build_state.py` file, and can be passed to the build process by creating a `config.yaml` in the build working directory. +This is primarily for the use of package developers. The defaults are suitable for the standard Census build, and are defined in the `build_state.py` file. + +If you need to override a default, create `config.yaml` in the build working directory and specify the overrides. An example `config.yaml` might look like: + +``` +verbose: 2 # debug level logging +consolidate: false # disable TileDB consolidation +``` ### Commands to cleanup local Docker state on your ec2 instance (while building an image) From 756d6f7b72321ffae747c2948e98687c8292924c Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 20:52:38 +0000 Subject: [PATCH 27/34] PR feedback --- .../src/cell_census_builder/build_soma/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py index 1c7c4d266..d3768d675 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/manifest.py @@ -68,7 +68,7 @@ def load_manifest_from_CxG() -> List[Dataset]: logging.info(f"Found {len(datasets)} datasets, in {len(collections)} collections") # load per-dataset schema version - with concurrent.futures.ThreadPoolExecutor(max_workers=32) as tp: + with concurrent.futures.ThreadPoolExecutor() as tp: dataset_metadata 
= tp.map( lambda d: fetch_json( f"{CXG_BASE_URI}curation/v1/collections/{d['collection_id']}/datasets/{d['dataset_id']}" From 151036992792db2e640a7b0aa2b227e6acae54dc Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Mar 2023 21:14:59 +0000 Subject: [PATCH 28/34] fix email address in metadata --- tools/cell_census_builder/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/pyproject.toml b/tools/cell_census_builder/pyproject.toml index d5a1f4faa..869e89371 100644 --- a/tools/cell_census_builder/pyproject.toml +++ b/tools/cell_census_builder/pyproject.toml @@ -7,7 +7,7 @@ name = "cell_census_builder" dynamic = ["version"] description = "Build Cell Census" authors = [ - { name = "Chan Zuckerberg Initiative", email = "cellxgene@chanzuckerberg.com" } + { name = "Chan Zuckerberg Initiative", email = "soma@chanzuckerberg.com" } ] license = { text = "MIT" } readme = "README.md" From 0566b17386418f4c9cde6dfc64fbc64141183f72 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 01:14:58 +0000 Subject: [PATCH 29/34] add file size integrity check on downloads --- .../src/cell_census_builder/build_soma/source_assets.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py index 15810c2c5..da893d9d4 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/source_assets.py @@ -46,6 +46,11 @@ def _copy_file(n: int, dataset: Dataset, asset_dir: str, N: int) -> str: logging.info(f"Staging {dataset.dataset_id} ({n} of {N}) to {dataset_path}") fs.get_file(dataset.corpora_asset_h5ad_uri, dataset_path) + + # verify file size is as expected, if we know the size a priori + assert (dataset.asset_h5ad_filesize == -1) or (dataset.asset_h5ad_filesize == 
os.path.getsize(dataset_path)) + # TODO: add integrity checksum as well. Waiting on feature request chanzuckerberg/single-cell-data-portal#4392 + logging.info(f"Staging {dataset.dataset_id} ({n} of {N}) complete") return dataset_file_name From 84acafba5c5fcf0c01f3a7f7bf570b3f40475d97 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 02:33:05 +0000 Subject: [PATCH 30/34] add missing broken process pool logger --- .../src/cell_census_builder/build_soma/validate_soma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py index a322d8993..0c315673a 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_soma/validate_soma.py @@ -213,6 +213,7 @@ def validate_axis_dataframes( for dataset in datasets ] for n, future in enumerate(concurrent.futures.as_completed(futures), start=1): + log_on_broken_process_pool(ppe) res = future.result() for eb_name, ebi in res.items(): eb_info[eb_name].update(ebi) From 4f50e11b61d3c9671dc75f6e33a27677d524658f Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 03:05:40 +0000 Subject: [PATCH 31/34] tweak developer Makefile for builder --- tools/cell_census_builder/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/Makefile b/tools/cell_census_builder/Makefile index 6a90caedc..c5f2dd697 100644 --- a/tools/cell_census_builder/Makefile +++ b/tools/cell_census_builder/Makefile @@ -1,7 +1,7 @@ # Build docker image. This Makefile is for developer convenience. .PHONY: image -image: +image: clean python3 -m build . docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . 
From e4f4c5704a813274afc65b6ec1a492e7d6f72a8a Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 03:31:32 +0000 Subject: [PATCH 32/34] clean up comments --- .../src/cell_census_builder/build_state.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/cell_census_builder/src/cell_census_builder/build_state.py b/tools/cell_census_builder/src/cell_census_builder/build_state.py index 54e892191..6a05c16b8 100644 --- a/tools/cell_census_builder/src/cell_census_builder/build_state.py +++ b/tools/cell_census_builder/src/cell_census_builder/build_state.py @@ -36,10 +36,6 @@ "multi_process": True, "max_workers": 2 + int(psutil.virtual_memory().total / (96 * 1024**3)), # - # XXX TODO: this exposes a bug in the validation pass - # "multi_process": False, - # "max_workers": None, - # # Host minimum resource validation "host_validation_disable": False, # if True, host validation checks will be skipped "host_validation_min_physical_memory": 512 * 1024**3, # 512GiB From 2eb588ecd7537e3c235c4a3981801f6097f29692 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 16:12:52 +0000 Subject: [PATCH 33/34] PR feedback --- tools/cell_census_builder/Makefile | 16 +++++++++++- tools/cell_census_builder/README.md | 26 +++++++++++++------ .../src/cell_census_builder/__init__.py | 7 +---- tools/scripts/requirements-dev.txt | 2 ++ 4 files changed, 36 insertions(+), 15 deletions(-) diff --git a/tools/cell_census_builder/Makefile b/tools/cell_census_builder/Makefile index c5f2dd697..abf3f1b06 100644 --- a/tools/cell_census_builder/Makefile +++ b/tools/cell_census_builder/Makefile @@ -1,10 +1,24 @@ -# Build docker image. This Makefile is for developer convenience. +# Build docker image. This Makefile is for convenience in development, +# and as a means to manually build in advance of pushing the image to +# a registry. +# +# COMING SOON: Docker builds for routine use are created by a GHA, and +# will be available in a Docker repository. 
+# Create the image .PHONY: image image: clean python3 -m build . docker build --build-arg=COMMIT_SHA=$(git rev-parse --short HEAD) -t cell-census-builder . +# Clean Python build .PHONY: clean clean: rm -rf build dist + +# Prune docker cache +.PHONY: prune +prune: + docker system prune -f + if [ "$(docker ps -aq)" ]; then docker rm -f $(docker ps -aq) ; fi + if [ "$(docker images -q)" ]; then docker rmi -f $(docker images -q) ; fi diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index 236060fa7..fb3c1f8b6 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -49,19 +49,29 @@ working_dir: ## Building and using the Docker container +### Prerequisites + +You will need: + +- Linux - known to work on Ubuntu 20 and 22, and should work fine on most other (modern) Linux distros +- Docker - [primary installation instructions](https://docs.docker.com/engine/install/ubuntu/#installation-methods) and [important post-install configuration](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user) +- Python 3.9+ + +### Build & run + The standard Census build is expected to be done via a Docker container. 
To build the required image, do a `git pull` to the version you want to use, and do the following to create a docker image called `cell-census-builder`: -``` -$ cd tools/cell_census_builder -$ make image +```shell +cd tools/cell_census_builder +make image ``` To use the container to build the _full_ census, with default options, pick a working directory (e.g., /tmp/census-build), and: -``` -$ mkdir /tmp/census-build -$ chmod ug+s /tmp/census-build # optional, but makes permissions handling simpler -$ docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder +```shell +mkdir /tmp/census-build +chmod ug+s /tmp/census-build # optional, but makes permissions handling simpler +docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder ``` ### Build configuration options @@ -79,7 +89,7 @@ consolidate: false # disable TileDB consolidation Docker keeps around intermediate layers/images and if your machine doesn't have enough memory, you might run into issues. You can blow away these cached layers/images by running the following commands. 
-``` +```shell docker system prune docker rm -f $(docker ps -aq) docker rmi -f $(docker images -q) diff --git a/tools/cell_census_builder/src/cell_census_builder/__init__.py b/tools/cell_census_builder/src/cell_census_builder/__init__.py index 4cd7c916d..584b56c05 100644 --- a/tools/cell_census_builder/src/cell_census_builder/__init__.py +++ b/tools/cell_census_builder/src/cell_census_builder/__init__.py @@ -1,9 +1,4 @@ -try: - from importlib import metadata -except ImportError: - # for python <=3.7 - import importlib_metadata as metadata # type: ignore[no-redef] - +from importlib import metadata try: __version__ = metadata.version("cell_census_builder") diff --git a/tools/scripts/requirements-dev.txt b/tools/scripts/requirements-dev.txt index ddb07b83d..ec6b31fa1 100644 --- a/tools/scripts/requirements-dev.txt +++ b/tools/scripts/requirements-dev.txt @@ -1,3 +1,5 @@ pytest coverage requests-mock +setuptools +build From 509ffd20b1a5c6243269dee8656770c361b4d54f Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 23 Mar 2023 16:20:00 +0000 Subject: [PATCH 34/34] fix typo --- tools/cell_census_builder/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cell_census_builder/README.md b/tools/cell_census_builder/README.md index fb3c1f8b6..bdf8819b3 100644 --- a/tools/cell_census_builder/README.md +++ b/tools/cell_census_builder/README.md @@ -71,7 +71,7 @@ To use the container to build the _full_ census, with default options, pick a wo ```shell mkdir /tmp/census-build chmod ug+s /tmp/census-build # optional, but makes permissions handling simpler -docker run --mount type=bind,source="`pwd`/tmp/census-build",target='/census-build' cell-census-builder +docker run --mount type=bind,source="/tmp/census-build",target='/census-build' cell-census-builder ``` ### Build configuration options