From ea9b39d7653ebb35c01c2d7860600c91d07c37aa Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 6 Jun 2024 18:12:41 +0000 Subject: [PATCH 01/15] Update min python version, allow python 3.12 --- .github/workflows/py-dependency-check.yml | 4 +- .github/workflows/py-unittests.yml | 4 +- api/python/cellxgene_census/pyproject.toml | 11 +++-- .../tests/experimental/ml/test_pytorch.py | 40 ++++++++----------- api/python/notebooks/README.md | 2 +- docs/cellxgene_census_docsite_installation.md | 2 +- 6 files changed, 28 insertions(+), 35 deletions(-) diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml index daa85c09a..d992f36e1 100644 --- a/.github/workflows/py-dependency-check.yml +++ b/.github/workflows/py-dependency-check.yml @@ -22,10 +22,10 @@ jobs: fail-fast: false # don't fail-fast, as errors are often specific to a single cell in the matrix matrix: os: [single-cell-8c64g-runner, macos-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] exclude: - os: macos-latest - python-version: "3.8" + python-version: "3.10" runs-on: ${{matrix.os}} diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index 115083909..a11c6d24c 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -18,10 +18,10 @@ jobs: fail-fast: false # Don't stop the workflow if one of the jobs fails matrix: os: [single-cell-8c64g-runner, macos-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] exclude: - os: macos-latest - python-version: "3.8" + python-version: "3.10" runs-on: ${{matrix.os}} diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index 3fa7a9c92..8c9fa9026 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] license = { text = "MIT" } readme = "README.md" -requires-python = ">= 3.8, < 3.12" +requires-python = ">= 3.10, < 3.13" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -22,10 +22,9 @@ classifiers = [ "Topic :: Scientific/Engineering :: Bio-Informatics", "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies= [ # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to @@ -33,7 +32,7 @@ dependencies= [ # Make sure this version does not fall behind the builder's tiledbsoma version. "tiledbsoma==1.11.4", "anndata", - "numpy>=1.21,<2.0", + "numpy>=1.23,<2.0", "requests", "typing_extensions", "s3fs>=2021.06.1", @@ -43,7 +42,7 @@ dependencies= [ experimental = [ "torch~=2.2.0", "torchdata~=0.7", - "scikit-learn~=1.0", + "scikit-learn>=1.2", "scikit-misc>=0.2", # scikit-misc 0.3 dropped Python 3.8 support "psutil~=5.0", "datasets~=2.0", @@ -78,7 +77,7 @@ root = "../../.." [tool.ruff] line-length = 120 src = ["api/python/cellxgene_census/src"] -target-version = "py38" +target-version = "py310" [tool.ruff.lint] select = [ diff --git a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py index b3ac43fc4..396973a9e 100644 --- a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py +++ b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py @@ -1,6 +1,5 @@ import pathlib -import sys -from typing import Callable, List, Optional, Sequence, Union +from collections.abc import Callable, Sequence from unittest.mock import patch import numpy as np @@ -49,17 +48,17 @@ def pytorch_seq_x_value_gen(obs_range: range, var_range: range) -> spmatrix: @pytest.fixture -def X_layer_names() -> List[str]: +def X_layer_names() -> list[str]: return ["raw"] @pytest.fixture -def obsp_layer_names() -> Optional[List[str]]: +def obsp_layer_names() -> list[str] | None: return None @pytest.fixture -def varp_layer_names() -> Optional[List[str]]: +def varp_layer_names() -> list[str] | None: return None @@ -99,8 +98,8 @@ def add_sparse_array( @pytest.fixture(scope="function") def soma_experiment( tmp_path: pathlib.Path, - obs_range: Union[int, range], - var_range: Union[int, range], + obs_range: int | range, + var_range: int | range, X_value_gen: Callable[[range, range], sparse.spmatrix], obsp_layer_names: Sequence[str], varp_layer_names: Sequence[str], @@ -363,10 +362,6 @@ def test_encoders(soma_experiment: Experiment) -> None: @pytest.mark.experimental -@pytest.mark.skipif( - (sys.version_info.major, sys.version_info.minor) == (3, 9), - reason="fails intermittently with OOM error for 3.9", -) # noinspection PyTestParametrized @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(6, 3, pytorch_x_value_gen)]) def test_multiprocessing__returns_full_result(soma_experiment: Experiment) -> None: @@ -398,11 +393,11 @@ def test_distributed__returns_data_partition_for_rank( """Tests pytorch._partition_obs_joinids() behavior in a simulated PyTorch distributed processing mode, using mocks to avoid having to do real PyTorch distributed setup.""" - with patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, patch( - "cellxgene_census.experimental.ml.pytorch.dist.get_rank" - ) as mock_dist_get_rank, patch( - "cellxgene_census.experimental.ml.pytorch.dist.get_world_size" - ) as mock_dist_get_world_size: + with ( + patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, + patch("cellxgene_census.experimental.ml.pytorch.dist.get_rank") as mock_dist_get_rank, + patch("cellxgene_census.experimental.ml.pytorch.dist.get_world_size") as mock_dist_get_world_size, + ): mock_dist_is_initialized.return_value = True mock_dist_get_rank.return_value = 1 mock_dist_get_world_size.return_value = 3 @@ -433,13 +428,12 @@ def test_distributed_and_multiprocessing__returns_data_partition_for_rank( DataLoader multiprocessing mode, using mocks to avoid having to do distributed pytorch setup or real DataLoader multiprocessing.""" - with patch("torch.utils.data.get_worker_info") as mock_get_worker_info, patch( - "cellxgene_census.experimental.ml.pytorch.dist.is_initialized" - ) as mock_dist_is_initialized, patch( - "cellxgene_census.experimental.ml.pytorch.dist.get_rank" - ) as mock_dist_get_rank, patch( - "cellxgene_census.experimental.ml.pytorch.dist.get_world_size" - ) as mock_dist_get_world_size: + with ( + patch("torch.utils.data.get_worker_info") as mock_get_worker_info, + patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, + patch("cellxgene_census.experimental.ml.pytorch.dist.get_rank") as mock_dist_get_rank, + patch("cellxgene_census.experimental.ml.pytorch.dist.get_world_size") as mock_dist_get_world_size, + ): mock_get_worker_info.return_value = WorkerInfo(id=1, num_workers=2, seed=1234) mock_dist_is_initialized.return_value = True mock_dist_get_rank.return_value = 1 diff --git a/api/python/notebooks/README.md b/api/python/notebooks/README.md index 2b1c683ef..cdf89656d 100644 --- a/api/python/notebooks/README.md +++ b/api/python/notebooks/README.md @@ -9,7 +9,7 @@ Demonstration notebooks for the CZ CELLxGENE Discover Census. There are two kind You must be on a Linux or MacOS system, with the following installed: -* Python 3.8 to 3.11 +* Python 3.10 to 3.12 * Jupyter or some other means of running notebooks (e.g., vscode) For now, it is recommended that you do all this on a host with sufficient memory, diff --git a/docs/cellxgene_census_docsite_installation.md b/docs/cellxgene_census_docsite_installation.md index 4654eb37a..0cfbd969b 100644 --- a/docs/cellxgene_census_docsite_installation.md +++ b/docs/cellxgene_census_docsite_installation.md @@ -4,7 +4,7 @@ The Census API requires a Linux or MacOS system with: -- Python 3.8 to Python 3.11. Or R, supported versions TBD. +- Python 3.10 to Python 3.12. Or R, supported versions TBD. - Recommended: >16 GB of memory. - Recommended: >5 Mbps internet connection. - Recommended: for increased performance use the API through a AWS-EC2 instance from the region `us-west-2`. The Census data builds are hosted in a AWS-S3 bucket in that region. From aa014d2e585e937e118a731cf19960df33234250 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 6 Jun 2024 20:43:26 +0000 Subject: [PATCH 02/15] try forcing pyarrow 14 --- .github/workflows/py-unittests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index a11c6d24c..139a65dac 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -41,6 +41,7 @@ jobs: pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt pip install -e './api/python/cellxgene_census/[experimental]' + pip install "pyarrow~=14.0" # TODO: Remove this before merging! - name: Report Dependency Versions run: pip list - name: Test with pytest (API, main tests) From b596f68a57fb304723a774d369d33a7898fddd7b Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 6 Jun 2024 21:02:15 +0000 Subject: [PATCH 03/15] try forcing pyarrow 14, but better --- .github/workflows/py-unittests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index 139a65dac..15aa924aa 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -40,8 +40,7 @@ jobs: python -m pip install -U pip setuptools wheel pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt - pip install -e './api/python/cellxgene_census/[experimental]' - pip install "pyarrow~=14.0" # TODO: Remove this before merging! + pip install -e './api/python/cellxgene_census/[experimental]' "pyarrow~=14.0" # TODO: remove pyarrow specification - name: Report Dependency Versions run: pip list - name: Test with pytest (API, main tests) From 3e287ad8437eb49bed74f5a03ac2c8bf3f6d7af3 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 6 Jun 2024 21:18:59 +0000 Subject: [PATCH 04/15] nvm --- .github/workflows/py-unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index 15aa924aa..a11c6d24c 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -40,7 +40,7 @@ jobs: python -m pip install -U pip setuptools wheel pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt - pip install -e './api/python/cellxgene_census/[experimental]' "pyarrow~=14.0" # TODO: remove pyarrow specification + pip install -e './api/python/cellxgene_census/[experimental]' - name: Report Dependency Versions run: pip list - name: Test with pytest (API, main tests) From 2ea553a2632db5160c5ffa2fda1cec54aff6ebea Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 7 Jun 2024 18:55:41 +0000 Subject: [PATCH 05/15] Drop 3.12 --- .github/workflows/py-dependency-check.yml | 2 +- .github/workflows/py-unittests.yml | 2 +- api/python/cellxgene_census/pyproject.toml | 3 +-- api/python/notebooks/README.md | 2 +- docs/cellxgene_census_docsite_installation.md | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml index d992f36e1..e49a3496c 100644 --- a/.github/workflows/py-dependency-check.yml +++ b/.github/workflows/py-dependency-check.yml @@ -22,7 +22,7 @@ jobs: fail-fast: false # don't fail-fast, as errors are often specific to a single cell in the matrix matrix: os: [single-cell-8c64g-runner, macos-latest] - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11"] exclude: - os: macos-latest python-version: "3.10" diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index a11c6d24c..1a12accac 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false # Don't stop the workflow if one of the jobs fails matrix: os: [single-cell-8c64g-runner, macos-latest] - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11"] exclude: - os: macos-latest python-version: "3.10" diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index 8c9fa9026..8dba82461 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] license = { text = "MIT" } readme = "README.md" -requires-python = ">= 3.10, < 3.13" +requires-python = ">= 3.10, < 3.12" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -24,7 +24,6 @@ classifiers = [ "Operating System :: MacOS :: MacOS X", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", ] dependencies= [ # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to diff --git a/api/python/notebooks/README.md b/api/python/notebooks/README.md index cdf89656d..a53e477ab 100644 --- a/api/python/notebooks/README.md +++ b/api/python/notebooks/README.md @@ -9,7 +9,7 @@ Demonstration notebooks for the CZ CELLxGENE Discover Census. There are two kind You must be on a Linux or MacOS system, with the following installed: -* Python 3.10 to 3.12 +* Python 3.10 to 3.11 * Jupyter or some other means of running notebooks (e.g., vscode) For now, it is recommended that you do all this on a host with sufficient memory, diff --git a/docs/cellxgene_census_docsite_installation.md b/docs/cellxgene_census_docsite_installation.md index 0cfbd969b..41347e9a3 100644 --- a/docs/cellxgene_census_docsite_installation.md +++ b/docs/cellxgene_census_docsite_installation.md @@ -4,7 +4,7 @@ The Census API requires a Linux or MacOS system with: -- Python 3.10 to Python 3.12. Or R, supported versions TBD. +- Python 3.10 to Python 3.11. Or R, supported versions TBD. - Recommended: >16 GB of memory. - Recommended: >5 Mbps internet connection. - Recommended: for increased performance use the API through a AWS-EC2 instance from the region `us-west-2`. The Census data builds are hosted in a AWS-S3 bucket in that region. From b889636f8220ecda657ae95be28dcad930df1c21 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 2 Jul 2024 21:08:53 +0000 Subject: [PATCH 06/15] Ignore bugbear rule for zip --- api/python/cellxgene_census/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index 277f1282a..653a9f850 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -123,6 +123,8 @@ ignore = [ "D205", # Prefer absolute imports over relative imports from parent modules TODO: enable "TID252", + # It's okay to use zip without the strict kwarg. In fact, numba doesn't like it when you use it + "B905", ] [tool.ruff.lint.pydocstyle] From d3908058f97ce499188925dbcdc6b29dc9db236d Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 2 Jul 2024 21:12:49 +0000 Subject: [PATCH 07/15] Automatted formatting --- .../src/cellxgene_census/_get_anndata.py | 49 +++++++-------- .../src/cellxgene_census/_open.py | 18 +++--- .../cellxgene_census/_release_directory.py | 35 ++++++----- .../experimental/_embedding.py | 6 +- .../ml/huggingface/cell_dataset_builder.py | 13 ++-- .../ml/huggingface/geneformer_tokenizer.py | 11 ++-- .../experimental/ml/pytorch.py | 59 ++++++++++--------- .../experimental/pp/_highly_variable_genes.py | 4 +- .../experimental/pp/_online.py | 12 ++-- .../experimental/pp/_stats.py | 3 +- .../experimental/util/_eager_iter.py | 11 ++-- .../tests/experimental/pp/test_stats.py | 8 +-- .../cellxgene_census/tests/test_acceptance.py | 25 ++++---- .../tests/test_get_anndata.py | 16 ++--- .../cellxgene_census/tests/test_lts_compat.py | 3 +- 15 files changed, 139 insertions(+), 134 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index e37337184..9d7a5c41b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -7,7 +7,8 @@ Methods to retrieve slices of the census as AnnData objects. """ -from typing import Literal, Optional, Sequence +from collections.abc import Sequence +from typing import Literal from warnings import warn import anndata @@ -27,20 +28,20 @@ def get_anndata( organism: str, measurement_name: str = "RNA", X_name: str = "raw", - X_layers: Optional[Sequence[str]] = (), - obsm_layers: Optional[Sequence[str]] = (), - obsp_layers: Optional[Sequence[str]] = (), - varm_layers: Optional[Sequence[str]] = (), - varp_layers: Optional[Sequence[str]] = (), - obs_value_filter: Optional[str] = None, - obs_coords: Optional[SparseDFCoord] = None, - var_value_filter: Optional[str] = None, - var_coords: Optional[SparseDFCoord] = None, - column_names: Optional[soma.AxisColumnNames] = None, - obs_embeddings: Optional[Sequence[str]] = (), - var_embeddings: Optional[Sequence[str]] = (), - obs_column_names: Optional[Sequence[str]] = None, - var_column_names: Optional[Sequence[str]] = None, + X_layers: Sequence[str] | None = (), + obsm_layers: Sequence[str] | None = (), + obsp_layers: Sequence[str] | None = (), + varm_layers: Sequence[str] | None = (), + varp_layers: Sequence[str] | None = (), + obs_value_filter: str | None = None, + obs_coords: SparseDFCoord | None = None, + var_value_filter: str | None = None, + var_coords: SparseDFCoord | None = None, + column_names: soma.AxisColumnNames | None = None, + obs_embeddings: Sequence[str] | None = (), + var_embeddings: Sequence[str] | None = (), + obs_column_names: Sequence[str] | None = None, + var_column_names: Sequence[str] | None = None, ) -> anndata.AnnData: """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query, and return it as an :class:`anndata.AnnData` object. @@ -176,9 +177,9 @@ def _get_axis_metadata( axis: Literal["obs", "var"], organism: str, *, - value_filter: Optional[str] = None, - coords: Optional[SparseDFCoord] = slice(None), - column_names: Optional[Sequence[str]] = None, + value_filter: str | None = None, + coords: SparseDFCoord | None = slice(None), + column_names: Sequence[str] | None = None, ) -> pd.DataFrame: exp = _get_experiment(census, organism) coords = (slice(None),) if coords is None else (coords,) @@ -198,9 +199,9 @@ def get_obs( census: soma.Collection, organism: str, *, - value_filter: Optional[str] = None, - coords: Optional[SparseDFCoord] = slice(None), - column_names: Optional[Sequence[str]] = None, + value_filter: str | None = None, + coords: SparseDFCoord | None = slice(None), + column_names: Sequence[str] | None = None, ) -> pd.DataFrame: """Get the observation metadata for a query on the census. @@ -230,9 +231,9 @@ def get_var( census: soma.Collection, organism: str, *, - value_filter: Optional[str] = None, - coords: Optional[SparseDFCoord] = slice(None), - column_names: Optional[Sequence[str]] = None, + value_filter: str | None = None, + coords: SparseDFCoord | None = slice(None), + column_names: Sequence[str] | None = None, ) -> pd.DataFrame: """Get the variable metadata for a query on the census. diff --git a/api/python/cellxgene_census/src/cellxgene_census/_open.py b/api/python/cellxgene_census/src/cellxgene_census/_open.py index 640d2d9a6..eb49d96d5 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_open.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_open.py @@ -10,7 +10,7 @@ import logging import os.path import urllib.parse -from typing import Any, Dict, Optional, get_args +from typing import Any, get_args import s3fs import tiledbsoma as soma @@ -28,7 +28,7 @@ DEFAULT_CENSUS_VERSION = "stable" -DEFAULT_TILEDB_CONFIGURATION: Dict[str, Any] = { +DEFAULT_TILEDB_CONFIGURATION: dict[str, Any] = { # https://docs.tiledb.com/main/how-to/configuration#configuration-parameters "py.init_buffer_bytes": 1 * 1024**3, "soma.init_buffer_bytes": 1 * 1024**3, @@ -67,7 +67,7 @@ def _resolve_census_locator(locator: CensusLocator, mirror: CensusMirror) -> Res def _open_soma( locator: ResolvedCensusLocator, - context: Optional[soma.options.SOMATileDBContext] = None, + context: soma.options.SOMATileDBContext | None = None, ) -> soma.Collection: """Private. Merge config defaults and return open census as a soma Collection/context.""" # if no user-defined context, cellxgene_census defaults take precedence over SOMA defaults @@ -81,7 +81,7 @@ def _open_soma( return soma.open(locator["uri"], mode="r", soma_type=soma.Collection, context=context) -def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> soma.options.SOMATileDBContext: +def get_default_soma_context(tiledb_config: dict[str, Any] | None = None) -> soma.options.SOMATileDBContext: """Return a :class:`tiledbsoma.SOMATileDBContext` with sensible defaults that can be further customized by the user. The customized context can then be passed to :func:`cellxgene_census.open_soma` with the ``context`` argument or to :meth:`somacore.SOMAObject.open` with the ``context`` argument, such as @@ -126,11 +126,11 @@ def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> def open_soma( *, - census_version: Optional[str] = DEFAULT_CENSUS_VERSION, - mirror: Optional[str] = None, - uri: Optional[str] = None, - tiledb_config: Optional[Dict[str, Any]] = None, - context: Optional[soma.options.SOMATileDBContext] = None, + census_version: str | None = DEFAULT_CENSUS_VERSION, + mirror: str | None = None, + uri: str | None = None, + tiledb_config: dict[str, Any] | None = None, + context: soma.options.SOMATileDBContext | None = None, ) -> soma.Collection: """Open the Census by version or URI. diff --git a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py index a5da8f08b..c33c5fd7a 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py @@ -7,9 +7,8 @@ Methods to retrieve information about versions of the publicly hosted Census object. """ -import typing from collections import OrderedDict -from typing import Any, Dict, Literal, Optional, Union, cast +from typing import Any, Literal, cast import requests from typing_extensions import NotRequired, TypedDict @@ -35,7 +34,7 @@ class CensusLocator(TypedDict): uri: str relative_uri: str - s3_region: Optional[str] + s3_region: str | None class CensusVersionRetraction(TypedDict): @@ -53,13 +52,13 @@ class CensusVersionRetraction(TypedDict): """ date: str - reason: Optional[str] - info_url: Optional[str] - replaced_by: Optional[str] + reason: str | None + info_url: str | None + replaced_by: str | None ReleaseFlag = Literal["lts", "retracted"] -ReleaseFlags = Dict[ReleaseFlag, bool] +ReleaseFlags = dict[ReleaseFlag, bool] class CensusVersionDescription(TypedDict): @@ -80,7 +79,7 @@ class CensusVersionDescription(TypedDict): If retracted, details of the retraction. """ - release_date: Optional[str] + release_date: str | None release_build: str soma: CensusLocator h5ads: CensusLocator @@ -88,7 +87,7 @@ class CensusVersionDescription(TypedDict): retraction: NotRequired[CensusVersionRetraction] -CensusDirectory = Dict[CensusVersionName, Union[CensusVersionName, CensusVersionDescription]] +CensusDirectory = dict[CensusVersionName, CensusVersionName | CensusVersionDescription] """ A provider identifies a storage medium for the Census, which can either be a cloud provider or a local file. @@ -130,10 +129,10 @@ class CensusMirror(TypedDict): provider: Provider base_uri: str - region: Optional[str] + region: str | None -CensusMirrors = Dict[CensusMirrorName, Union[CensusMirrorName, CensusMirror]] +CensusMirrors = dict[CensusMirrorName, CensusMirrorName | CensusMirror] class ResolvedCensusLocator(TypedDict): @@ -152,7 +151,7 @@ class ResolvedCensusLocator(TypedDict): """ uri: str - region: Optional[str] + region: str | None provider: str @@ -197,8 +196,8 @@ def get_census_version_description(census_version: str) -> CensusVersionDescript def get_census_version_directory( - *, lts: Optional[bool] = None, retracted: Optional[bool] = False -) -> Dict[CensusVersionName, CensusVersionDescription]: + *, lts: bool | None = None, retracted: bool | None = False +) -> dict[CensusVersionName, CensusVersionDescription]: """Get the directory of Census versions currently available, optionally filtering by specified flags. If a filtering flag is not specified, Census versions will not be filtered by that flag. Defaults to including both "long-term stable" (LTS) and weekly Census versions, and excluding @@ -355,7 +354,7 @@ def get_census_version_directory( directory: dict[str, str | dict[str, Any]] = response.json() directory_out: CensusDirectory = {} - aliases: typing.Set[CensusVersionName] = set() + aliases: set[CensusVersionName] = set() # Resolve all aliases for easier use for census_version_name in list(directory.keys()): @@ -398,7 +397,7 @@ def get_census_version_directory( directory_out[census_version_name] = census_version_description.copy() # Cast is safe, as we have removed all aliases - unordered_directory = cast(Dict[CensusVersionName, CensusVersionDescription], directory_out) + unordered_directory = cast(dict[CensusVersionName, CensusVersionDescription], directory_out) # Sort by aliases and release date, descending aliased_releases = [(k, v) for k, v in unordered_directory.items() if k in aliases] @@ -414,7 +413,7 @@ def get_census_version_directory( return ordered_directory -def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]: +def get_census_mirror_directory() -> dict[CensusMirrorName, CensusMirror]: """Get the directory of Census mirrors currently available. Returns: @@ -426,7 +425,7 @@ def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]: """ mirrors = _get_census_mirrors() del mirrors["default"] - return cast(Dict[CensusMirrorName, CensusMirror], mirrors) + return cast(dict[CensusMirrorName, CensusMirror], mirrors) def _get_census_mirrors() -> CensusMirrors: diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index da40da331..afd4b2552 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -8,7 +8,7 @@ import json import warnings -from typing import Any, Dict, cast +from typing import Any, cast import numpy as np import numpy.typing as npt @@ -53,7 +53,7 @@ def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBC embedding_metadata = json.loads(E.metadata["CxG_embedding_info"]) assert isinstance(embedding_metadata, dict) - return cast(Dict[str, Any], embedding_metadata) + return cast(dict[str, Any], embedding_metadata) def _get_embedding( @@ -192,7 +192,7 @@ def get_embedding_metadata_by_name( response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) response.raise_for_status() - manifest = cast(Dict[str, Dict[str, Any]], response.json()) + manifest = cast(dict[str, dict[str, Any]], response.json()) embeddings = [] for _, obj in manifest.items(): if ( diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py index 6b274e8fd..910abc015 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py @@ -1,6 +1,7 @@ import uuid from abc import ABC, abstractmethod -from typing import Any, Dict, Generator, Optional +from collections.abc import Generator +from typing import Any import scipy.sparse from datasets import Dataset @@ -37,7 +38,7 @@ def __init__( measurement_name: str = "RNA", layer_name: str = "raw", *, - block_size: Optional[int] = None, + block_size: int | None = None, **kwargs: Any, ): """Initialize the CellDatasetBuilder to process the results of a Census @@ -55,13 +56,13 @@ def __init__( self.layer_name = layer_name self.block_size = block_size - def build(self, from_generator_kwargs: Optional[Dict[str, Any]] = None) -> Dataset: + def build(self, from_generator_kwargs: dict[str, Any] | None = None) -> Dataset: """Build the dataset from query results. - `from_generator_kwargs`: kwargs passed through to `Dataset.from_generator()` """ - def gen() -> Generator[Dict[str, Any], None, None]: + def gen() -> Generator[dict[str, Any], None, None]: for Xblock, (block_cell_joinids, _) in ( self.X(self.layer_name).blockwise(axis=0, reindex_disable_on_axis=[1], size=self.block_size).scipy() ): @@ -72,7 +73,7 @@ def gen() -> Generator[Dict[str, Any], None, None]: return Dataset.from_generator(_DatasetGeneratorPickleHack(gen), **(from_generator_kwargs or {})) @abstractmethod - def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> Dict[str, Any]: + def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> dict[str, Any]: """Abstract method to process the X row for one cell into a Dataset item. - `cell_joinid`: The cell `soma_joinid`. @@ -85,7 +86,7 @@ def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> Dict[str class _DatasetGeneratorPickleHack: """SEE: https://github.com/huggingface/datasets/issues/6194.""" - def __init__(self, generator: Any, generator_id: Optional[str] = None) -> None: + def __init__(self, generator: Any, generator_id: str | None = None) -> None: self.generator = generator self.generator_id = generator_id if generator_id is not None else str(uuid.uuid4()) diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py index 3c8310fe1..224565400 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py @@ -1,5 +1,6 @@ import pickle -from typing import Any, Dict, Optional, Sequence, Set +from collections.abc import Sequence +from typing import Any import numpy as np import numpy.typing as npt @@ -42,7 +43,7 @@ class GeneformerTokenizer(CellDatasetBuilder): - and the specified `obs_column_names` (cell metadata from the experiment obs dataframe) """ - obs_column_names: Set[str] + obs_column_names: set[str] max_input_tokens: int # set of gene soma_joinids corresponding to genes modeled by Geneformer: @@ -54,8 +55,8 @@ def __init__( self, experiment: tiledbsoma.Experiment, *, - obs_column_names: Optional[Sequence[str]] = None, - obs_attributes: Optional[Sequence[str]] = None, + obs_column_names: Sequence[str] | None = None, + obs_attributes: Sequence[str] | None = None, max_input_tokens: int = 2048, token_dictionary_file: str = "", gene_median_file: str = "", @@ -152,7 +153,7 @@ def __enter__(self) -> "GeneformerTokenizer": self.obs_df = self.obs(column_names=obs_column_names).concat().to_pandas().set_index("soma_joinid") return self - def cell_item(self, cell_joinid: int, cell_Xrow: scipy.sparse.csr_matrix) -> Dict[str, Any]: + def cell_item(self, cell_joinid: int, cell_Xrow: scipy.sparse.csr_matrix) -> dict[str, Any]: """Given the expression vector for one cell, compute the Dataset item providing the Geneformer inputs (token sequence and metadata). """ diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py index 6bf9aa30c..c17c29818 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py @@ -1,11 +1,12 @@ import gc import logging import os +from collections.abc import Iterator, Sequence from contextlib import contextmanager from datetime import timedelta from math import ceil from time import time -from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple +from typing import Any import numpy as np import numpy.typing as npt @@ -31,7 +32,7 @@ pytorch_logger = logging.getLogger("cellxgene_census.experimental.pytorch") # TODO: Rename to reflect the correct order of the Tensors within the tuple: (X, obs) -ObsAndXDatum = Tuple[Tensor, Tensor] +ObsAndXDatum = tuple[Tensor, Tensor] """Return type of ``ExperimentDataPipe`` that pairs a Tensor of ``obs`` row(s) with a Tensor of ``X`` matrix row(s). The Tensors are rank 1 if ``batch_size`` is 1, otherwise the Tensors are rank 2.""" @@ -53,7 +54,7 @@ def __len__(self) -> int: return len(self.obs) -Encoders = Dict[str, LabelEncoder] +Encoders = dict[str, LabelEncoder] """A dictionary of ``LabelEncoder``s keyed by the ``obs`` column name.""" @@ -92,7 +93,7 @@ def __add__(self, other: "Stats") -> "Stats": @contextmanager def _open_experiment( uri: str, - aws_region: Optional[str] = None, + aws_region: str | None = None, ) -> soma.Experiment: """Internal method for opening a SOMA ``Experiment`` as a context manager.""" context = get_default_soma_context().replace(tiledb_config={"vfs.s3.region": aws_region} if aws_region else {}) @@ -119,10 +120,10 @@ def __init__( obs: soma.DataFrame, X: soma.SparseNDArray, obs_column_names: Sequence[str], - obs_joinids_chunked: List[npt.NDArray[np.int64]], + obs_joinids_chunked: list[npt.NDArray[np.int64]], var_joinids: npt.NDArray[np.int64], - shuffle_chunk_count: Optional[int] = None, - shuffle_rng: Optional[Generator] = None, + shuffle_chunk_count: int | None = None, + shuffle_rng: Generator | None = None, ): self.obs = obs self.X = X @@ -191,7 +192,7 @@ def __next__(self) -> _SOMAChunk: return _SOMAChunk(obs=obs_batch, X=X_batch, stats=stats) -def list_split(arr_list: List[Any], sublist_len: int) -> List[List[Any]]: +def list_split(arr_list: list[Any], sublist_len: int) -> list[list[Any]]: """Splits a python list into a list of sublists where each sublist is of size `sublist_len`. TODO: Replace with `itertools.batched` when Python 3.12 becomes the minimum supported version. """ @@ -208,7 +209,7 @@ def list_split(arr_list: List[Any], sublist_len: int) -> List[List[Any]]: return result -def run_gc() -> Tuple[Tuple[Any, Any, Any], Tuple[Any, Any, Any]]: # noqa: D103 +def run_gc() -> tuple[tuple[Any, Any, Any], tuple[Any, Any, Any]]: # noqa: D103 proc = psutil.Process(os.getpid()) pre_gc = proc.memory_full_info(), psutil.virtual_memory(), psutil.swap_memory() @@ -234,7 +235,7 @@ class _ObsAndXIterator(Iterator[ObsAndXDatum]): soma_chunk_iter: Iterator[_SOMAChunk] """The iterator for SOMA chunks of paired obs and X data""" - soma_chunk: Optional[_SOMAChunk] + soma_chunk: _SOMAChunk | None """The current SOMA chunk of obs and X data""" i: int = -1 @@ -245,15 +246,15 @@ def __init__( obs: soma.DataFrame, X: soma.SparseNDArray, obs_column_names: Sequence[str], - obs_joinids_chunked: List[npt.NDArray[np.int64]], + obs_joinids_chunked: list[npt.NDArray[np.int64]], var_joinids: npt.NDArray[np.int64], batch_size: int, - encoders: Dict[str, LabelEncoder], + encoders: dict[str, LabelEncoder], stats: Stats, return_sparse_X: bool, use_eager_fetch: bool, - shuffle_chunk_count: Optional[int] = None, - shuffle_rng: Optional[Generator] = None, + shuffle_chunk_count: int | None = None, + shuffle_rng: Generator | None = None, ) -> None: self.soma_chunk_iter = _ObsAndXSOMAIterator( obs, X, obs_column_names, obs_joinids_chunked, var_joinids, shuffle_chunk_count, shuffle_rng @@ -392,15 +393,15 @@ class ExperimentDataPipe(pipes.IterDataPipe[Dataset[ObsAndXDatum]]): # type: ig _initialized: bool - _obs_joinids: Optional[npt.NDArray[np.int64]] + _obs_joinids: npt.NDArray[np.int64] | None - _var_joinids: Optional[npt.NDArray[np.int64]] + _var_joinids: npt.NDArray[np.int64] | None - _encoders: Optional[Encoders] + _encoders: Encoders | None _stats: Stats - _shuffle_rng: Optional[Generator] + _shuffle_rng: Generator | None # TODO: Consider adding another convenience method wrapper to construct this object whose signature is more closely # aligned with get_anndata() params (i.e. "exploded" AxisQuery params). @@ -409,16 +410,16 @@ def __init__( experiment: soma.Experiment, measurement_name: str = "RNA", X_name: str = "raw", - obs_query: Optional[soma.AxisQuery] = None, - var_query: Optional[soma.AxisQuery] = None, + obs_query: soma.AxisQuery | None = None, + var_query: soma.AxisQuery | None = None, obs_column_names: Sequence[str] = (), batch_size: int = 1, shuffle: bool = True, - seed: Optional[int] = None, + seed: int | None = None, return_sparse_X: bool = False, - soma_chunk_size: Optional[int] = 64, + soma_chunk_size: int | None = 64, use_eager_fetch: bool = True, - shuffle_chunk_count: Optional[int] = 2000, + shuffle_chunk_count: int | None = 2000, ) -> None: r"""Construct a new ``ExperimentDataPipe``. @@ -526,10 +527,10 @@ def _init(self) -> None: @staticmethod def _subset_ids_to_partition( - ids_chunked: List[npt.NDArray[np.int64]], + ids_chunked: list[npt.NDArray[np.int64]], partition_index: int, num_partitions: int, - ) -> List[npt.NDArray[np.int64]]: + ) -> list[npt.NDArray[np.int64]]: """Returns a single partition of the obs_joinids_chunked (a 2D ndarray), based upon the current process's distributed rank and world size. """ @@ -552,7 +553,7 @@ def _compute_partitions( loader_partitions: int, dist_partition: int, num_dist_partitions: int, - ) -> Tuple[int, int]: + ) -> tuple[int, int]: # NOTE: Can alternately use a `worker_init_fn` to split among workers split workload total_partitions = num_dist_partitions * loader_partitions partition = dist_partition * loader_partitions + loader_partition @@ -595,7 +596,7 @@ def __iter__(self) -> Iterator[ObsAndXDatum]: dist_partition=dist.get_rank() if dist.is_initialized() else 0, num_dist_partitions=dist.get_world_size() if dist.is_initialized() else 1, ) - obs_joinids_chunked_partition: List[npt.NDArray[np.int64]] = self._subset_ids_to_partition( + obs_joinids_chunked_partition: list[npt.NDArray[np.int64]] = self._subset_ids_to_partition( obs_joinids_chunked, partition, partitions ) @@ -622,7 +623,7 @@ def __iter__(self) -> Iterator[ObsAndXDatum]: ) @staticmethod - def _chunk_ids(ids: npt.NDArray[np.int64], chunk_size: int) -> List[npt.NDArray[np.int64]]: + def _chunk_ids(ids: npt.NDArray[np.int64], chunk_size: int) -> list[npt.NDArray[np.int64]]: num_chunks = max(1, ceil(len(ids) / chunk_size)) pytorch_logger.debug(f"Shuffling {len(ids)} obs joinids into {num_chunks} chunks of {chunk_size}") return np.array_split(ids, num_chunks) @@ -663,7 +664,7 @@ def stats(self) -> Stats: return self._stats @property - def shape(self) -> Tuple[int, int]: + def shape(self) -> tuple[int, int]: """Get the shape of the data that will be returned by this :class:`cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe`. This is the number of obs (cell) and var (feature) counts in the returned data. If used in multiprocessing mode (i.e. :class:`torch.utils.data.DataLoader` instantiated with num_workers > 0), the obs (cell) count will reflect diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py index c47ad9f1e..188513c65 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py @@ -1,14 +1,14 @@ from __future__ import annotations import os +from collections.abc import Callable, Sequence from concurrent import futures -from typing import Any, Callable, Sequence, cast +from typing import Any, Literal, cast import numpy as np import pandas as pd import tiledbsoma as soma from somacore.options import SparseDFCoord -from typing_extensions import Literal from ..._experiment import _get_experiment from ..util._eager_iter import _EagerIterator diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py index 2eaf71a7b..13bbe76c5 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py @@ -1,5 +1,3 @@ -from typing import Optional, Tuple - import numba import numpy as np import numpy.typing as npt @@ -43,7 +41,7 @@ def update( self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32], - batch_vec: Optional[npt.NDArray[np.int64]] = None, + batch_vec: npt.NDArray[np.int64] | None = None, ) -> None: if self.n_batches == 1: assert batch_vec is None @@ -54,7 +52,7 @@ def update( def finalize( self, - ) -> Tuple[ + ) -> tuple[ npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64], @@ -125,7 +123,7 @@ def update( self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32], - batch_vec: Optional[npt.NDArray[np.int64]] = None, + batch_vec: npt.NDArray[np.int64] | None = None, ) -> None: if self.n_batches == 1: assert batch_vec is None @@ -147,7 +145,7 @@ def update( self.clip_val, ) - def finalize(self) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: + def finalize(self) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: return self.counts_sum, self.squared_counts_sum @@ -282,7 +280,7 @@ def _mbomv_combine_batches( n_samples: npt.NDArray[np.int64], u: npt.NDArray[np.float64], M2: npt.NDArray[np.float64], -) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: +) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: """Combine all batches using Chan's parallel adaptation of Welford's. Returns tuple of (u, M2). diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py index a5930525a..8cef3e24e 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py @@ -1,7 +1,8 @@ from __future__ import annotations +from collections.abc import Generator from concurrent import futures -from typing import Any, Generator +from typing import Any import numpy as np import numpy.typing as npt diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py index 6ee5db37a..9c229b889 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py @@ -1,9 +1,10 @@ import logging import threading from collections import deque +from collections.abc import Iterator from concurrent import futures from concurrent.futures import Future -from typing import Deque, Iterator, Optional, TypeVar +from typing import TypeVar util_logger = logging.getLogger("cellxgene_census.experimental.util") @@ -14,13 +15,13 @@ class _EagerIterator(Iterator[_T]): def __init__( self, iterator: Iterator[_T], - pool: Optional[futures.Executor] = None, + pool: futures.Executor | None = None, ): super().__init__() self.iterator = iterator self._pool = pool or futures.ThreadPoolExecutor() self._own_pool = pool is None - self._future: Optional[Future[_T]] = None + self._future: Future[_T] | None = None self._begin_next() def _begin_next(self) -> None: @@ -56,14 +57,14 @@ def __init__( self, iterator: Iterator[_T], max_pending: int = 1, - pool: Optional[futures.Executor] = None, + pool: futures.Executor | None = None, ): super().__init__() self.iterator = iterator self.max_pending = max_pending self._pool = pool or futures.ThreadPoolExecutor() self._own_pool = pool is None - self._pending_results: Deque[futures.Future[_T]] = deque() + self._pending_results: deque[futures.Future[_T]] = deque() self._lock = threading.Lock() self._begin_next() diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_stats.py b/api/python/cellxgene_census/tests/experimental/pp/test_stats.py index 3c113ea07..ecc410d0e 100644 --- a/api/python/cellxgene_census/tests/experimental/pp/test_stats.py +++ b/api/python/cellxgene_census/tests/experimental/pp/test_stats.py @@ -1,4 +1,4 @@ -from typing import Any, Tuple, Union +from typing import Any import numpy as np import numpy.ma as ma @@ -11,7 +11,7 @@ from cellxgene_census.experimental import pp -def var(X: Union[sparse.csc_matrix, sparse.csr_matrix], axis: int = 0, ddof: int = 1) -> Any: +def var(X: sparse.csc_matrix | sparse.csr_matrix, axis: int = 0, ddof: int = 1) -> Any: """ Variance of a sparse matrix calculated as mean(X**2) - mean(X)**2 with Bessel's correction applied for unbiased estimate @@ -52,7 +52,7 @@ def test_mean_variance( calc_mean: bool, calc_variance: bool, small_mem_context: soma.SOMATileDBContext, - obs_coords: Tuple[None, slice], + obs_coords: tuple[None, slice], ) -> None: with cellxgene_census.open_soma(census_version="latest", context=small_mem_context) as census: with census["census_data"][experiment_name].axis_query( @@ -119,7 +119,7 @@ def test_mean_variance_nnz_only( calc_mean: bool, calc_variance: bool, small_mem_context: soma.SOMATileDBContext, - obs_coords: Tuple[None, slice], + obs_coords: tuple[None, slice], ) -> None: # Note: since this test requires materializing the matrix in memory to compute the mean/variance, # we're going to use a coord slice based approach. This will ensure the matrix can fit in memory. diff --git a/api/python/cellxgene_census/tests/test_acceptance.py b/api/python/cellxgene_census/tests/test_acceptance.py index d4587e03e..bd01b840b 100644 --- a/api/python/cellxgene_census/tests/test_acceptance.py +++ b/api/python/cellxgene_census/tests/test_acceptance.py @@ -11,7 +11,8 @@ See README.md for historical data. """ -from typing import Any, Dict, Iterator, Optional, Tuple +from collections.abc import Iterator +from typing import Any import pyarrow as pa import pytest @@ -21,7 +22,7 @@ from cellxgene_census._open import DEFAULT_TILEDB_CONFIGURATION -def make_context(census_version: str, config: Optional[Dict[str, Any]] = None) -> soma.SOMATileDBContext: +def make_context(census_version: str, config: dict[str, Any] | None = None) -> soma.SOMATileDBContext: config = config or {} version = cellxgene_census.get_census_version_description(census_version) s3_region = version["soma"].get("s3_region", "us-west-2") @@ -51,7 +52,7 @@ def test_load_axes(organism: str) -> None: del var_df -def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: Optional[int] = 2) -> bool: +def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: int | None = 2) -> bool: """ Utility that verifies that the value is an iterator of pa.Table. @@ -78,7 +79,7 @@ def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: Optional[int] = 2 pytest.param(None, DEFAULT_TILEDB_CONFIGURATION, marks=pytest.mark.expensive), ], ) -def test_incremental_read_obs(organism: str, stop_after: Optional[int], ctx_config: Optional[Dict[str, Any]]) -> None: +def test_incremental_read_obs(organism: str, stop_after: int | None, ctx_config: dict[str, Any] | None) -> None: """Verify that obs, var and X[raw] can be read incrementally, i.e., in chunks""" # ctx_config=None open census with a small (default) TileDB buffer size, which reduces @@ -101,7 +102,7 @@ def test_incremental_read_obs(organism: str, stop_after: Optional[int], ctx_conf pytest.param(None, DEFAULT_TILEDB_CONFIGURATION, marks=pytest.mark.expensive), ], ) -def test_incremental_read_var(organism: str, stop_after: Optional[int], ctx_config: Optional[Dict[str, Any]]) -> None: +def test_incremental_read_var(organism: str, stop_after: int | None, ctx_config: dict[str, Any] | None) -> None: """Verify that var can be read incrementally, i.e., in chunks""" # ctx_config=None open census with a small (default) TileDB buffer size, which reduces @@ -143,9 +144,9 @@ def test_incremental_read_var(organism: str, stop_after: Optional[int], ctx_conf ) def test_incremental_read_X( organism: str, - stop_after: Optional[int], - ctx_config: Optional[Dict[str, Any]], - coords: Optional[Tuple[slice, slice]], + stop_after: int | None, + ctx_config: dict[str, Any] | None, + coords: tuple[slice, slice] | None, ) -> None: """Verify that obs, var and X[raw] can be read incrementally, i.e., in chunks""" @@ -165,7 +166,7 @@ def test_incremental_read_X( ["tissue=='aorta'", pytest.param("tissue=='brain'", marks=pytest.mark.expensive)], ) @pytest.mark.parametrize("stop_after", [2, pytest.param(None, marks=pytest.mark.expensive)]) -def test_incremental_query(organism: str, obs_value_filter: str, stop_after: Optional[int]) -> None: +def test_incremental_query(organism: str, obs_value_filter: str, stop_after: int | None) -> None: """Verify incremental read of query result.""" # use default TileDB configuration with cellxgene_census.open_soma(census_version="latest") as census: @@ -260,9 +261,9 @@ def test_incremental_query(organism: str, obs_value_filter: str, stop_after: Opt ) def test_get_anndata( organism: str, - obs_value_filter: Optional[str], - obs_coords: Optional[slice], - ctx_config: Optional[Dict[str, Any]], + obs_value_filter: str | None, + obs_coords: slice | None, + ctx_config: dict[str, Any] | None, ) -> None: """Verify query and read into AnnData""" ctx_config = ctx_config or {} diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index 71ef42eb7..edc7f1e0a 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Literal +from typing import Any, Literal import numpy as np import pandas as pd @@ -93,7 +93,7 @@ def test_get_anndata_x_layer(census: soma.Collection, layer: str) -> None: @pytest.mark.live_corpus @pytest.mark.parametrize("layers", [["raw", "normalized"], ["normalized", "raw"]]) -def test_get_anndata_two_layers(census: soma.Collection, layers: List[str]) -> None: +def test_get_anndata_two_layers(census: soma.Collection, layers: list[str]) -> None: ad_primary_layer_in_X = cellxgene_census.get_anndata( census, organism="Homo sapiens", @@ -175,7 +175,7 @@ def test_get_anndata_obsm_one_layer(lts_census: soma.Collection, obsm_layer: str @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layers", [["scvi", "geneformer"]]) -def test_get_anndata_obsm_two_layers(lts_census: soma.Collection, obsm_layers: List[str]) -> None: +def test_get_anndata_obsm_two_layers(lts_census: soma.Collection, obsm_layers: list[str]) -> None: # NOTE: this test will break after next LTS release (>2023-12-15), since scvi and geneformer # won't be distributed as part of `obsm_layers` anymore. Delete this test when it happens. ad = cellxgene_census.get_anndata( @@ -195,7 +195,7 @@ def test_get_anndata_obsm_two_layers(lts_census: soma.Collection, obsm_layers: L @pytest.mark.live_corpus @pytest.mark.parametrize("obs_embeddings", [["scvi", "geneformer", "uce"]]) -def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: List[str]) -> None: +def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: list[str]) -> None: # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, # so this test could require adjustments. ad = cellxgene_census.get_anndata( @@ -216,7 +216,7 @@ def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: @pytest.mark.live_corpus @pytest.mark.parametrize("var_embeddings", [["nmf"]]) -def test_get_anndata_var_embeddings(lts_census: soma.Collection, var_embeddings: List[str]) -> None: +def test_get_anndata_var_embeddings(lts_census: soma.Collection, var_embeddings: list[str]) -> None: # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, # so this test could require adjustments. @@ -311,7 +311,7 @@ def test_deprecated_column_api(census: soma.Collection) -> None: pd.testing.assert_frame_equal(ad_curr.var, ad_prev.var) -def _map_to_get_anndata_args(query: Dict[str, Any], axis: Literal["obs", "var"]) -> Dict[str, Any]: +def _map_to_get_anndata_args(query: dict[str, Any], axis: Literal["obs", "var"]) -> dict[str, Any]: """Helper to map arguments of get_obs/ get_var to get_anndata.""" result = {} if "coords" in query: @@ -344,7 +344,7 @@ def _map_to_get_anndata_args(query: Dict[str, Any], axis: Literal["obs", "var"]) pytest.param({"value_filter": "tissue_general == 'vasculature'"}, id="value_filter"), ], ) -def test_get_obs(lts_census: soma.Collection, query: Dict[str, Any]) -> None: +def test_get_obs(lts_census: soma.Collection, query: dict[str, Any]) -> None: adata_obs = cellxgene_census.get_anndata( lts_census, organism="Mus musculus", **_map_to_get_anndata_args(query, "obs") ).obs @@ -370,7 +370,7 @@ def test_get_obs(lts_census: soma.Collection, query: Dict[str, Any]) -> None: pytest.param({"value_filter": "feature_name in ['Gm53058', '0610010K14Rik']"}, id="value_filter"), ], ) -def test_get_var(lts_census: soma.Collection, query: Dict[str, Any]) -> None: +def test_get_var(lts_census: soma.Collection, query: dict[str, Any]) -> None: adata_var = cellxgene_census.get_anndata( lts_census, organism="Mus musculus", obs_coords=slice(0), **_map_to_get_anndata_args(query, "var") ).var diff --git a/api/python/cellxgene_census/tests/test_lts_compat.py b/api/python/cellxgene_census/tests/test_lts_compat.py index dbe646cdd..2212f3076 100644 --- a/api/python/cellxgene_census/tests/test_lts_compat.py +++ b/api/python/cellxgene_census/tests/test_lts_compat.py @@ -9,7 +9,8 @@ from __future__ import annotations from collections import deque -from typing import Iterator, Literal, Sequence, Union, get_args +from collections.abc import Iterator, Sequence +from typing import Literal, Union, get_args import pyarrow as pa import pytest From 51556b7aa70fd2c4d8cb95f303993bbad54e6368 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 2 Jul 2024 21:14:53 +0000 Subject: [PATCH 08/15] Manual fixes --- .../src/cellxgene_census/experimental/_embedding.py | 2 +- .../cellxgene_census/tests/test_lts_compat.py | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index afd4b2552..20a593996 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -65,7 +65,7 @@ def _get_embedding( context: soma.options.SOMATileDBContext | None = None, ) -> npt.NDArray[np.float32]: """Private. Like get_embedding, but accepts a Census object and a Census directory.""" - if isinstance(obs_soma_joinids, (pa.Array, pa.ChunkedArray, pd.Series)): + if isinstance(obs_soma_joinids, pa.Array | pa.ChunkedArray | pd.Series): obs_soma_joinids = obs_soma_joinids.to_numpy() assert isinstance(obs_soma_joinids, np.ndarray) if obs_soma_joinids.dtype != np.int64: diff --git a/api/python/cellxgene_census/tests/test_lts_compat.py b/api/python/cellxgene_census/tests/test_lts_compat.py index 2212f3076..2c486d541 100644 --- a/api/python/cellxgene_census/tests/test_lts_compat.py +++ b/api/python/cellxgene_census/tests/test_lts_compat.py @@ -10,7 +10,7 @@ from collections import deque from collections.abc import Iterator, Sequence -from typing import Literal, Union, get_args +from typing import Literal, TypeAlias, get_args import pyarrow as pa import pytest @@ -28,14 +28,9 @@ ] CollectionTypeNames = ["SOMACollection", "SOMAExperiment", "SOMAMeasurement"] -SOMATypes = Union[ - soma.Collection, - soma.DataFrame, - soma.SparseNDArray, - soma.DenseNDArray, - soma.Experiment, - soma.Measurement, -] +SOMATypes: TypeAlias = ( + soma.Collection | soma.DataFrame | soma.SparseNDArray | soma.DenseNDArray | soma.Experiment | soma.Measurement +) def walk_census( From dcf94bd5aa83cf2b9260b70e001a31db27304487 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 3 Jul 2024 19:07:44 +0000 Subject: [PATCH 09/15] linting --- api/python/cellxgene_census/tests/test_user_agent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/python/cellxgene_census/tests/test_user_agent.py b/api/python/cellxgene_census/tests/test_user_agent.py index dc410df9a..41612c649 100644 --- a/api/python/cellxgene_census/tests/test_user_agent.py +++ b/api/python/cellxgene_census/tests/test_user_agent.py @@ -3,9 +3,10 @@ import json import os +from collections.abc import Callable from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING import numpy as np import proxy From c5c77d6e9f25a20f299dcabd4b9d4da9b2b269c3 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 8 Jul 2024 21:33:24 +0000 Subject: [PATCH 10/15] pre-commit --- .../src/cellxgene_census/experimental/ml/encoders.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py index 3d4fc4dc5..0be576ef6 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py @@ -1,6 +1,5 @@ import abc import functools -from typing import List import numpy.typing as npt import pandas as pd @@ -47,7 +46,7 @@ def name(self) -> str: @property @abc.abstractmethod - def columns(self) -> List[str]: + def columns(self) -> list[str]: """Columns in ``obs`` that the encoder will be applied to.""" pass @@ -77,7 +76,7 @@ def name(self) -> str: return self.col @property - def columns(self) -> List[str]: + def columns(self) -> list[str]: """Columns in ``obs`` that the encoder will be applied to.""" return [self.col] @@ -90,7 +89,7 @@ def classes_(self): # type: ignore class BatchEncoder(Encoder): """An encoder that concatenates and encodes several ``obs`` columns.""" - def __init__(self, cols: List[str], name: str = "batch"): + def __init__(self, cols: list[str], name: str = "batch"): self.cols = cols from sklearn.preprocessing import LabelEncoder @@ -115,7 +114,7 @@ def fit(self, obs: pd.DataFrame) -> None: self._encoder.fit(arr.unique()) @property - def columns(self) -> List[str]: + def columns(self) -> list[str]: """Columns in ``obs`` that the encoder will be applied to.""" return self.cols From 628073a0a421d792ef28a64ca7a01a539c474c06 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 4 Sep 2024 22:46:15 +0000 Subject: [PATCH 11/15] Try allowing installs on 3.12 --- api/python/cellxgene_census/pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index 3a7d41250..e9aa1979d 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] license = { text = "MIT" } readme = "README.md" -requires-python = ">= 3.10, < 3.12" +requires-python = ">= 3.10, < 3.13" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -24,12 +24,13 @@ classifiers = [ "Operating System :: MacOS :: MacOS X", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies= [ # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to # ensure that the assets are readable (tiledbsoma supports backward compatible reading). # Make sure this version does not fall behind the builder's tiledbsoma version. - "tiledbsoma~=1.12.3", + "tiledbsoma>=1.12.3", "anndata", "numpy>=1.23,<2.0", "requests", From 4cac29220b4a57687d1e0e5f8f07759b630f28f7 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 4 Sep 2024 23:13:02 +0000 Subject: [PATCH 12/15] Update test for missing creds --- api/python/cellxgene_census/tests/test_open.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/python/cellxgene_census/tests/test_open.py b/api/python/cellxgene_census/tests/test_open.py index df20b3337..5945ea9e4 100644 --- a/api/python/cellxgene_census/tests/test_open.py +++ b/api/python/cellxgene_census/tests/test_open.py @@ -442,8 +442,8 @@ def test_opening_census_without_anon_access_fails_with_bogus_creds() -> None: os.environ["AWS_SECRET_ACCESS_KEY"] = "fake_key" # Passing an empty context with pytest.raises( - tiledb.TileDBError, - match=r"The AWS Access Key Id you provided does not exist in our records", + (tiledb.TileDBError, soma.DoesNotExistError), + match=r"does not exist", ): cellxgene_census.open_soma(census_version="latest", context=soma.SOMATileDBContext()) From 6eec02d243fe32818f0f39cb0afda6b2d48f2002 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 4 Sep 2024 23:23:25 +0000 Subject: [PATCH 13/15] Update skipped testing env to macos+python=3.12 --- .github/workflows/py-unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index 044ce3b6e..e5ba6f0eb 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -24,7 +24,7 @@ jobs: python-version: ["3.10", "3.11", "3.12"] exclude: - os: macos-latest - python-version: "3.10" + python-version: "3.12" runs-on: ${{matrix.os}} From a3523b6ace47750bdd892436247ab2cadd73a55b Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 4 Sep 2024 23:43:50 +0000 Subject: [PATCH 14/15] Skip correct python+os combo for dep check --- .github/workflows/py-dependency-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml index aefd78f70..010409042 100644 --- a/.github/workflows/py-dependency-check.yml +++ b/.github/workflows/py-dependency-check.yml @@ -28,7 +28,7 @@ jobs: python-version: ["3.10", "3.11", "3.12"] exclude: - os: macos-latest - python-version: "3.10" + python-version: "3.12" runs-on: ${{matrix.os}} From 57e5d3b229e90e1911adbae41626126a9a1b2272 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 5 Sep 2024 10:55:21 -0700 Subject: [PATCH 15/15] Correct max version --- .github/workflows/py-dependency-check.yml | 2 +- api/python/notebooks/README.md | 2 +- docs/cellxgene_census_docsite_installation.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml index aefd78f70..010409042 100644 --- a/.github/workflows/py-dependency-check.yml +++ b/.github/workflows/py-dependency-check.yml @@ -28,7 +28,7 @@ jobs: python-version: ["3.10", "3.11", "3.12"] exclude: - os: macos-latest - python-version: "3.10" + python-version: "3.12" runs-on: ${{matrix.os}} diff --git a/api/python/notebooks/README.md b/api/python/notebooks/README.md index a53e477ab..cdf89656d 100644 --- a/api/python/notebooks/README.md +++ b/api/python/notebooks/README.md @@ -9,7 +9,7 @@ Demonstration notebooks for the CZ CELLxGENE Discover Census. There are two kind You must be on a Linux or MacOS system, with the following installed: -* Python 3.10 to 3.11 +* Python 3.10 to 3.12 * Jupyter or some other means of running notebooks (e.g., vscode) For now, it is recommended that you do all this on a host with sufficient memory, diff --git a/docs/cellxgene_census_docsite_installation.md b/docs/cellxgene_census_docsite_installation.md index 41347e9a3..0cfbd969b 100644 --- a/docs/cellxgene_census_docsite_installation.md +++ b/docs/cellxgene_census_docsite_installation.md @@ -4,7 +4,7 @@ The Census API requires a Linux or MacOS system with: -- Python 3.10 to Python 3.11. Or R, supported versions TBD. +- Python 3.10 to Python 3.12. Or R, supported versions TBD. - Recommended: >16 GB of memory. - Recommended: >5 Mbps internet connection. - Recommended: for increased performance use the API through a AWS-EC2 instance from the region `us-west-2`. The Census data builds are hosted in a AWS-S3 bucket in that region.