diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml index 3afc6a06a..010409042 100644 --- a/.github/workflows/py-dependency-check.yml +++ b/.github/workflows/py-dependency-check.yml @@ -25,10 +25,10 @@ jobs: fail-fast: false # don't fail-fast, as errors are often specific to a single cell in the matrix matrix: os: [sc-dev-64g-runner, macos-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] exclude: - os: macos-latest - python-version: "3.8" + python-version: "3.12" runs-on: ${{matrix.os}} diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index f224f4560..e5ba6f0eb 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -21,10 +21,10 @@ jobs: fail-fast: false # Don't stop the workflow if one of the jobs fails matrix: os: [sc-dev-64g-runner, macos-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] exclude: - os: macos-latest - python-version: "3.8" + python-version: "3.12" runs-on: ${{matrix.os}} diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index f050bef75..e9aa1979d 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] license = { text = "MIT" } readme = "README.md" -requires-python = ">= 3.8, < 3.12" +requires-python = ">= 3.10, < 3.13" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -22,18 +22,17 @@ classifiers = [ "Topic :: Scientific/Engineering :: Bio-Informatics", "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies= [ # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to # ensure that the assets are readable (tiledbsoma supports backward compatible reading). # Make sure this version does not fall behind the builder's tiledbsoma version. - "tiledbsoma~=1.12.3", + "tiledbsoma>=1.12.3", "anndata", - "numpy>=1.21,<2.0", + "numpy>=1.23,<2.0", "requests", "typing_extensions", "s3fs>=2021.06.1", @@ -43,9 +42,8 @@ dependencies= [ experimental = [ "torch", "torchdata~=0.7", - "scikit-learn~=1.0", + "scikit-learn>=1.2", "scikit-misc>=0.2,<0.4", # scikit-misc 0.3 dropped Python 3.8 support, and 0.4 doesn't have MacOS/ARM wheels - "psutil~=5.0", "datasets~=2.0", "tdigest~=0.5", # choose newest version of tiledb-vector-search that doesn't need a newer version of tiledb @@ -81,7 +79,7 @@ root = "../../.." [tool.ruff] line-length = 120 src = ["api/python/cellxgene_census/src"] -target-version = "py38" +target-version = "py310" [tool.ruff.lint] select = [ @@ -129,6 +127,8 @@ ignore = [ "D205", # Prefer absolute imports over relative imports from parent modules TODO: enable "TID252", + # It's okay to use zip without the strict kwarg. 
In fact, numba doesn't like it when you use it + "B905", ] [tool.ruff.lint.pydocstyle] diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index e37337184..9d7a5c41b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -7,7 +7,8 @@ Methods to retrieve slices of the census as AnnData objects. """ -from typing import Literal, Optional, Sequence +from collections.abc import Sequence +from typing import Literal from warnings import warn import anndata @@ -27,20 +28,20 @@ def get_anndata( organism: str, measurement_name: str = "RNA", X_name: str = "raw", - X_layers: Optional[Sequence[str]] = (), - obsm_layers: Optional[Sequence[str]] = (), - obsp_layers: Optional[Sequence[str]] = (), - varm_layers: Optional[Sequence[str]] = (), - varp_layers: Optional[Sequence[str]] = (), - obs_value_filter: Optional[str] = None, - obs_coords: Optional[SparseDFCoord] = None, - var_value_filter: Optional[str] = None, - var_coords: Optional[SparseDFCoord] = None, - column_names: Optional[soma.AxisColumnNames] = None, - obs_embeddings: Optional[Sequence[str]] = (), - var_embeddings: Optional[Sequence[str]] = (), - obs_column_names: Optional[Sequence[str]] = None, - var_column_names: Optional[Sequence[str]] = None, + X_layers: Sequence[str] | None = (), + obsm_layers: Sequence[str] | None = (), + obsp_layers: Sequence[str] | None = (), + varm_layers: Sequence[str] | None = (), + varp_layers: Sequence[str] | None = (), + obs_value_filter: str | None = None, + obs_coords: SparseDFCoord | None = None, + var_value_filter: str | None = None, + var_coords: SparseDFCoord | None = None, + column_names: soma.AxisColumnNames | None = None, + obs_embeddings: Sequence[str] | None = (), + var_embeddings: Sequence[str] | None = (), + obs_column_names: Sequence[str] | None = None, + var_column_names: Sequence[str] | None = None, ) -> anndata.AnnData: """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query, and return it as an :class:`anndata.AnnData` object. @@ -176,9 +177,9 @@ def _get_axis_metadata( axis: Literal["obs", "var"], organism: str, *, - value_filter: Optional[str] = None, - coords: Optional[SparseDFCoord] = slice(None), - column_names: Optional[Sequence[str]] = None, + value_filter: str | None = None, + coords: SparseDFCoord | None = slice(None), + column_names: Sequence[str] | None = None, ) -> pd.DataFrame: exp = _get_experiment(census, organism) coords = (slice(None),) if coords is None else (coords,) @@ -198,9 +199,9 @@ def get_obs( census: soma.Collection, organism: str, *, - value_filter: Optional[str] = None, - coords: Optional[SparseDFCoord] = slice(None), - column_names: Optional[Sequence[str]] = None, + value_filter: str | None = None, + coords: SparseDFCoord | None = slice(None), + column_names: Sequence[str] | None = None, ) -> pd.DataFrame: """Get the observation metadata for a query on the census. @@ -230,9 +231,9 @@ def get_var( census: soma.Collection, organism: str, *, - value_filter: Optional[str] = None, - coords: Optional[SparseDFCoord] = slice(None), - column_names: Optional[Sequence[str]] = None, + value_filter: str | None = None, + coords: SparseDFCoord | None = slice(None), + column_names: Sequence[str] | None = None, ) -> pd.DataFrame: """Get the variable metadata for a query on the census. 
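The changes above — repeated throughout this patch — replace `typing.Optional[X]` with PEP 604 `X | None` unions and `typing.Sequence`/`typing.Dict` with `collections.abc` and builtin generics (PEP 585). Evaluated at `def` time, the `X | None` spelling requires Python 3.10, which is what drives the new version floor. A minimal, self-contained sketch of the style; the `describe` helper is hypothetical, not part of this package:

```python
# Annotation style adopted by this patch: PEP 604 unions and PEP 585 builtin
# generics, both evaluated at function-definition time on Python 3.10+.
from collections.abc import Sequence


def describe(names: Sequence[str] | None = None, limit: int | None = 10) -> dict[str, int]:
    """Hypothetical helper: map each of the first `limit` names to its length."""
    return {name: len(name) for name in list(names or [])[:limit]}


print(describe(["obs", "var"]))  # {'obs': 3, 'var': 3}
```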
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_open.py b/api/python/cellxgene_census/src/cellxgene_census/_open.py index 642e6fbb6..5af0fea0b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_open.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_open.py @@ -10,7 +10,7 @@ import logging import os.path import urllib.parse -from typing import Any, Dict, Optional, get_args +from typing import Any, get_args import s3fs import tiledbsoma as soma @@ -32,7 +32,7 @@ "anon": True, "cache_regions": True, } -DEFAULT_TILEDB_CONFIGURATION: Dict[str, Any] = { +DEFAULT_TILEDB_CONFIGURATION: dict[str, Any] = { # https://docs.tiledb.com/main/how-to/configuration#configuration-parameters "py.init_buffer_bytes": 1 * 1024**3, "soma.init_buffer_bytes": 1 * 1024**3, @@ -71,7 +71,7 @@ def _resolve_census_locator(locator: CensusLocator, mirror: CensusMirror) -> Res def _open_soma( locator: ResolvedCensusLocator, - context: Optional[soma.options.SOMATileDBContext] = None, + context: soma.options.SOMATileDBContext | None = None, ) -> soma.Collection: """Private. Merge config defaults and return open census as a soma Collection/context.""" # if no user-defined context, cellxgene_census defaults take precedence over SOMA defaults @@ -85,7 +85,7 @@ def _open_soma( return soma.open(locator["uri"], mode="r", soma_type=soma.Collection, context=context) -def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> soma.options.SOMATileDBContext: +def get_default_soma_context(tiledb_config: dict[str, Any] | None = None) -> soma.options.SOMATileDBContext: """Return a :class:`tiledbsoma.SOMATileDBContext` with sensible defaults that can be further customized by the user. The customized context can then be passed to :func:`cellxgene_census.open_soma` with the ``context`` argument or to :meth:`somacore.SOMAObject.open` with the ``context`` argument, such as @@ -132,11 +132,11 @@ def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> def open_soma( *, - census_version: Optional[str] = DEFAULT_CENSUS_VERSION, - mirror: Optional[str] = None, - uri: Optional[str] = None, - tiledb_config: Optional[Dict[str, Any]] = None, - context: Optional[soma.options.SOMATileDBContext] = None, + census_version: str | None = DEFAULT_CENSUS_VERSION, + mirror: str | None = None, + uri: str | None = None, + tiledb_config: dict[str, Any] | None = None, + context: soma.options.SOMATileDBContext | None = None, ) -> soma.Collection: """Open the Census by version or URI. diff --git a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py index 523603ea5..177094e90 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py @@ -7,9 +7,8 @@ Methods to retrieve information about versions of the publicly hosted Census object. 
""" -import typing from collections import OrderedDict -from typing import Any, Dict, Literal, Optional, Union, cast +from typing import Any, Literal, cast import requests from typing_extensions import NotRequired, TypedDict @@ -37,7 +36,7 @@ class CensusLocator(TypedDict): uri: str relative_uri: str - s3_region: Optional[str] + s3_region: str | None class CensusVersionRetraction(TypedDict): @@ -55,13 +54,13 @@ class CensusVersionRetraction(TypedDict): """ date: str - reason: Optional[str] - info_url: Optional[str] - replaced_by: Optional[str] + reason: str | None + info_url: str | None + replaced_by: str | None ReleaseFlag = Literal["lts", "retracted"] -ReleaseFlags = Dict[ReleaseFlag, bool] +ReleaseFlags = dict[ReleaseFlag, bool] class CensusVersionDescription(TypedDict): @@ -82,7 +81,7 @@ class CensusVersionDescription(TypedDict): If retracted, details of the retraction. """ - release_date: Optional[str] + release_date: str | None release_build: str soma: CensusLocator h5ads: CensusLocator @@ -90,7 +89,7 @@ class CensusVersionDescription(TypedDict): retraction: NotRequired[CensusVersionRetraction] -CensusDirectory = Dict[CensusVersionName, Union[CensusVersionName, CensusVersionDescription]] +CensusDirectory = dict[CensusVersionName, CensusVersionName | CensusVersionDescription] """ A provider identifies a storage medium for the Census, which can either be a cloud provider or a local file. @@ -132,11 +131,11 @@ class CensusMirror(TypedDict): provider: Provider base_uri: str - region: Optional[str] + region: str | None embeddings_base_uri: str -CensusMirrors = Dict[CensusMirrorName, Union[CensusMirrorName, CensusMirror]] +CensusMirrors = dict[CensusMirrorName, CensusMirrorName | CensusMirror] class ResolvedCensusLocator(TypedDict): @@ -155,7 +154,7 @@ class ResolvedCensusLocator(TypedDict): """ uri: str - region: Optional[str] + region: str | None provider: str @@ -200,8 +199,8 @@ def get_census_version_description(census_version: str) -> CensusVersionDescript def get_census_version_directory( - *, lts: Optional[bool] = None, retracted: Optional[bool] = False -) -> Dict[CensusVersionName, CensusVersionDescription]: + *, lts: bool | None = None, retracted: bool | None = False +) -> dict[CensusVersionName, CensusVersionDescription]: """Get the directory of Census versions currently available, optionally filtering by specified flags. If a filtering flag is not specified, Census versions will not be filtered by that flag. 
Defaults to including both "long-term stable" (LTS) and weekly Census versions, and excluding @@ -358,7 +357,7 @@ def get_census_version_directory( directory: dict[str, str | dict[str, Any]] = response.json() directory_out: CensusDirectory = {} - aliases: typing.Set[CensusVersionName] = set() + aliases: set[CensusVersionName] = set() # Resolve all aliases for easier use for census_version_name in list(directory.keys()): @@ -401,7 +400,7 @@ def get_census_version_directory( directory_out[census_version_name] = census_version_description.copy() # Cast is safe, as we have removed all aliases - unordered_directory = cast(Dict[CensusVersionName, CensusVersionDescription], directory_out) + unordered_directory = cast(dict[CensusVersionName, CensusVersionDescription], directory_out) # Sort by aliases and release date, descending aliased_releases = [(k, v) for k, v in unordered_directory.items() if k in aliases] @@ -417,7 +416,7 @@ def get_census_version_directory( return ordered_directory -def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]: +def get_census_mirror_directory() -> dict[CensusMirrorName, CensusMirror]: """Get the directory of Census mirrors currently available. Returns: @@ -429,7 +428,7 @@ def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]: """ mirrors = _get_census_mirrors() del mirrors["default"] - return cast(Dict[CensusMirrorName, CensusMirror], mirrors) + return cast(dict[CensusMirrorName, CensusMirror], mirrors) def _get_census_mirrors() -> CensusMirrors: diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 4baba8e06..34d93ef42 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -8,7 +8,7 @@ import json import warnings -from typing import Any, Dict, cast +from typing import Any, cast import numpy as np import numpy.typing as npt @@ -55,7 +55,7 @@ def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBC embedding_metadata = json.loads(E.metadata["CxG_embedding_info"]) assert isinstance(embedding_metadata, dict) - return cast(Dict[str, Any], embedding_metadata) + return cast(dict[str, Any], embedding_metadata) def _get_embedding( @@ -67,7 +67,7 @@ def _get_embedding( context: soma.options.SOMATileDBContext | None = None, ) -> npt.NDArray[np.float32]: """Private. 
Like get_embedding, but accepts a Census object and a Census directory.""" - if isinstance(obs_soma_joinids, (pa.Array, pa.ChunkedArray, pd.Series)): + if isinstance(obs_soma_joinids, pa.Array | pa.ChunkedArray | pd.Series): obs_soma_joinids = obs_soma_joinids.to_numpy() assert isinstance(obs_soma_joinids, np.ndarray) if obs_soma_joinids.dtype != np.int64: @@ -194,7 +194,7 @@ def get_embedding_metadata_by_name( response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, headers={"User-Agent": _user_agent()}) response.raise_for_status() - manifest = cast(Dict[str, Dict[str, Any]], response.json()) + manifest = cast(dict[str, dict[str, Any]], response.json()) embeddings = [] for _, obj in manifest.items(): if ( diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding_search.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding_search.py index 6290f418b..8d095a08c 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding_search.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding_search.py @@ -1,7 +1,8 @@ """Nearest-neighbor search based on vector index of Census embeddings.""" +from collections.abc import Sequence from contextlib import ExitStack -from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, cast +from typing import Any, NamedTuple, cast import anndata as ad import numpy as np @@ -42,9 +43,9 @@ def find_nearest_obs( k: int = 10, nprobe: int = 100, memory_GiB: int = 4, - mirror: Optional[str] = None, - embedding_metadata: Optional[Dict[str, Any]] = None, - **kwargs: Dict[str, Any], + mirror: str | None = None, + embedding_metadata: dict[str, Any] | None = None, + **kwargs: dict[str, Any], ) -> NeighborObs: """Search Census for similar obs (cells) based on nearest neighbors in embedding space. @@ -96,9 +97,9 @@ def find_nearest_obs( def _resolve_embedding_index( - embedding_metadata: Dict[str, Any], - mirror: Optional[str] = None, -) -> Optional[Tuple[str, str]]: + embedding_metadata: dict[str, Any], + mirror: str | None = None, +) -> tuple[str, str] | None: index_metadata = embedding_metadata.get("indexes", None) if not index_metadata: return None @@ -116,7 +117,7 @@ def predict_obs_metadata( census_version: str, neighbors: NeighborObs, column_names: Sequence[str], - experiment: Optional[soma.Experiment] = None, + experiment: soma.Experiment | None = None, ) -> pd.DataFrame: """Predict obs metadata attributes for the query cells based on the embedding nearest neighbors. @@ -156,7 +157,7 @@ def predict_obs_metadata( # step through query cells to generate prediction for each column as the plurality value # found among its neighbors, with a confidence score based on the simple fraction (for now) # TODO: something more intelligent for numeric columns! also use distances, etc. 
- out: Dict[str, List[Any]] = {} + out: dict[str, list[Any]] = {} for i in range(neighbors.neighbor_ids.shape[0]): neighbors_i = neighbor_obs.loc[neighbors.neighbor_ids[i]] for col in column_names: diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py index 3d4fc4dc5..0be576ef6 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py @@ -1,6 +1,5 @@ import abc import functools -from typing import List import numpy.typing as npt import pandas as pd @@ -47,7 +46,7 @@ def name(self) -> str: @property @abc.abstractmethod - def columns(self) -> List[str]: + def columns(self) -> list[str]: """Columns in ``obs`` that the encoder will be applied to.""" pass @@ -77,7 +76,7 @@ def name(self) -> str: return self.col @property - def columns(self) -> List[str]: + def columns(self) -> list[str]: """Columns in ``obs`` that the encoder will be applied to.""" return [self.col] @@ -90,7 +89,7 @@ def classes_(self): # type: ignore class BatchEncoder(Encoder): """An encoder that concatenates and encodes several ``obs`` columns.""" - def __init__(self, cols: List[str], name: str = "batch"): + def __init__(self, cols: list[str], name: str = "batch"): self.cols = cols from sklearn.preprocessing import LabelEncoder @@ -115,7 +114,7 @@ def fit(self, obs: pd.DataFrame) -> None: self._encoder.fit(arr.unique()) @property - def columns(self) -> List[str]: + def columns(self) -> list[str]: """Columns in ``obs`` that the encoder will be applied to.""" return self.cols diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py index 07d2212c8..5a9c2d626 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py @@ -1,6 +1,7 @@ import uuid from abc import ABC, abstractmethod -from typing import Any, Dict, Generator, Optional +from collections.abc import Generator +from typing import Any import scipy.sparse from datasets import Dataset @@ -37,7 +38,7 @@ def __init__( measurement_name: str = "RNA", layer_name: str = "raw", *, - block_size: Optional[int] = None, + block_size: int | None = None, **kwargs: Any, ): """Initialize the CellDatasetBuilder to process the results of a Census @@ -55,13 +56,13 @@ def __init__( self.layer_name = layer_name self.block_size = block_size - def build(self, from_generator_kwargs: Optional[Dict[str, Any]] = None) -> Dataset: + def build(self, from_generator_kwargs: dict[str, Any] | None = None) -> Dataset: """Build the dataset from query results. 
- `from_generator_kwargs`: kwargs passed through to `Dataset.from_generator()` """ - def gen() -> Generator[Dict[str, Any], None, None]: + def gen() -> Generator[dict[str, Any], None, None]: for Xblock, (block_cell_joinids, _) in ( self.X(self.layer_name).blockwise(axis=0, reindex_disable_on_axis=[1], size=self.block_size).scipy() ): @@ -73,7 +74,7 @@ def gen() -> Generator[Dict[str, Any], None, None]: return Dataset.from_generator(_DatasetGeneratorPickleHack(gen), **(from_generator_kwargs or {})) @abstractmethod - def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> Dict[str, Any]: + def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> dict[str, Any]: """Abstract method to process the X row for one cell into a Dataset item. - `cell_joinid`: The cell `soma_joinid`. @@ -86,7 +87,7 @@ def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> Dict[str class _DatasetGeneratorPickleHack: """SEE: https://github.com/huggingface/datasets/issues/6194.""" - def __init__(self, generator: Any, generator_id: Optional[str] = None) -> None: + def __init__(self, generator: Any, generator_id: str | None = None) -> None: self.generator = generator self.generator_id = generator_id if generator_id is not None else str(uuid.uuid4()) diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py index 48ea8fdea..7303d3bbf 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py @@ -1,5 +1,6 @@ import pickle -from typing import Any, Dict, List, Optional, Sequence, Set +from collections.abc import Sequence +from typing import Any import numpy as np import numpy.typing as npt @@ -42,7 +43,7 @@ class GeneformerTokenizer(CellDatasetBuilder): - and the specified `obs_column_names` (cell metadata from the experiment obs dataframe) """ - obs_column_names: Set[str] + obs_column_names: set[str] max_input_tokens: int special_token: bool @@ -54,15 +55,15 @@ class GeneformerTokenizer(CellDatasetBuilder): model_gene_map: scipy.sparse.coo_matrix model_gene_tokens: npt.NDArray[np.int64] # Geneformer token for each column of model_gene_map model_gene_medians: npt.NDArray[np.float64] # float for each column of model_gene_map - model_cls_token: Optional[np.int64] = None - model_eos_token: Optional[np.int64] = None + model_cls_token: np.int64 | None = None + model_eos_token: np.int64 | None = None def __init__( self, experiment: tiledbsoma.Experiment, *, - obs_column_names: Optional[Sequence[str]] = None, - obs_attributes: Optional[Sequence[str]] = None, + obs_column_names: Sequence[str] | None = None, + obs_attributes: Sequence[str] | None = None, max_input_tokens: int = 2048, special_token: bool = False, token_dictionary_file: str = "", @@ -147,10 +148,10 @@ def _load_geneformer_data( map_data = [] map_i = [] map_j = [] - model_gene_id_by_ensg: Dict[str, int] = {} + model_gene_id_by_ensg: dict[str, int] = {} model_gene_count = 0 - model_gene_tokens: List[np.int64] = [] - model_gene_medians: List[np.float64] = [] + model_gene_tokens: list[np.int64] = [] + model_gene_medians: list[np.float64] = [] for gene_id, row in genes_df.iterrows(): ensg = row["feature_id"] # ENSG... 
gene id, which keys Geneformer's dicts if gene_mapping is not None: @@ -198,7 +199,7 @@ def __enter__(self) -> "GeneformerTokenizer": self.obs_df = self.obs(column_names=obs_column_names).concat().to_pandas().set_index("soma_joinid") return self - def cell_item(self, cell_joinid: int, cell_Xrow: scipy.sparse.csr_matrix) -> Dict[str, Any]: + def cell_item(self, cell_joinid: int, cell_Xrow: scipy.sparse.csr_matrix) -> dict[str, Any]: """Given the expression vector for one cell, compute the Dataset item providing the Geneformer inputs (token sequence and metadata). """ diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py index 5bef673c9..2e12f68b6 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py @@ -3,11 +3,12 @@ import logging import os import typing +from collections.abc import Iterator, Sequence from contextlib import contextmanager from datetime import timedelta from math import ceil from time import time -from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union +from typing import Any, TypeAlias import numpy as np import numpy.typing as npt @@ -32,13 +33,13 @@ pytorch_logger = logging.getLogger("cellxgene_census.experimental.pytorch") # TODO: Rename to reflect the correct order of the Tensors within the tuple: (X, obs) -ObsAndXDatum = Tuple[Tensor, Tensor] +ObsAndXDatum = tuple[Tensor, Tensor] """Return type of ``ExperimentDataPipe`` that pairs a Tensor of ``obs`` row(s) with a Tensor of ``X`` matrix row(s). The Tensors are rank 1 if ``batch_size`` is 1, otherwise the Tensors are rank 2.""" # "Chunk" of X data, returned by each `Method` above -ChunkX = Union[npt.NDArray[Any], sparse.csr_matrix] +ChunkX: TypeAlias = npt.NDArray[Any] | sparse.csr_matrix @define @@ -58,7 +59,7 @@ def __len__(self) -> int: return len(self.obs) -Encoders = Dict[str, Encoder] +Encoders = dict[str, Encoder] """A dictionary of ``Encoder``s keyed by the ``obs`` column name.""" @@ -97,7 +98,7 @@ def __add__(self, other: "Stats") -> "Stats": @contextmanager def _open_experiment( uri: str, - aws_region: Optional[str] = None, + aws_region: str | None = None, ) -> soma.Experiment: """Internal method for opening a SOMA ``Experiment`` as a context manager.""" context = get_default_soma_context().replace(tiledb_config={"vfs.s3.region": aws_region} if aws_region else {}) @@ -107,8 +108,8 @@ def _open_experiment( def _tables_to_np( - tables: Iterator[Tuple[Table, Any]], shape: Tuple[int, int] -) -> typing.Generator[Tuple[npt.NDArray[Any], Any, int], None, None]: + tables: Iterator[tuple[Table, Any]], shape: tuple[int, int] +) -> typing.Generator[tuple[npt.NDArray[Any], Any, int], None, None]: for tbl, indices in tables: row_indices, col_indices, data = (x.to_numpy() for x in tbl.columns) nnz = len(data) @@ -135,10 +136,10 @@ def __init__( obs: soma.DataFrame, X: soma.SparseNDArray, obs_column_names: Sequence[str], - obs_joinids_chunked: List[npt.NDArray[np.int64]], + obs_joinids_chunked: list[npt.NDArray[np.int64]], var_joinids: npt.NDArray[np.int64], - shuffle_chunk_count: Optional[int] = None, - shuffle_rng: Optional[Generator] = None, + shuffle_chunk_count: int | None = None, + shuffle_rng: Generator | None = None, return_sparse_X: bool = False, ): self.obs = obs @@ -221,7 +222,7 @@ def __next__(self) -> _SOMAChunk: return _SOMAChunk(obs=obs_batch, X=X_batch, 
stats=stats) -def list_split(arr_list: List[Any], sublist_len: int) -> List[List[Any]]: +def list_split(arr_list: list[Any], sublist_len: int) -> list[list[Any]]: """Splits a python list into a list of sublists where each sublist is of size `sublist_len`. TODO: Replace with `itertools.batched` when Python 3.12 becomes the minimum supported version. """ @@ -238,7 +239,7 @@ def list_split(arr_list: List[Any], sublist_len: int) -> List[List[Any]]: return result -def run_gc() -> Tuple[Tuple[Any, Any, Any], Tuple[Any, Any, Any], float]: # noqa: D103 +def run_gc() -> tuple[tuple[Any, Any, Any], tuple[Any, Any, Any], float]: # noqa: D103 proc = psutil.Process(os.getpid()) pre_gc = proc.memory_full_info(), psutil.virtual_memory(), psutil.swap_memory() @@ -266,7 +267,7 @@ class _ObsAndXIterator(Iterator[ObsAndXDatum]): soma_chunk_iter: Iterator[_SOMAChunk] """The iterator for SOMA chunks of paired obs and X data""" - soma_chunk: Optional[_SOMAChunk] + soma_chunk: _SOMAChunk | None """The current SOMA chunk of obs and X data""" i: int = -1 @@ -277,15 +278,15 @@ def __init__( obs: soma.DataFrame, X: soma.SparseNDArray, obs_column_names: Sequence[str], - obs_joinids_chunked: List[npt.NDArray[np.int64]], + obs_joinids_chunked: list[npt.NDArray[np.int64]], var_joinids: npt.NDArray[np.int64], batch_size: int, - encoders: List[Encoder], + encoders: list[Encoder], stats: Stats, return_sparse_X: bool, use_eager_fetch: bool, - shuffle_chunk_count: Optional[int] = None, - shuffle_rng: Optional[Generator] = None, + shuffle_chunk_count: int | None = None, + shuffle_rng: Generator | None = None, ) -> None: self.soma_chunk_iter = _ObsAndXSOMAIterator( obs, @@ -362,7 +363,7 @@ def __next__(self) -> ObsAndXDatum: return X_tensor, obs_tensor - def _read_partial_torch_batch(self, batch_size: int) -> Tuple[pd.DataFrame, ChunkX]: + def _read_partial_torch_batch(self, batch_size: int) -> tuple[pd.DataFrame, ChunkX]: """Reads a torch-size batch of data from the current SOMA chunk, returning a torch-size batch whose size may contain fewer rows than the requested ``batch_size``. This can happen when the remaining rows in the current SOMA chunk are fewer than the requested ``batch_size``. @@ -443,15 +444,15 @@ class ExperimentDataPipe(pipes.IterDataPipe[Dataset[ObsAndXDatum]]): # type: ig _initialized: bool - _obs_joinids: Optional[npt.NDArray[np.int64]] + _obs_joinids: npt.NDArray[np.int64] | None - _var_joinids: Optional[npt.NDArray[np.int64]] + _var_joinids: npt.NDArray[np.int64] | None - _encoders: List[Encoder] + _encoders: list[Encoder] _stats: Stats - _shuffle_rng: Optional[Generator] + _shuffle_rng: Generator | None # TODO: Consider adding another convenience method wrapper to construct this object whose signature is more closely # aligned with get_anndata() params (i.e. "exploded" AxisQuery params). 
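The `list_split` helper above carries a TODO to switch to `itertools.batched` once Python 3.12 becomes the minimum supported version. A sketch of that equivalence, assuming a simple inline backport for 3.10/3.11; this is illustrative only and not part of the patch:

```python
import sys
from itertools import islice

if sys.version_info >= (3, 12):
    from itertools import batched
else:
    def batched(iterable, n):
        # Backport sketch: yield successive n-sized tuples; the last may be shorter.
        it = iter(iterable)
        while chunk := tuple(islice(it, n)):
            yield chunk

# Same chunking behavior that list_split() provides for obs joinid arrays:
assert [list(b) for b in batched(range(7), 3)] == [[0, 1, 2], [3, 4, 5], [6]]
```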
@@ -460,17 +461,17 @@ def __init__( experiment: soma.Experiment, measurement_name: str = "RNA", X_name: str = "raw", - obs_query: Optional[soma.AxisQuery] = None, - var_query: Optional[soma.AxisQuery] = None, + obs_query: soma.AxisQuery | None = None, + var_query: soma.AxisQuery | None = None, obs_column_names: Sequence[str] = (), batch_size: int = 1, shuffle: bool = True, - seed: Optional[int] = None, + seed: int | None = None, return_sparse_X: bool = False, - soma_chunk_size: Optional[int] = 64, + soma_chunk_size: int | None = 64, use_eager_fetch: bool = True, - encoders: Optional[List[Encoder]] = None, - shuffle_chunk_count: Optional[int] = 2000, + encoders: list[Encoder] | None = None, + shuffle_chunk_count: int | None = 2000, ) -> None: r"""Construct a new ``ExperimentDataPipe``. @@ -596,10 +597,10 @@ def _init(self) -> None: @staticmethod def _subset_ids_to_partition( - ids_chunked: List[npt.NDArray[np.int64]], + ids_chunked: list[npt.NDArray[np.int64]], partition_index: int, num_partitions: int, - ) -> List[npt.NDArray[np.int64]]: + ) -> list[npt.NDArray[np.int64]]: """Returns a single partition of the obs_joinids_chunked (a 2D ndarray), based upon the current process's distributed rank and world size. """ @@ -622,7 +623,7 @@ def _compute_partitions( loader_partitions: int, dist_partition: int, num_dist_partitions: int, - ) -> Tuple[int, int]: + ) -> tuple[int, int]: # NOTE: Can alternately use a `worker_init_fn` to split among workers split workload total_partitions = num_dist_partitions * loader_partitions partition = dist_partition * loader_partitions + loader_partition @@ -665,7 +666,7 @@ def __iter__(self) -> Iterator[ObsAndXDatum]: dist_partition=dist.get_rank() if dist.is_initialized() else 0, num_dist_partitions=dist.get_world_size() if dist.is_initialized() else 1, ) - obs_joinids_chunked_partition: List[npt.NDArray[np.int64]] = self._subset_ids_to_partition( + obs_joinids_chunked_partition: list[npt.NDArray[np.int64]] = self._subset_ids_to_partition( obs_joinids_chunked, partition, partitions ) @@ -693,7 +694,7 @@ def __iter__(self) -> Iterator[ObsAndXDatum]: ) @staticmethod - def _chunk_ids(ids: npt.NDArray[np.int64], chunk_size: int) -> List[npt.NDArray[np.int64]]: + def _chunk_ids(ids: npt.NDArray[np.int64], chunk_size: int) -> list[npt.NDArray[np.int64]]: num_chunks = max(1, ceil(len(ids) / chunk_size)) pytorch_logger.debug(f"Shuffling {len(ids)} obs joinids into {num_chunks} chunks of {chunk_size}") return np.array_split(ids, num_chunks) @@ -708,7 +709,7 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> ObsAndXDatum: raise NotImplementedError("IterDataPipe can only be iterated") - def _build_obs_encoders(self, query: soma.ExperimentAxisQuery) -> List[Encoder]: + def _build_obs_encoders(self, query: soma.ExperimentAxisQuery) -> list[Encoder]: pytorch_logger.debug("Initializing encoders") encoders = [] @@ -748,7 +749,7 @@ def stats(self) -> Stats: return self._stats @property - def shape(self) -> Tuple[int, int]: + def shape(self) -> tuple[int, int]: """Get the shape of the data that will be returned by this :class:`cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe`. This is the number of obs (cell) and var (feature) counts in the returned data. If used in multiprocessing mode (i.e.
:class:`torch.utils.data.DataLoader` instantiated with num_workers > 0), the obs (cell) count will reflect diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py index c47ad9f1e..188513c65 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py @@ -1,14 +1,14 @@ from __future__ import annotations import os +from collections.abc import Callable, Sequence from concurrent import futures -from typing import Any, Callable, Sequence, cast +from typing import Any, Literal, cast import numpy as np import pandas as pd import tiledbsoma as soma from somacore.options import SparseDFCoord -from typing_extensions import Literal from ..._experiment import _get_experiment from ..util._eager_iter import _EagerIterator diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py index 2eaf71a7b..13bbe76c5 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py @@ -1,5 +1,3 @@ -from typing import Optional, Tuple - import numba import numpy as np import numpy.typing as npt @@ -43,7 +41,7 @@ def update( self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32], - batch_vec: Optional[npt.NDArray[np.int64]] = None, + batch_vec: npt.NDArray[np.int64] | None = None, ) -> None: if self.n_batches == 1: assert batch_vec is None @@ -54,7 +52,7 @@ def update( def finalize( self, - ) -> Tuple[ + ) -> tuple[ npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64], @@ -125,7 +123,7 @@ def update( self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32], - batch_vec: Optional[npt.NDArray[np.int64]] = None, + batch_vec: npt.NDArray[np.int64] | None = None, ) -> None: if self.n_batches == 1: assert batch_vec is None @@ -147,7 +145,7 @@ def update( self.clip_val, ) - def finalize(self) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: + def finalize(self) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: return self.counts_sum, self.squared_counts_sum @@ -282,7 +280,7 @@ def _mbomv_combine_batches( n_samples: npt.NDArray[np.int64], u: npt.NDArray[np.float64], M2: npt.NDArray[np.float64], -) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: +) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: """Combine all batches using Chan's parallel adaptation of Welford's. Returns tuple of (u, M2). 
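For context on `_mbomv_combine_batches` above: Chan et al.'s parallel adaptation of Welford's algorithm merges per-batch accumulators `(n, u, M2)` without a second pass over the data. A hedged, standalone distillation of the pairwise combine rule; the `combine` function below is illustrative and does not mirror the library's vectorized, numba-compiled signature:

```python
import numpy as np


def combine(n_a: int, u_a: float, m2_a: float, n_b: int, u_b: float, m2_b: float) -> tuple[int, float, float]:
    """Merge two (count, mean, sum-of-squared-deviations) accumulators."""
    n = n_a + n_b
    delta = u_b - u_a
    u = u_a + delta * n_b / n                    # combined mean
    m2 = m2_a + m2_b + delta**2 * n_a * n_b / n  # combined M2
    return n, u, m2


x = np.random.default_rng(0).normal(size=1000)
a, b = x[:400], x[400:]
n, u, m2 = combine(len(a), a.mean(), ((a - a.mean()) ** 2).sum(),
                   len(b), b.mean(), ((b - b.mean()) ** 2).sum())
assert np.isclose(u, x.mean()) and np.isclose(m2 / (n - 1), x.var(ddof=1))
```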
diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py index a5930525a..8cef3e24e 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py @@ -1,7 +1,8 @@ from __future__ import annotations +from collections.abc import Generator from concurrent import futures -from typing import Any, Generator +from typing import Any import numpy as np import numpy.typing as npt diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py index 6ee5db37a..9c229b889 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py @@ -1,9 +1,10 @@ import logging import threading from collections import deque +from collections.abc import Iterator from concurrent import futures from concurrent.futures import Future -from typing import Deque, Iterator, Optional, TypeVar +from typing import TypeVar util_logger = logging.getLogger("cellxgene_census.experimental.util") @@ -14,13 +15,13 @@ class _EagerIterator(Iterator[_T]): def __init__( self, iterator: Iterator[_T], - pool: Optional[futures.Executor] = None, + pool: futures.Executor | None = None, ): super().__init__() self.iterator = iterator self._pool = pool or futures.ThreadPoolExecutor() self._own_pool = pool is None - self._future: Optional[Future[_T]] = None + self._future: Future[_T] | None = None self._begin_next() def _begin_next(self) -> None: @@ -56,14 +57,14 @@ def __init__( self, iterator: Iterator[_T], max_pending: int = 1, - pool: Optional[futures.Executor] = None, + pool: futures.Executor | None = None, ): super().__init__() self.iterator = iterator self.max_pending = max_pending self._pool = pool or futures.ThreadPoolExecutor() self._own_pool = pool is None - self._pending_results: Deque[futures.Future[_T]] = deque() + self._pending_results: deque[futures.Future[_T]] = deque() self._lock = threading.Lock() self._begin_next() diff --git a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py index 3490c4bbd..e76b87508 100644 --- a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py +++ b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py @@ -1,6 +1,5 @@ import pathlib -import sys -from typing import Callable, List, Optional, Sequence, Union +from collections.abc import Callable, Sequence from unittest.mock import patch import numpy as np @@ -50,17 +49,17 @@ def pytorch_seq_x_value_gen(obs_range: range, var_range: range) -> spmatrix: @pytest.fixture -def X_layer_names() -> List[str]: +def X_layer_names() -> list[str]: return ["raw"] @pytest.fixture -def obsp_layer_names() -> Optional[List[str]]: +def obsp_layer_names() -> list[str] | None: return None @pytest.fixture -def varp_layer_names() -> Optional[List[str]]: +def varp_layer_names() -> list[str] | None: return None @@ -102,8 +101,8 @@ def add_sparse_array( @pytest.fixture(scope="function") def soma_experiment( tmp_path: pathlib.Path, - obs_range: Union[int, range], - var_range: Union[int, range], + obs_range: int | range, + var_range: int | range, X_value_gen: Callable[[range, range], sparse.spmatrix], obsp_layer_names: Sequence[str], 
varp_layer_names: Sequence[str], @@ -485,10 +484,6 @@ def test_custom_encoders_fail_if_columns_defined(soma_experiment: Experiment) -> @pytest.mark.experimental -@pytest.mark.skipif( - (sys.version_info.major, sys.version_info.minor) == (3, 9), - reason="fails intermittently with OOM error for 3.9", -) # noinspection PyTestParametrized @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(6, 3, pytorch_x_value_gen)]) def test_multiprocessing__returns_full_result(soma_experiment: Experiment) -> None: @@ -520,11 +515,11 @@ def test_distributed__returns_data_partition_for_rank( """Tests pytorch._partition_obs_joinids() behavior in a simulated PyTorch distributed processing mode, using mocks to avoid having to do real PyTorch distributed setup.""" - with patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, patch( - "cellxgene_census.experimental.ml.pytorch.dist.get_rank" - ) as mock_dist_get_rank, patch( - "cellxgene_census.experimental.ml.pytorch.dist.get_world_size" - ) as mock_dist_get_world_size: + with ( + patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, + patch("cellxgene_census.experimental.ml.pytorch.dist.get_rank") as mock_dist_get_rank, + patch("cellxgene_census.experimental.ml.pytorch.dist.get_world_size") as mock_dist_get_world_size, + ): mock_dist_is_initialized.return_value = True mock_dist_get_rank.return_value = 1 mock_dist_get_world_size.return_value = 3 @@ -556,13 +551,12 @@ def test_distributed_and_multiprocessing__returns_data_partition_for_rank( DataLoader multiprocessing mode, using mocks to avoid having to do distributed pytorch setup or real DataLoader multiprocessing.""" - with patch("torch.utils.data.get_worker_info") as mock_get_worker_info, patch( - "cellxgene_census.experimental.ml.pytorch.dist.is_initialized" - ) as mock_dist_is_initialized, patch( - "cellxgene_census.experimental.ml.pytorch.dist.get_rank" - ) as mock_dist_get_rank, patch( - "cellxgene_census.experimental.ml.pytorch.dist.get_world_size" - ) as mock_dist_get_world_size: + with ( + patch("torch.utils.data.get_worker_info") as mock_get_worker_info, + patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, + patch("cellxgene_census.experimental.ml.pytorch.dist.get_rank") as mock_dist_get_rank, + patch("cellxgene_census.experimental.ml.pytorch.dist.get_world_size") as mock_dist_get_world_size, + ): mock_get_worker_info.return_value = WorkerInfo(id=1, num_workers=2, seed=1234) mock_dist_is_initialized.return_value = True mock_dist_get_rank.return_value = 1 diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_stats.py b/api/python/cellxgene_census/tests/experimental/pp/test_stats.py index 3c113ea07..ecc410d0e 100644 --- a/api/python/cellxgene_census/tests/experimental/pp/test_stats.py +++ b/api/python/cellxgene_census/tests/experimental/pp/test_stats.py @@ -1,4 +1,4 @@ -from typing import Any, Tuple, Union +from typing import Any import numpy as np import numpy.ma as ma @@ -11,7 +11,7 @@ from cellxgene_census.experimental import pp -def var(X: Union[sparse.csc_matrix, sparse.csr_matrix], axis: int = 0, ddof: int = 1) -> Any: +def var(X: sparse.csc_matrix | sparse.csr_matrix, axis: int = 0, ddof: int = 1) -> Any: """ Variance of a sparse matrix calculated as mean(X**2) - mean(X)**2 with Bessel's correction applied for unbiased estimate @@ -52,7 +52,7 @@ def test_mean_variance( calc_mean: bool, calc_variance: bool, small_mem_context: 
soma.SOMATileDBContext, - obs_coords: Tuple[None, slice], + obs_coords: tuple[None, slice], ) -> None: with cellxgene_census.open_soma(census_version="latest", context=small_mem_context) as census: with census["census_data"][experiment_name].axis_query( @@ -119,7 +119,7 @@ def test_mean_variance_nnz_only( calc_mean: bool, calc_variance: bool, small_mem_context: soma.SOMATileDBContext, - obs_coords: Tuple[None, slice], + obs_coords: tuple[None, slice], ) -> None: # Note: since this test requires materializing the matrix in memory to compute the mean/variance, # we're going to use a coord slice based approach. This will ensure the matrix can fit in memory. diff --git a/api/python/cellxgene_census/tests/experimental/test_embeddings_search.py b/api/python/cellxgene_census/tests/experimental/test_embeddings_search.py index b3c0f6f77..a58a31628 100644 --- a/api/python/cellxgene_census/tests/experimental/test_embeddings_search.py +++ b/api/python/cellxgene_census/tests/experimental/test_embeddings_search.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict, List +from typing import Any import anndata as ad import numpy as np @@ -15,7 +15,7 @@ @pytest.mark.experimental @pytest.mark.live_corpus -def test_embeddings_search(true_neighbors: Dict[str, Any], query_result: NeighborObs) -> None: +def test_embeddings_search(true_neighbors: dict[str, Any], query_result: NeighborObs) -> None: # check result shapes rslt = query_result assert isinstance(rslt.neighbor_ids, np.ndarray) @@ -96,7 +96,7 @@ def test_predict_obs_metadata(query_anndata: ad.AnnData, query_result: NeighborO @pytest.fixture(scope="module") -def true_neighbors() -> Dict[int, List[Dict[str, Any]]]: +def true_neighbors() -> dict[int, list[dict[str, Any]]]: ans = {} for line in TRUE_NEAREST_NEIGHBORS_JSON.strip().split("\n"): example = json.loads(line) @@ -105,7 +105,7 @@ def true_neighbors() -> Dict[int, List[Dict[str, Any]]]: @pytest.fixture(scope="module") -def query_anndata(true_neighbors: Dict[str, Any]) -> ad.AnnData: +def query_anndata(true_neighbors: dict[str, Any]) -> ad.AnnData: with cellxgene_census.open_soma(census_version=TRUE_NEAREST_NEIGHBORS_CENSUS_VERSION) as census: return cellxgene_census.get_anndata( census, diff --git a/api/python/cellxgene_census/tests/test_acceptance.py b/api/python/cellxgene_census/tests/test_acceptance.py index d4587e03e..bd01b840b 100644 --- a/api/python/cellxgene_census/tests/test_acceptance.py +++ b/api/python/cellxgene_census/tests/test_acceptance.py @@ -11,7 +11,8 @@ See README.md for historical data. """ -from typing import Any, Dict, Iterator, Optional, Tuple +from collections.abc import Iterator +from typing import Any import pyarrow as pa import pytest @@ -21,7 +22,7 @@ from cellxgene_census._open import DEFAULT_TILEDB_CONFIGURATION -def make_context(census_version: str, config: Optional[Dict[str, Any]] = None) -> soma.SOMATileDBContext: +def make_context(census_version: str, config: dict[str, Any] | None = None) -> soma.SOMATileDBContext: config = config or {} version = cellxgene_census.get_census_version_description(census_version) s3_region = version["soma"].get("s3_region", "us-west-2") @@ -51,7 +52,7 @@ def test_load_axes(organism: str) -> None: del var_df -def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: Optional[int] = 2) -> bool: +def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: int | None = 2) -> bool: """ Utility that verifies that the value is an iterator of pa.Table. 
@@ -78,7 +79,7 @@ def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: Optional[int] = 2 pytest.param(None, DEFAULT_TILEDB_CONFIGURATION, marks=pytest.mark.expensive), ], ) -def test_incremental_read_obs(organism: str, stop_after: Optional[int], ctx_config: Optional[Dict[str, Any]]) -> None: +def test_incremental_read_obs(organism: str, stop_after: int | None, ctx_config: dict[str, Any] | None) -> None: """Verify that obs, var and X[raw] can be read incrementally, i.e., in chunks""" # ctx_config=None open census with a small (default) TileDB buffer size, which reduces @@ -101,7 +102,7 @@ def test_incremental_read_obs(organism: str, stop_after: Optional[int], ctx_conf pytest.param(None, DEFAULT_TILEDB_CONFIGURATION, marks=pytest.mark.expensive), ], ) -def test_incremental_read_var(organism: str, stop_after: Optional[int], ctx_config: Optional[Dict[str, Any]]) -> None: +def test_incremental_read_var(organism: str, stop_after: int | None, ctx_config: dict[str, Any] | None) -> None: """Verify that var can be read incrementally, i.e., in chunks""" # ctx_config=None open census with a small (default) TileDB buffer size, which reduces @@ -143,9 +144,9 @@ def test_incremental_read_var(organism: str, stop_after: Optional[int], ctx_conf ) def test_incremental_read_X( organism: str, - stop_after: Optional[int], - ctx_config: Optional[Dict[str, Any]], - coords: Optional[Tuple[slice, slice]], + stop_after: int | None, + ctx_config: dict[str, Any] | None, + coords: tuple[slice, slice] | None, ) -> None: """Verify that obs, var and X[raw] can be read incrementally, i.e., in chunks""" @@ -165,7 +166,7 @@ def test_incremental_read_X( ["tissue=='aorta'", pytest.param("tissue=='brain'", marks=pytest.mark.expensive)], ) @pytest.mark.parametrize("stop_after", [2, pytest.param(None, marks=pytest.mark.expensive)]) -def test_incremental_query(organism: str, obs_value_filter: str, stop_after: Optional[int]) -> None: +def test_incremental_query(organism: str, obs_value_filter: str, stop_after: int | None) -> None: """Verify incremental read of query result.""" # use default TileDB configuration with cellxgene_census.open_soma(census_version="latest") as census: @@ -260,9 +261,9 @@ def test_incremental_query(organism: str, obs_value_filter: str, stop_after: Opt ) def test_get_anndata( organism: str, - obs_value_filter: Optional[str], - obs_coords: Optional[slice], - ctx_config: Optional[Dict[str, Any]], + obs_value_filter: str | None, + obs_coords: slice | None, + ctx_config: dict[str, Any] | None, ) -> None: """Verify query and read into AnnData""" ctx_config = ctx_config or {} diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index fb3375f97..f3161cca8 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Literal +from typing import Any, Literal import numpy as np import pandas as pd @@ -83,7 +83,7 @@ def test_get_anndata_x_layer(census: soma.Collection, layer: str) -> None: @pytest.mark.live_corpus @pytest.mark.parametrize("layers", [["raw", "normalized"], ["normalized", "raw"]]) -def test_get_anndata_two_layers(census: soma.Collection, layers: List[str]) -> None: +def test_get_anndata_two_layers(census: soma.Collection, layers: list[str]) -> None: ad_primary_layer_in_X = cellxgene_census.get_anndata( census, organism="Homo sapiens", @@ -165,7 +165,7 @@ def test_get_anndata_obsm_one_layer(dec_lts_census: 
soma.Collection, obsm_layer: @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layers", [["scvi", "geneformer"]]) -def test_get_anndata_obsm_two_layers(dec_lts_census: soma.Collection, obsm_layers: List[str]) -> None: +def test_get_anndata_obsm_two_layers(dec_lts_census: soma.Collection, obsm_layers: list[str]) -> None: # NOTE: This test only works on the 2023-12-15 LTS Census, since in newer releases # the embeddings aren't distributed via the `obsm_layer` parameter. ad = cellxgene_census.get_anndata( @@ -184,8 +184,10 @@ def test_get_anndata_obsm_two_layers(dec_lts_census: soma.Collection, obsm_layer @pytest.mark.live_corpus -@pytest.mark.parametrize("obs_embeddings", [["scvi", "scgpt"]]) -def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: List[str]) -> None: +@pytest.mark.parametrize("obs_embeddings", [["scvi", "geneformer"]]) +def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: list[str]) -> None: + # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, + # so this test could require adjustments. ad = cellxgene_census.get_anndata( lts_census, organism="Homo sapiens", @@ -204,7 +206,7 @@ def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: @pytest.mark.live_corpus @pytest.mark.parametrize("var_embeddings", [["nmf"]]) -def test_get_anndata_var_embeddings(dec_lts_census: soma.Collection, var_embeddings: List[str]) -> None: +def test_get_anndata_var_embeddings(dec_lts_census: soma.Collection, var_embeddings: list[str]) -> None: # NOTE: this test only works on the 2023-12-15 LTS Census, since var embeddings # aren't available in the newer releases. @@ -299,7 +301,7 @@ def test_deprecated_column_api(census: soma.Collection) -> None: pd.testing.assert_frame_equal(ad_curr.var, ad_prev.var) -def _map_to_get_anndata_args(query: Dict[str, Any], axis: Literal["obs", "var"]) -> Dict[str, Any]: +def _map_to_get_anndata_args(query: dict[str, Any], axis: Literal["obs", "var"]) -> dict[str, Any]: """Helper to map arguments of get_obs/ get_var to get_anndata.""" result = {} if "coords" in query: @@ -332,7 +334,7 @@ def _map_to_get_anndata_args(query: Dict[str, Any], axis: Literal["obs", "var"]) pytest.param({"value_filter": "tissue_general == 'vasculature'"}, id="value_filter"), ], ) -def test_get_obs(lts_census: soma.Collection, query: Dict[str, Any]) -> None: +def test_get_obs(lts_census: soma.Collection, query: dict[str, Any]) -> None: adata_obs = cellxgene_census.get_anndata( lts_census, organism="Mus musculus", **_map_to_get_anndata_args(query, "obs") ).obs @@ -358,7 +360,7 @@ def test_get_obs(lts_census: soma.Collection, query: Dict[str, Any]) -> None: pytest.param({"value_filter": "feature_name in ['Gm53058', '0610010K14Rik']"}, id="value_filter"), ], ) -def test_get_var(lts_census: soma.Collection, query: Dict[str, Any]) -> None: +def test_get_var(lts_census: soma.Collection, query: dict[str, Any]) -> None: adata_var = cellxgene_census.get_anndata( lts_census, organism="Mus musculus", obs_coords=slice(0), **_map_to_get_anndata_args(query, "var") ).var diff --git a/api/python/cellxgene_census/tests/test_lts_compat.py b/api/python/cellxgene_census/tests/test_lts_compat.py index dbe646cdd..2c486d541 100644 --- a/api/python/cellxgene_census/tests/test_lts_compat.py +++ b/api/python/cellxgene_census/tests/test_lts_compat.py @@ -9,7 +9,8 @@ from __future__ import annotations from collections import deque -from typing import Iterator, Literal, Sequence, Union, get_args 
+from collections.abc import Iterator, Sequence +from typing import Literal, TypeAlias, get_args import pyarrow as pa import pytest @@ -27,14 +28,9 @@ ] CollectionTypeNames = ["SOMACollection", "SOMAExperiment", "SOMAMeasurement"] -SOMATypes = Union[ - soma.Collection, - soma.DataFrame, - soma.SparseNDArray, - soma.DenseNDArray, - soma.Experiment, - soma.Measurement, -] +SOMATypes: TypeAlias = ( + soma.Collection | soma.DataFrame | soma.SparseNDArray | soma.DenseNDArray | soma.Experiment | soma.Measurement +) def walk_census( diff --git a/api/python/cellxgene_census/tests/test_open.py b/api/python/cellxgene_census/tests/test_open.py index df20b3337..5945ea9e4 100644 --- a/api/python/cellxgene_census/tests/test_open.py +++ b/api/python/cellxgene_census/tests/test_open.py @@ -442,8 +442,8 @@ def test_opening_census_without_anon_access_fails_with_bogus_creds() -> None: os.environ["AWS_SECRET_ACCESS_KEY"] = "fake_key" # Passing an empty context with pytest.raises( - tiledb.TileDBError, - match=r"The AWS Access Key Id you provided does not exist in our records", + (tiledb.TileDBError, soma.DoesNotExistError), + match=r"does not exist", ): cellxgene_census.open_soma(census_version="latest", context=soma.SOMATileDBContext()) diff --git a/api/python/cellxgene_census/tests/test_user_agent.py b/api/python/cellxgene_census/tests/test_user_agent.py index dc410df9a..41612c649 100644 --- a/api/python/cellxgene_census/tests/test_user_agent.py +++ b/api/python/cellxgene_census/tests/test_user_agent.py @@ -3,9 +3,10 @@ import json import os +from collections.abc import Callable from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING import numpy as np import proxy diff --git a/api/python/notebooks/README.md b/api/python/notebooks/README.md index 2b1c683ef..cdf89656d 100644 --- a/api/python/notebooks/README.md +++ b/api/python/notebooks/README.md @@ -9,7 +9,7 @@ Demonstration notebooks for the CZ CELLxGENE Discover Census. There are two kind You must be on a Linux or MacOS system, with the following installed: -* Python 3.8 to 3.11 +* Python 3.10 to 3.12 * Jupyter or some other means of running notebooks (e.g., vscode) For now, it is recommended that you do all this on a host with sufficient memory, diff --git a/docs/cellxgene_census_docsite_installation.md b/docs/cellxgene_census_docsite_installation.md index 4654eb37a..0cfbd969b 100644 --- a/docs/cellxgene_census_docsite_installation.md +++ b/docs/cellxgene_census_docsite_installation.md @@ -4,7 +4,7 @@ The Census API requires a Linux or MacOS system with: -- Python 3.8 to Python 3.11. Or R, supported versions TBD. +- Python 3.10 to Python 3.12. Or R, supported versions TBD. - Recommended: >16 GB of memory. - Recommended: >5 Mbps internet connection. - Recommended: for increased performance use the API through a AWS-EC2 instance from the region `us-west-2`. The Census data builds are hosted in a AWS-S3 bucket in that region.
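The documentation hunks above raise the stated requirement to Python 3.10–3.12, matching `requires-python = ">= 3.10, < 3.13"` in pyproject.toml. pip enforces that bound at install time; the guard below is only an illustrative sketch of the same constraint, not code shipped by this package:

```python
import sys

# Mirrors requires-python = ">= 3.10, < 3.13"; illustrative only, since pip
# already refuses to install the package on unsupported interpreters.
if not ((3, 10) <= sys.version_info[:2] < (3, 13)):
    raise RuntimeError(
        f"cellxgene_census requires Python >=3.10,<3.13; "
        f"running {sys.version_info.major}.{sys.version_info.minor}"
    )
```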