Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python] New embeddings API #1023

Merged
merged 15 commits into from
Apr 1, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
import tiledbsoma as soma


def _get_experiment_name(organism: str) -> str:
"""Given an organism name, return the experiment name."""
# lower/snake case the organism name to find the experiment name
return re.sub(r"[ ]+", "_", organism).lower()


def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment:
"""Given a census :class:`tiledbsoma.Collection`, return the experiment for the named organism.
Organism matching is somewhat flexible, attempting to map from human-friendly
Expand Down Expand Up @@ -39,8 +45,7 @@ def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment:

>>> human = get_experiment(census, "homo_sapiens")
"""
# lower/snake case the organism name to find the experiment name
exp_name = re.sub(r"[ ]+", "_", organism).lower()
exp_name = _get_experiment_name(organism)

if exp_name not in census["census_data"]:
raise ValueError(f"Unknown organism {organism} - does not exist")
Expand Down
65 changes: 62 additions & 3 deletions api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@
import tiledbsoma as soma
from somacore.options import SparseDFCoord

from ._experiment import _get_experiment
from ._experiment import _get_experiment, _get_experiment_name
from ._release_directory import get_census_version_directory
from ._util import _extract_census_version, _uri_join

# Base URI for Census embeddings. `get_anndata` joins it with
# "<census_version>/<embedding id>" to build the URI of each requested embedding.
CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma/"


def get_anndata(
Expand All @@ -22,11 +26,16 @@ def get_anndata(
X_name: str = "raw",
X_layers: Optional[Sequence[str]] = (),
obsm_layers: Optional[Sequence[str]] = (),
ebezzi marked this conversation as resolved.
Show resolved Hide resolved
obsp_layers: Optional[Sequence[str]] = (),
varm_layers: Optional[Sequence[str]] = (),
varp_layers: Optional[Sequence[str]] = (),
obs_value_filter: Optional[str] = None,
obs_coords: Optional[SparseDFCoord] = None,
var_value_filter: Optional[str] = None,
var_coords: Optional[SparseDFCoord] = None,
column_names: Optional[soma.AxisColumnNames] = None,
obs_embeddings: Optional[Sequence[str]] = (),
var_embeddings: Optional[Sequence[str]] = (),
) -> anndata.AnnData:
"""Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query,
and return it as an :class:`anndata.AnnData` object.
Expand Down Expand Up @@ -58,12 +67,26 @@ def get_anndata(
Columns to fetch for ``obs`` and ``var`` dataframes.
obsm_layers:
Additional obsm layers to read and return in the ``obsm`` slot.
obsp_layers:
Additional obsp layers to read and return in the ``obsp`` slot.
varm_layers:
Additional varm layers to read and return in the ``varm`` slot.
varp_layers:
Additional varp layers to read and return in the ``varp`` slot.
obs_embeddings:
Additional embeddings to be returned as part of the ``obsm`` slot.
Use :func:`get_all_available_embeddings` to retrieve available embeddings
for this Census version and organism.
var_embeddings:
Additional embeddings to be returned as part of the ``varm`` slot.
Use :func:`get_all_available_embeddings` to retrieve available embeddings
for this Census version and organism.
ebezzi marked this conversation as resolved.
Show resolved Hide resolved

Returns:
An :class:`anndata.AnnData` object containing the census slice.

Lifecycle:
maturing
experimental
ebezzi marked this conversation as resolved.
Show resolved Hide resolved

Examples:
>>> get_anndata(census, "Mus musculus", obs_value_filter="tissue_general in ['brain', 'lung']")
Expand All @@ -75,14 +98,50 @@ def get_anndata(
exp = _get_experiment(census, organism)
obs_coords = (slice(None),) if obs_coords is None else (obs_coords,)
var_coords = (slice(None),) if var_coords is None else (var_coords,)

if obsm_layers and obs_embeddings and set(obsm_layers) & set(obs_embeddings):
raise ValueError("Cannot request both `obsm_layers` and `obs_embeddings` for the same embedding name")

if varm_layers and var_embeddings and set(varm_layers) & set(var_embeddings):
raise ValueError("Cannot request both `varm_layers` and `var_embeddings` for the same embedding name")

with exp.axis_query(
measurement_name,
obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords),
var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords),
) as query:
return query.to_anndata(
adata = query.to_anndata(
X_name=X_name,
column_names=column_names,
X_layers=X_layers,
obsm_layers=obsm_layers,
varm_layers=varm_layers,
obsp_layers=obsp_layers,
varp_layers=varp_layers,
)

# If obs_embeddings or var_embeddings are defined, inject them in the appropriate slot
if obs_embeddings or var_embeddings:
from .experimental._embedding import _get_embedding, get_embedding_metadata_by_name

census_version = _extract_census_version(census)
experiment_name = _get_experiment_name(organism)
census_directory = get_census_version_directory()

if obs_embeddings:
obs_soma_joinids = query.obs_joinids()
for emb in obs_embeddings:
emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "obs_embedding")
uri = _uri_join(CENSUS_EMBEDDINGS_LOCATION_BASE_URI, f"{census_version}/{emb_metadata['id']}")
embedding = _get_embedding(census, census_directory, census_version, uri, obs_soma_joinids)
adata.obsm[emb] = embedding

if var_embeddings:
var_soma_joinids = query.var_joinids()
for emb in var_embeddings:
emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "var_embedding")
uri = _uri_join(CENSUS_EMBEDDINGS_LOCATION_BASE_URI, f"{census_version}/{emb_metadata['id']}")
embedding = _get_embedding(census, census_directory, census_version, uri, var_soma_joinids)
adata.varm[emb] = embedding

return adata
12 changes: 12 additions & 0 deletions api/python/cellxgene_census/src/cellxgene_census/_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import urllib.parse

import tiledbsoma as soma


def _uri_join(base: str, url: str) -> str:
"""Like urllib.parse.urljoin, but doesn't get confused by s3://."""
Expand All @@ -18,3 +20,13 @@ def _uri_join(base: str, url: str) -> str:
p_url.fragment,
]
return urllib.parse.urlunparse(parts)


def _extract_census_version(census: soma.Collection) -> str:
    """Extract the Census version from the given Census object.

    The version is taken from the path of ``census.uri``: the path is split on
    ``/`` and the element at index 2 is returned. For a path of the form
    ``/<root>/<version>/...`` that element is ``<version>``.

    NOTE(review): this presumes the Census was opened from a URI with that
    layout; for other layouts (e.g. an arbitrary local path) the returned
    component may not be a real Census version -- confirm against callers.

    Args:
        census:
            The Census object; only its ``uri`` attribute is read.

    Returns:
        The version component of the Census URI.

    Raises:
        ValueError: if the URI path has no component at that position, or the
            component is empty.
    """
    path_parts = urllib.parse.urlparse(census.uri).path.split("/")

    # An absent component and an empty one (e.g. a path of "/cell-census/")
    # are both unusable as a version, so they are reported the same way
    # rather than letting "" leak to callers.
    version: str = path_parts[2] if len(path_parts) > 2 else ""
    if not version:
        raise ValueError("Unable to extract Census version.")

    return version
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
"""Experimental API for the CELLxGENE Discover Census."""

from ._embedding import get_embedding, get_embedding_metadata
from ._embedding import (
get_all_available_embeddings,
get_all_census_versions_with_embedding,
get_embedding,
get_embedding_metadata,
get_embedding_metadata_by_name,
)

# Public names of this subpackage; each one is imported from `._embedding` above.
__all__ = [
"get_embedding",
"get_embedding_metadata",
"get_embedding_metadata_by_name",
"get_all_available_embeddings",
"get_all_census_versions_with_embedding",
]
Loading
Loading