chanzuckerberg · ivirshup · May 23, 2024 · May 17, 2024 · May 17, 2024 · May 17, 2024
diff --git a/api/python/cellxgene_census/src/cellxgene_census/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/__init__.py
@@ -21,7 +21,7 @@
 
 from importlib import metadata
 
-from ._get_anndata import get_anndata
+from ._get_anndata import get_anndata, get_obs, get_var
 from ._open import (
     download_source_h5ad,
     get_default_soma_context,
@@ -44,6 +44,8 @@
 __all__ = [
     "download_source_h5ad",
     "get_anndata",
+    "get_obs",
+    "get_var",
     "get_census_version_description",
     "get_census_version_directory",
     "get_census_mirror_directory",

diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
@@ -7,9 +7,10 @@
 Methods to retrieve slices of the census as AnnData objects.
 """
 
-from typing import Optional, Sequence
+from typing import Literal, Optional, Sequence
 
 import anndata
+import pandas as pd
 import tiledbsoma as soma
 from somacore.options import SparseDFCoord
 
@@ -146,3 +147,90 @@ def get_anndata(
                     adata.varm[emb] = embedding
 
         return adata
+
+
+def _get_axis_metadata(
+    census: soma.Collection,
+    axis: Literal["obs", "var"],
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    """Read the ``obs`` or ``var`` dataframe of an organism's experiment into pandas."""
+    experiment = _get_experiment(census, organism)
+    # Only the two literal axis names are accepted; anything else is rejected here.
+    if axis not in ("obs", "var"):
+        raise ValueError(f"axis should be either 'obs' or 'var', but '{axis}' was passed")
+    # ``obs`` comes straight from the experiment, ``var`` from its ``ms["RNA"]`` entry.
+    axis_df = experiment.obs if axis == "obs" else experiment.ms["RNA"].var
+    # ``read`` is given a 1-tuple; a ``None`` selection is replaced by ``slice(None)``.
+    selection = (slice(None),) if coords is None else (coords,)
+    query = axis_df.read(coords=selection, column_names=column_names, value_filter=value_filter)
+    frame: pd.DataFrame = query.concat().to_pandas()
+    return frame
+
+
+def get_obs(
+    census: soma.Collection,
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    """Get the observation metadata for a query on the census.
+
+    Args:
+        census:
+            The census object, usually returned by :func:`open_soma`.
+        organism:
+            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
+        value_filter:
+            Value filter for the ``obs`` metadata. Value is a filter query written in the
+            SOMA ``value_filter`` syntax.
+        coords:
+            Coordinates for the ``obs`` axis, which is indexed by the ``soma_joinid`` value.
+            May be an ``int``, a list of ``int``, or a slice. ``slice(None)`` (the default) or ``None`` selects all.
+        column_names:
+            Columns to fetch.
+
+    Returns:
+        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
+    """
+    return _get_axis_metadata(
+        census, "obs", organism, value_filter=value_filter, coords=coords, column_names=column_names
+    )
+
+
+def get_var(
+    census: soma.Collection,
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    """Get the variable metadata for a query on the census.
+
+    Args:
+        census:
+            The census object, usually returned by :func:`open_soma`.
+        organism:
+            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
+        value_filter:
+            Value filter for the ``var`` metadata. Value is a filter query written in the
+            SOMA ``value_filter`` syntax.
+        coords:
+            Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
+            May be an ``int``, a list of ``int``, or a slice. ``slice(None)`` (the default) or ``None`` selects all.
+        column_names:
+            Columns to fetch.
+
+    Returns:
+        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
+    """
+    return _get_axis_metadata(
+        census, "var", organism, value_filter=value_filter, coords=coords, column_names=column_names
+    )
diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py
@@ -1,6 +1,7 @@
-from typing import List
+from typing import Any, Dict, List, Literal
 
 import numpy as np
+import pandas as pd
 import pytest
 import tiledbsoma as soma
 
@@ -12,7 +13,7 @@ def census() -> soma.Collection:
     return cellxgene_census.open_soma(census_version="latest")
 
 
-@pytest.fixture
+@pytest.fixture(scope="function")
 def lts_census() -> soma.Collection:
     return cellxgene_census.open_soma(census_version="stable")
 
@@ -264,3 +265,73 @@ def test_get_anndata_obsm_layers_and_add_obs_embedding_fails(lts_census: soma.Co
                 obsm_layers=["scvi"],
                 obs_embeddings=["scvi"],
             )
+
+
+def _map_to_get_anndata_args(query: Dict[str, Any], axis: Literal["obs", "var"]) -> Dict[str, Any]:
+    """Translate ``get_obs``/``get_var`` keyword arguments into their ``get_anndata`` equivalents."""
+    # ``coords`` and ``value_filter`` only gain an axis prefix, e.g. ``coords`` -> ``obs_coords``.
+    kwargs: Dict[str, Any] = {
+        f"{axis}_{name}": query[name] for name in ("coords", "value_filter") if name in query
+    }
+    # ``column_names`` keeps its key but is passed on as a mapping keyed by axis.
+    if "column_names" in query:
+        kwargs["column_names"] = {axis: query["column_names"]}
+    return kwargs
+
+
+@pytest.mark.live_corpus
+@pytest.mark.parametrize(
+    "query",
+    [
+        pytest.param(
+            {
+                "coords": slice(100),
+                "column_names": ["soma_joinid", "cell_type", "tissue", "tissue_general", "assay"],
+            },
+            id="coords+column-names",
+        ),
+        pytest.param({"coords": slice(100, 300)}, id="coords"),
+        pytest.param({"value_filter": "tissue_general == 'vasculature'"}, id="value_filter"),
+    ],
+)
+def test_get_obs(lts_census: soma.Collection, query: Dict[str, Any]) -> None:
+    """``get_obs`` must return the same frame as the ``obs`` of an equivalent ``get_anndata`` query."""
+    organism = "Mus musculus"
+    anndata_kwargs = _map_to_get_anndata_args(query, "obs")
+
+    # Reference: run the full AnnData query and keep only its ``obs`` dataframe.
+    expected = cellxgene_census.get_anndata(lts_census, organism=organism, **anndata_kwargs).obs
+
+    # Under test: the metadata-only query.
+    actual = cellxgene_census.get_obs(lts_census, organism, **query)
+    # AnnData instantiation converts the index to string, so do the same here before comparing.
+    actual.index = actual.index.astype(str)
+
+    # Compare with pandas' default ``assert_frame_equal`` checks.
+    pd.testing.assert_frame_equal(expected, actual)
+
+
+@pytest.mark.live_corpus
+@pytest.mark.parametrize(
+    "query",
+    [
+        pytest.param(
+            {"coords": slice(100), "column_names": ["soma_joinid", "feature_id", "feature_name", "feature_length"]},
+            id="coords+column-names",
+        ),
+        pytest.param({"coords": slice(100, 300)}, id="coords"),
+        pytest.param({"value_filter": "feature_name in ['Gm53058', '0610010K14Rik']"}, id="value_filter"),
+    ],
+)
+def test_get_var(lts_census: soma.Collection, query: Dict[str, Any]) -> None:
+    """``get_var`` must return the same frame as the ``var`` of an equivalent ``get_anndata`` query."""
+    mapped = _map_to_get_anndata_args(query, "var")
+    # NOTE(review): ``obs_coords=slice(0)`` presumably just keeps the obs query small — confirm.
+    expected = cellxgene_census.get_anndata(lts_census, organism="Mus musculus", obs_coords=slice(0), **mapped).var
+
+    # Under test: the metadata-only query.
+    actual = cellxgene_census.get_var(lts_census, "Mus musculus", **query)
+    # AnnData instantiation converts the index to string, so do the same here before comparing.
+    actual.index = actual.index.astype(str)
+
+    pd.testing.assert_frame_equal(expected, actual)
diff --git a/docs/python-api.rst b/docs/python-api.rst
@@ -25,6 +25,8 @@ Get slice as AnnData
     :nosignatures:
 
     cellxgene_census.get_anndata
+    cellxgene_census.get_obs
+    cellxgene_census.get_var
 
 Feature presence matrix
 -----------------------