From 8f3299b3e6af8ba10f87fdca125d7fa203c74bcb Mon Sep 17 00:00:00 2001
From: "Philipp A."
Date: Fri, 30 Aug 2024 12:44:16 +0200
Subject: [PATCH] IO for nullable string arrays (#1558)

Co-authored-by: Ilan Gold
---
 .azure-pipelines.yml               |   1 +
 docs/release-notes/1558.feature.md |   3 +
 pyproject.toml                     |   3 +
 src/anndata/_io/specs/methods.py   | 124 ++++++++++++------
 src/anndata/_io/zarr.py            |   1 +
 src/anndata/_settings.py           |   8 ++
 src/anndata/tests/helpers.py       | 177 ++++++++++++++++---------
 src/testing/anndata/_pytest.py     |   6 +-
 tests/test_concatenate.py          |   6 +-
 tests/test_helpers.py              |  15 +++
 tests/test_io_elementwise.py       |  30 ++++-
 11 files changed, 280 insertions(+), 94 deletions(-)
 create mode 100644 docs/release-notes/1558.feature.md

diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml
index 0d9496423..23518455d 100644
--- a/.azure-pipelines.yml
+++ b/.azure-pipelines.yml
@@ -88,6 +88,7 @@ jobs:
       inputs:
         codeCoverageTool: Cobertura
         summaryFileLocation: "test-data/coverage.xml"
+        failIfCoverageEmpty: true
     condition: eq(variables['TEST_TYPE'], 'coverage')

   - task: PublishTestResults@2
diff --git a/docs/release-notes/1558.feature.md b/docs/release-notes/1558.feature.md
new file mode 100644
index 000000000..e90783f0c
--- /dev/null
+++ b/docs/release-notes/1558.feature.md
@@ -0,0 +1,3 @@
+Read and write support for nullable string arrays ({class}`pandas.arrays.StringArray`).
+Use the pandas {doc}`option <pandas:user_guide/options>` `mode.string_storage` to control which storage mode is used when reading `dtype="string"` columns.
+{user}`flying-sheep`
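Before the implementation, a minimal sketch of the round trip this patch enables. It assumes the opt-in setting added in `_settings.py` below; the file name and column are illustrative:

    import anndata as ad
    import numpy as np
    import pandas as pd

    # Writing pd.arrays.StringArray is opt-in (new setting, see _settings.py below).
    ad.settings.allow_write_nullable_strings = True

    obs = pd.DataFrame(
        {"batch": pd.array(["a", pd.NA, "b"], dtype="string")},  # nullable strings
        index=[f"cell{i}" for i in range(3)],
    )
    adata = ad.AnnData(X=np.zeros((3, 2), dtype=np.float32), obs=obs)
    adata.write_h5ad("demo.h5ad")  # stored with encoding-type "nullable-string-array"

    # On read, pandas' mode.string_storage option decides the resulting storage mode.
    with pd.option_context("mode.string_storage", "python"):
        roundtripped = ad.read_h5ad("demo.h5ad")
    assert roundtripped.obs["batch"].isna().iloc[1]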
diff --git a/pyproject.toml b/pyproject.toml
index 08612d387..99909ad7e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -115,8 +115,11 @@ version-file = "src/anndata/_version.py"
 packages = ["src/anndata", "src/testing"]

 [tool.coverage.run]
+data_file = "test-data/coverage"
 source_pkgs = ["anndata"]
 omit = ["src/anndata/_version.py", "**/test_*.py"]
+[tool.coverage.xml]
+output = "test-data/coverage.xml"

 [tool.coverage.paths]
 source = ["./src", "**/site-packages"]
diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index 794d60437..1da0476b9 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import warnings
 from collections.abc import Mapping
 from functools import partial
 from itertools import product
@@ -10,6 +11,7 @@
 import h5py
 import numpy as np
 import pandas as pd
+from packaging.version import Version
 from scipy import sparse

 import anndata as ad
@@ -37,13 +39,16 @@
     _require_group_write_dataframe,
 )

+from ..._settings import settings
 from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial

 if TYPE_CHECKING:
+    from collections.abc import Callable
     from os import PathLike
     from typing import Any, Literal

     from numpy import typing as npt
+    from numpy.typing import NDArray

     from anndata._types import (
         ArrayStorageType,
@@ -506,10 +511,12 @@ def write_vlen_string_array_zarr(
 ):
     import numcodecs

-    # Workaround for https://github.com/zarr-developers/numcodecs/issues/514
-    # TODO: Warn to upgrade numcodecs if fixed
-    if not elem.flags.writeable:
-        elem = elem.copy()
+    if Version(numcodecs.__version__) < Version("0.13"):
+        msg = "Old numcodecs version detected. Please update for improved performance and stability."
+        warnings.warn(msg)
+        # Workaround for https://github.com/zarr-developers/numcodecs/issues/514
+        if hasattr(elem, "flags") and not elem.flags.writeable:
+            elem = elem.copy()

     f.create_dataset(
         k,
@@ -1014,44 +1021,85 @@ def read_partial_categorical(elem, *, items=None, indices=(slice(None),)):
 @_REGISTRY.register_write(
     ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0")
 )
-def write_nullable_integer(
+@_REGISTRY.register_write(
+    H5Group, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
+)
+@_REGISTRY.register_write(
+    ZarrGroup, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
+)
+def write_nullable(
     f: GroupStorageType,
     k: str,
-    v: pd.arrays.IntegerArray | pd.arrays.BooleanArray,
+    v: pd.arrays.IntegerArray | pd.arrays.BooleanArray | pd.arrays.StringArray,
     *,
     _writer: Writer,
     dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ):
+    if (
+        isinstance(v, pd.arrays.StringArray)
+        and not settings.allow_write_nullable_strings
+    ):
+        msg = (
+            "`anndata.settings.allow_write_nullable_strings` is False, "
+            "because writing of `pd.arrays.StringArray` is new "
+            "and not supported in anndata < 0.11, which is still used by many people. "
+            "Opt in to writing these arrays by setting it to True."
+        )
+        raise RuntimeError(msg)
     g = f.require_group(k)
-    if v._mask is not None:
-        _writer.write_elem(g, "mask", v._mask, dataset_kwargs=dataset_kwargs)
-    _writer.write_elem(g, "values", v._data, dataset_kwargs=dataset_kwargs)
+    values = (
+        v.to_numpy(na_value="")
+        if isinstance(v, pd.arrays.StringArray)
+        else v.to_numpy(na_value=0, dtype=v.dtype.numpy_dtype)
+    )
+    _writer.write_elem(g, "values", values, dataset_kwargs=dataset_kwargs)
+    _writer.write_elem(g, "mask", v.isna(), dataset_kwargs=dataset_kwargs)


-@_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))
-@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))
-def read_nullable_integer(
-    elem: GroupStorageType, *, _reader: Reader
+def _read_nullable(
+    elem: GroupStorageType,
+    *,
+    _reader: Reader,
+    # BaseMaskedArray
+    array_type: Callable[
+        [NDArray[np.number], NDArray[np.bool_]], pd.api.extensions.ExtensionArray
+    ],
 ) -> pd.api.extensions.ExtensionArray:
-    if "mask" in elem:
-        return pd.arrays.IntegerArray(
-            _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"])
-        )
-    else:
-        return pd.array(_reader.read_elem(elem["values"]))
+    return array_type(
+        _reader.read_elem(elem["values"]),
+        mask=_reader.read_elem(elem["mask"]),
+    )


-@_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))
-@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))
-def read_nullable_boolean(
-    elem: GroupStorageType, *, _reader: Reader
+def _string_array(
+    values: np.ndarray, mask: np.ndarray
 ) -> pd.api.extensions.ExtensionArray:
-    if "mask" in elem:
-        return pd.arrays.BooleanArray(
-            _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"])
-        )
-    else:
-        return pd.array(_reader.read_elem(elem["values"]))
+    """Construct a string array from values and mask."""
+    arr = pd.array(values, dtype="string")
+    arr[mask] = pd.NA
+    return arr
+
+
+_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))(
+    read_nullable_integer := partial(_read_nullable, array_type=pd.arrays.IntegerArray)
+)
+_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))(
+    read_nullable_integer
+)
+
+_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))(
+    read_nullable_boolean := partial(_read_nullable, array_type=pd.arrays.BooleanArray)
+)
+_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))(
+    read_nullable_boolean
+)
+
+_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))(
+    read_nullable_string := partial(_read_nullable, array_type=_string_array)
+)
+_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))(
+    read_nullable_string
+)

 ###########
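The on-disk result of `write_nullable` above is a small group holding `values` and `mask`. A sketch of inspecting it with plain h5py, assuming the `demo.h5ad` written in the earlier sketch:

    import h5py

    with h5py.File("demo.h5ad", "r") as f:
        g = f["obs/batch"]
        print(g.attrs["encoding-type"])     # "nullable-string-array"
        print(g.attrs["encoding-version"])  # "0.1.0"
        values = g["values"][:]  # vlen strings; "" where the value was pd.NA
        mask = g["mask"][:]      # booleans; True where the value is missing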
@@ -1091,17 +1139,19 @@ def write_hdf5_scalar(
     f.create_dataset(key, data=np.array(value), **dataset_kwargs)


-# fmt: off
 for numeric_scalar_type in [
-    bool, np.bool_,
-    np.uint8, np.uint16, np.uint32, np.uint64,
-    int, np.int8, np.int16, np.int32, np.int64,
-    float, *np.floating.__subclasses__(),
+    *(bool, np.bool_),
+    *(np.uint8, np.uint16, np.uint32, np.uint64),
+    *(int, np.int8, np.int16, np.int32, np.int64),
+    *(float, *np.floating.__subclasses__()),
     *np.complexfloating.__subclasses__(),
 ]:
-    _REGISTRY.register_write(H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0"))(write_hdf5_scalar)
-    _REGISTRY.register_write(ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0"))(write_scalar)
-# fmt: on
+    _REGISTRY.register_write(
+        H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0")
+    )(write_hdf5_scalar)
+    _REGISTRY.register_write(
+        ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0")
+    )(write_scalar)

 _REGISTRY.register_write(ZarrGroup, str, IOSpec("string", "0.2.0"))(write_scalar)
 _REGISTRY.register_write(ZarrGroup, np.str_, IOSpec("string", "0.2.0"))(write_scalar)
diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py
index 4cab3ea8d..2564738ad 100644
--- a/src/anndata/_io/zarr.py
+++ b/src/anndata/_io/zarr.py
@@ -103,6 +103,7 @@ def callback(func, elem_name: str, elem, iospec):
 @report_read_key_on_error
 def read_dataset(dataset: zarr.Array):
+    """Legacy method for reading datasets without encoding_type."""
     value = dataset[...]
     if not hasattr(value, "dtype"):
         return value
diff --git a/src/anndata/_settings.py b/src/anndata/_settings.py
index 860eb0616..f8ba1be3b 100644
--- a/src/anndata/_settings.py
+++ b/src/anndata/_settings.py
@@ -420,5 +420,13 @@ def validate_bool(val) -> None:
     get_from_env=check_and_get_bool,
 )

+settings.register(
+    "allow_write_nullable_strings",
+    default_value=False,
+    description="Whether or not to allow writing of `pd.arrays.StringArray`.",
+    validate=validate_bool,
+    get_from_env=check_and_get_bool,
+)
+
 ##################################################################################
 ##################################################################################
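Besides assigning the attribute directly, the settings manager registered above also supports scoped overrides and resets (the autouse fixture later in this patch relies on `reset`). A sketch, reusing the `adata` from the first example; the store names are illustrative:

    import anndata as ad

    # Scoped opt-in: reverts automatically when the block exits.
    with ad.settings.override(allow_write_nullable_strings=True):
        adata.write_zarr("demo.zarr")

    # Global opt-in, restored to the registered default afterwards.
    ad.settings.allow_write_nullable_strings = True
    adata.write_zarr("demo2.zarr")
    ad.settings.reset("allow_write_nullable_strings")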
diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py
index 185808b8d..54571b418 100644
--- a/src/anndata/tests/helpers.py
+++ b/src/anndata/tests/helpers.py
@@ -36,7 +36,19 @@
 if TYPE_CHECKING:
     from collections.abc import Collection
-    from typing import Literal
+    from typing import Callable, Literal, TypeGuard, TypeVar
+
+    DT = TypeVar("DT")
+
+
+try:
+    from pandas.core.arrays.integer import IntegerDtype
+except ImportError:
+    IntegerDtype = (
+        *(pd.Int8Dtype, pd.Int16Dtype, pd.Int32Dtype, pd.Int64Dtype),
+        *(pd.UInt8Dtype, pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype),
+    )
+
+# Give this to gen_adata when dask array support is expected.
 GEN_ADATA_DASK_ARGS = dict(
@@ -45,30 +57,43 @@
         np.ndarray,
         pd.DataFrame,
         DaskArray,
+        *((sparse.csr_array,) if CAN_USE_SPARSE_ARRAY else ()),
     ),
     varm_types=(
         sparse.csr_matrix,
         np.ndarray,
         pd.DataFrame,
         DaskArray,
+        *((sparse.csr_array,) if CAN_USE_SPARSE_ARRAY else ()),
     ),
     layers_types=(
         sparse.csr_matrix,
         np.ndarray,
         pd.DataFrame,
         DaskArray,
+        *((sparse.csr_array,) if CAN_USE_SPARSE_ARRAY else ()),
     ),
 )
-if CAN_USE_SPARSE_ARRAY:
-    GEN_ADATA_DASK_ARGS["obsm_types"] = GEN_ADATA_DASK_ARGS["obsm_types"] + (
-        sparse.csr_array,
-    )
-    GEN_ADATA_DASK_ARGS["varm_types"] = GEN_ADATA_DASK_ARGS["varm_types"] + (
-        sparse.csr_array,
-    )
-    GEN_ADATA_DASK_ARGS["layers_types"] = GEN_ADATA_DASK_ARGS["layers_types"] + (
-        sparse.csr_array,
-    )
+
+
+DEFAULT_KEY_TYPES = (
+    sparse.csr_matrix,
+    np.ndarray,
+    pd.DataFrame,
+    *((sparse.csr_array,) if CAN_USE_SPARSE_ARRAY else ()),
+)
+
+
+DEFAULT_COL_TYPES = (
+    pd.CategoricalDtype(ordered=False),
+    pd.CategoricalDtype(ordered=True),
+    np.int64,
+    np.float64,
+    np.uint8,
+    np.bool_,
+    pd.BooleanDtype,
+    pd.Int32Dtype,
+)


 def gen_vstr_recarray(m, n, dtype=None):
@@ -82,30 +107,82 @@ def gen_vstr_recarray(m, n, dtype=None):
     )


-def gen_typed_df(n, index=None):
-    # TODO: Think about allowing index to be passed for n
-    letters = np.fromiter(iter(ascii_letters), "U1")
-    if n > len(letters):
-        letters = letters[: n // 2]  # Make sure categories are repeated
-    return pd.DataFrame(
-        {
-            "cat": pd.Categorical(np.random.choice(letters, n)),
-            "cat_ordered": pd.Categorical(np.random.choice(letters, n), ordered=True),
-            "int64": np.random.randint(-50, 50, n),
-            "float64": np.random.random(n),
-            "uint8": np.random.randint(255, size=n, dtype="uint8"),
-            "bool": np.random.randint(0, 2, size=n, dtype=bool),
-            "nullable-bool": pd.arrays.BooleanArray(
+def issubdtype(
+    a: np.dtype | pd.api.extensions.ExtensionDtype | type,
+    b: type[DT] | tuple[type[DT], ...],
+) -> TypeGuard[DT]:
+    if isinstance(b, tuple):
+        return any(issubdtype(a, t) for t in b)
+    if isinstance(a, type) and issubclass(a, pd.api.extensions.ExtensionDtype):
+        return issubclass(a, b)
+    if isinstance(a, pd.api.extensions.ExtensionDtype):
+        return isinstance(a, b)
+    try:
+        return np.issubdtype(a, b)
+    except TypeError:  # pragma: no cover
+        pytest.fail(f"issubdtype can’t handle everything yet: {a} {b}")
+
+
+def gen_random_column(
+    n: int, dtype: np.dtype | pd.api.extensions.ExtensionDtype
+) -> tuple[str, np.ndarray | pd.api.extensions.ExtensionArray]:
+    if issubdtype(dtype, pd.CategoricalDtype):
+        # TODO: Think about allowing index to be passed for n
+        letters = np.fromiter(iter(ascii_letters), "U1")
+        if n > len(letters):
+            letters = letters[: n // 2]  # Make sure categories are repeated
+        key = "cat" if dtype.ordered else "cat_unordered"
+        return key, pd.Categorical(np.random.choice(letters, n), dtype=dtype)
+    if issubdtype(dtype, pd.BooleanDtype):
+        return (
+            "nullable-bool",
+            pd.arrays.BooleanArray(
                 np.random.randint(0, 2, size=n, dtype=bool),
                 mask=np.random.randint(0, 2, size=n, dtype=bool),
             ),
-            "nullable-int": pd.arrays.IntegerArray(
+        )
+    if issubdtype(dtype, IntegerDtype):
+        return (
+            "nullable-int",
+            pd.arrays.IntegerArray(
                 np.random.randint(0, 1000, size=n, dtype=np.int32),
                 mask=np.random.randint(0, 2, size=n, dtype=bool),
             ),
-        },
-        index=index,
-    )
+        )
+    if issubdtype(dtype, pd.StringDtype):
+        letters = np.fromiter(iter(ascii_letters), "U1")
+        array = pd.array(np.random.choice(letters, n), dtype="string")
+        array[np.random.randint(0, 2, size=n, dtype=bool)] = pd.NA
+        return "string", array
+    # if issubdtype(dtype, pd.DatetimeTZDtype):
+    #     return "datetime", pd.to_datetime(np.random.randint(0, 1000, size=n))
+    if issubdtype(dtype, np.bool_):
+        return "bool", np.random.randint(0, 2, size=n, dtype=dtype)
+
+    if not issubdtype(dtype, np.number):  # pragma: no cover
+        pytest.fail(f"Unexpected dtype: {dtype}")
+
+    n_bits = 8 * (dtype().itemsize if isinstance(dtype, type) else dtype.itemsize)
+
+    if issubdtype(dtype, np.unsignedinteger):
+        return f"uint{n_bits}", np.random.randint(0, 255, n, dtype=dtype)
+    if issubdtype(dtype, np.signedinteger):
+        return f"int{n_bits}", np.random.randint(-50, 50, n, dtype=dtype)
+    if issubdtype(dtype, np.floating):
+        return f"float{n_bits}", np.random.random(n).astype(dtype)
+
+    pytest.fail(f"Unexpected numeric dtype: {dtype}")  # pragma: no cover
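To make the generator's contract concrete, a sketch of calling it directly (random contents vary; the returned key doubles as the column name in `gen_typed_df`):

    import pandas as pd

    # dtypes may be instances or classes; issubdtype() above accepts both.
    key, col = gen_random_column(8, pd.StringDtype())
    # key == "string"; col is a nullable string array with some pd.NA entries

    key, col = gen_random_column(8, pd.CategoricalDtype(ordered=True))
    # key == "cat"; col is an ordered pd.Categorical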
+
+
+def gen_typed_df(
+    n: int,
+    index: pd.Index[str] | None = None,
+    dtypes: Collection[np.dtype | pd.api.extensions.ExtensionDtype] = DEFAULT_COL_TYPES,
+):
+    columns = [gen_random_column(n, dtype) for dtype in dtypes]
+    col_names = [n for n, _ in columns]
+    assert len(col_names) == len(set(col_names)), "Duplicate column names generated!"
+    return pd.DataFrame(dict(columns), index=index)


 def _gen_awkward_inner(shape, rng, dtype):
@@ -182,20 +259,11 @@ def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame:
     return df


-default_key_types = (
-    sparse.csr_matrix,
-    np.ndarray,
-    pd.DataFrame,
-)
-if CAN_USE_SPARSE_ARRAY:
-    default_key_types = default_key_types + (sparse.csr_array,)
-
-
 def maybe_add_sparse_array(
     mapping: Mapping,
     types: Collection[type],
     format: Literal["csr", "csc"],
-    random_state: int,
+    random_state: np.random.Generator,
     shape: tuple[int, int],
 ):
     if CAN_USE_SPARSE_ARRAY:
@@ -209,15 +277,20 @@ def maybe_add_sparse_array(
 # TODO: Use hypothesis for this?
 def gen_adata(
     shape: tuple[int, int],
-    X_type=sparse.csr_matrix,
-    X_dtype=np.float32,
-    # obs_dtypes,
-    # var_dtypes,
-    obsm_types: Collection[type] = default_key_types + (AwkArray,),
-    varm_types: Collection[type] = default_key_types + (AwkArray,),
-    layers_types: Collection[type] = default_key_types,
-    random_state=None,
-    sparse_fmt: str = "csr",
+    X_type: Callable[[np.ndarray], object] = sparse.csr_matrix,
+    *,
+    X_dtype: np.dtype = np.float32,
+    obs_dtypes: Collection[
+        np.dtype | pd.api.extensions.ExtensionDtype
+    ] = DEFAULT_COL_TYPES,
+    var_dtypes: Collection[
+        np.dtype | pd.api.extensions.ExtensionDtype
+    ] = DEFAULT_COL_TYPES,
+    obsm_types: Collection[type] = DEFAULT_KEY_TYPES + (AwkArray,),
+    varm_types: Collection[type] = DEFAULT_KEY_TYPES + (AwkArray,),
+    layers_types: Collection[type] = DEFAULT_KEY_TYPES,
+    random_state: np.random.Generator | None = None,
+    sparse_fmt: Literal["csr", "csc"] = "csr",
 ) -> AnnData:
     """\
     Helper function to generate a random AnnData for testing purposes.
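A sketch of how a test opts its obs/var columns into nullable strings via the reworked keyword-only signature (the dtype tuples and seed are illustrative):

    import numpy as np
    import pandas as pd

    adata = gen_adata(
        (10, 20),
        obs_dtypes=(*DEFAULT_COL_TYPES, pd.StringDtype),
        var_dtypes=(np.float64, pd.StringDtype),
        random_state=np.random.default_rng(0),
    )
    assert adata.obs["string"].dtype == "string"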
@@ -253,8 +326,8 @@ def gen_adata(
     M, N = shape
     obs_names = pd.Index(f"cell{i}" for i in range(shape[0]))
     var_names = pd.Index(f"gene{i}" for i in range(shape[1]))
-    obs = gen_typed_df(M, obs_names)
-    var = gen_typed_df(N, var_names)
+    obs = gen_typed_df(M, obs_names, dtypes=obs_dtypes)
+    var = gen_typed_df(N, var_names, dtypes=var_dtypes)
     # For #147
     obs.rename(columns=dict(cat="obs_cat"), inplace=True)
     var.rename(columns=dict(cat="var_cat"), inplace=True)
@@ -267,7 +340,7 @@ def gen_adata(
     obsm = dict(
         array=np.random.random((M, 50)),
         sparse=sparse.random(M, 100, format=sparse_fmt, random_state=random_state),
-        df=gen_typed_df(M, obs_names),
+        df=gen_typed_df(M, obs_names, dtypes=obs_dtypes),
         awk_2d_ragged=gen_awkward((M, None)),
         da=da.random.random((M, 50)),
     )
@@ -282,7 +355,7 @@ def gen_adata(
     varm = dict(
         array=np.random.random((N, 50)),
         sparse=sparse.random(N, 100, format=sparse_fmt, random_state=random_state),
-        df=gen_typed_df(N, var_names),
+        df=gen_typed_df(N, var_names, dtypes=var_dtypes),
         awk_2d_ragged=gen_awkward((N, None)),
         da=da.random.random((N, 50)),
     )
diff --git a/src/testing/anndata/_pytest.py b/src/testing/anndata/_pytest.py
index d29ac334e..8eaa7c7bd 100644
--- a/src/testing/anndata/_pytest.py
+++ b/src/testing/anndata/_pytest.py
@@ -32,10 +32,14 @@ def pytest_configure(config: pytest.Config) -> None:


 @pytest.fixture(autouse=True)
-def _suppress_env_for_doctests(request: pytest.FixtureRequest) -> None:
+def _anndata_test_env(request: pytest.FixtureRequest) -> None:
+    import anndata
+
     if isinstance(request.node, pytest.DoctestItem):
         request.getfixturevalue("_doctest_env")

+    anndata.settings.reset(anndata.settings._registered_options.keys())
+

 @pytest.fixture
 def _doctest_env(
diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py
index 03126284b..075b5cfe6 100644
--- a/tests/test_concatenate.py
+++ b/tests/test_concatenate.py
@@ -26,6 +26,7 @@
     BASE_MATRIX_PARAMS,
     CUPY_MATRIX_PARAMS,
     DASK_MATRIX_PARAMS,
+    DEFAULT_COL_TYPES,
     GEN_ADATA_DASK_ARGS,
     as_dense_dask_array,
     assert_equal,
@@ -1375,8 +1376,9 @@ def test_concat_size_0_axis(axis_name, join_type, merge_strategy, shape):
     """Regression test for https://github.com/scverse/anndata/issues/526"""
     axis, axis_name = merge._resolve_axis(axis_name)
     alt_axis = 1 - axis
-    a = gen_adata((5, 7))
-    b = gen_adata(shape)
+    col_dtypes = (*DEFAULT_COL_TYPES, pd.StringDtype)
+    a = gen_adata((5, 7), obs_dtypes=col_dtypes, var_dtypes=col_dtypes)
+    b = gen_adata(shape, obs_dtypes=col_dtypes, var_dtypes=col_dtypes)

     expected_size = expected_shape(a, b, axis=axis, join=join_type)
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index adf5a7dce..4645fedd5 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -18,6 +18,7 @@
     BASE_MATRIX_PARAMS,
     CUPY_MATRIX_PARAMS,
     DASK_MATRIX_PARAMS,
+    DEFAULT_COL_TYPES,
     as_cupy,
     as_cupy_sparse_dask_array,
     as_dense_cupy_dask_array,
@@ -26,6 +27,8 @@
     assert_equal,
     gen_adata,
     gen_awkward,
+    gen_random_column,
+    issubdtype,
     report_name,
 )
 from anndata.utils import axis_len
@@ -89,6 +92,18 @@ def test_gen_awkward(shape, datashape):
     assert arr.type == arr_type


+@pytest.mark.parametrize("dtype", [*DEFAULT_COL_TYPES, pd.StringDtype])
+def test_gen_random_column(dtype):
+    _, col = gen_random_column(10, dtype)
+    assert len(col) == 10
+    # CategoricalDtype is currently the only dtype passed as instances
+    if isinstance(dtype, pd.CategoricalDtype):
+        assert issubdtype(col.dtype, pd.CategoricalDtype)
+        assert col.dtype.ordered == dtype.ordered
+    else:
+        assert issubdtype(col.dtype, dtype)
+
+
 # Does this work for every warning?
 def test_report_name():
     def raise_error():
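Before the element-wise tests: the new registry entries are exercised through `write_elem`/`read_elem`. A sketch against a zarr group via the public experimental API (the store path is illustrative):

    import pandas as pd
    import zarr

    import anndata as ad
    from anndata.experimental import read_elem, write_elem

    ad.settings.allow_write_nullable_strings = True
    g = zarr.open_group("demo-elem.zarr", mode="w")
    write_elem(g, "strings", pd.array(["hello", pd.NA], dtype="string"))
    assert g["strings"].attrs["encoding-type"] == "nullable-string-array"
    assert read_elem(g["strings"])[1] is pd.NA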
diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index ed89dbe7f..91d97d6c9 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -138,15 +138,34 @@ def create_sparse_store(
             id="sp_mat_csc",
         ),
         pytest.param(pd.DataFrame({"a": [1, 2, 3]}), "dataframe", id="pd_df"),
-        pytest.param(pd.Categorical(list("aabccedd")), "categorical", id="pd_cat"),
+        pytest.param(
+            pd.Categorical(list("aabccedd") + [pd.NA]),
+            "categorical",
+            id="pd_cat_np_str",
+        ),
         pytest.param(
             pd.Categorical(list("aabccedd"), ordered=True),
             "categorical",
-            id="pd_cat_ord",
+            id="pd_cat_np_str_ord",
+        ),
+        pytest.param(
+            pd.array(list("aabccedd") + [pd.NA], dtype="string").astype("category"),
+            "categorical",
+            id="pd_cat_pd_str",
         ),
         pytest.param(
             pd.Categorical([1, 2, 1, 3], ordered=True), "categorical", id="pd_cat_num"
         ),
+        pytest.param(
+            pd.array(["hello", "world"], dtype="string"),
+            "nullable-string-array",
+            id="pd_arr_str",
+        ),
+        pytest.param(
+            pd.array(["hello", "world", pd.NA], dtype="string"),
+            "nullable-string-array",
+            id="pd_arr_str_mask",
+        ),
         pytest.param(
             pd.arrays.IntegerArray(
                 np.ones(5, dtype=int), mask=np.array([True, False, True, False, True])
@@ -175,6 +194,8 @@ def create_sparse_store(
     ],
 )
 def test_io_spec(store, value, encoding_type):
+    ad.settings.allow_write_nullable_strings = True
+
     key = f"key_for_{encoding_type}"
     write_elem(store, key, value, dataset_kwargs={})
@@ -410,6 +431,11 @@ def test_write_io_error(store, obj):
     assert re.search(full_pattern, msg)


+def test_write_nullable_string_error(store):
+    with pytest.raises(RuntimeError, match=r"allow_write_nullable_strings.*is False"):
+        write_elem(store, "/el", pd.array([""], dtype="string"))
+
+
 def test_categorical_order_type(store):
     # https://github.com/scverse/anndata/issues/853
     cat = pd.Categorical([0, 1], ordered=True)
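Finally, the failure mode pinned down by `test_write_nullable_string_error`, sketched outside pytest in a fresh session with default settings (file and column names illustrative):

    import numpy as np
    import pandas as pd

    import anndata as ad

    adata = ad.AnnData(
        X=np.zeros((1, 1), dtype=np.float32),
        obs=pd.DataFrame({"s": pd.array(["x"], dtype="string")}, index=["cell0"]),
    )
    try:
        adata.write_h5ad("fails.h5ad")  # default: allow_write_nullable_strings=False
    except RuntimeError as e:
        assert "allow_write_nullable_strings" in str(e)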