From 85cef506031bf6d6be645e677a286a6a60fbc494 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 23 Sep 2024 23:25:20 -0700 Subject: [PATCH 1/8] refactor validation into separate module, perf improvements from not iterating over every element of array, rename `array` to `value` universally in serialized json lol --- .gitignore | 4 +- docs/api/validation/dtype.md | 7 +++ docs/api/validation/index.md | 6 +++ docs/api/{ => validation}/shape.md | 2 +- docs/index.md | 2 +- src/numpydantic/__init__.py | 2 +- src/numpydantic/dtype.py | 3 ++ src/numpydantic/interface/dask.py | 6 +-- src/numpydantic/interface/interface.py | 39 ++++++++-------- src/numpydantic/interface/numpy.py | 6 +-- src/numpydantic/interface/zarr.py | 6 +-- src/numpydantic/ndarray.py | 14 ++++-- src/numpydantic/schema.py | 11 +++-- src/numpydantic/serialization.py | 51 +++++++++++++++------ src/numpydantic/validation/__init__.py | 11 +++++ src/numpydantic/validation/dtype.py | 55 +++++++++++++++++++++++ src/numpydantic/{ => validation}/shape.py | 4 +- tests/conftest.py | 22 +++++++++ tests/test_interface/test_zarr.py | 2 +- 19 files changed, 196 insertions(+), 57 deletions(-) create mode 100644 docs/api/validation/dtype.md create mode 100644 docs/api/validation/index.md rename docs/api/{ => validation}/shape.md (57%) create mode 100644 src/numpydantic/validation/__init__.py create mode 100644 src/numpydantic/validation/dtype.py rename src/numpydantic/{ => validation}/shape.py (98%) diff --git a/.gitignore b/.gitignore index 3b0fc39..1470b8c 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,6 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ .pdm-python -ndarray.pyi \ No newline at end of file +ndarray.pyi + +prof/ \ No newline at end of file diff --git a/docs/api/validation/dtype.md b/docs/api/validation/dtype.md new file mode 100644 index 0000000..fd599ca --- /dev/null +++ b/docs/api/validation/dtype.md @@ -0,0 +1,7 @@ +# dtype + +```{eval-rst} +.. automodule:: numpydantic.validation.dtype + :members: + :undoc-members: +``` \ No newline at end of file diff --git a/docs/api/validation/index.md b/docs/api/validation/index.md new file mode 100644 index 0000000..d0f5a11 --- /dev/null +++ b/docs/api/validation/index.md @@ -0,0 +1,6 @@ +# validation + +```{toctree} +dtype +shape +``` \ No newline at end of file diff --git a/docs/api/shape.md b/docs/api/validation/shape.md similarity index 57% rename from docs/api/shape.md rename to docs/api/validation/shape.md index 4c8638c..60a7a19 100644 --- a/docs/api/shape.md +++ b/docs/api/validation/shape.md @@ -1,7 +1,7 @@ # shape ```{eval-rst} -.. automodule:: numpydantic.shape +.. 
automodule:: numpydantic.validation.shape :members: :undoc-members: ``` \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 0880a59..9caaaf7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -484,13 +484,13 @@ interfaces api/index api/interface/index +api/validation/index api/dtype api/ndarray api/maps api/meta api/schema api/serialization -api/shape api/types ``` diff --git a/src/numpydantic/__init__.py b/src/numpydantic/__init__.py index 803d8d2..98b8bd1 100644 --- a/src/numpydantic/__init__.py +++ b/src/numpydantic/__init__.py @@ -4,7 +4,7 @@ from numpydantic.ndarray import NDArray from numpydantic.meta import update_ndarray_stub -from numpydantic.shape import Shape +from numpydantic.validation.shape import Shape update_ndarray_stub() diff --git a/src/numpydantic/dtype.py b/src/numpydantic/dtype.py index 12d766a..84b28cc 100644 --- a/src/numpydantic/dtype.py +++ b/src/numpydantic/dtype.py @@ -12,6 +12,9 @@ Some types like `Integer` are compound types - tuples of multiple dtypes. Check these using ``in`` rather than ``==``. This interface will develop in future versions to allow a single dtype check. + +For internal helper functions for validating dtype, +see :mod:`numpydantic.validation.dtype` """ import sys diff --git a/src/numpydantic/interface/dask.py b/src/numpydantic/interface/dask.py index cd36a65..95d0619 100644 --- a/src/numpydantic/interface/dask.py +++ b/src/numpydantic/interface/dask.py @@ -33,11 +33,11 @@ class DaskJsonDict(JsonDict): name: str chunks: Iterable[tuple[int, ...]] dtype: str - array: list + value: list def to_array_input(self) -> DaskArray: """Construct a dask array""" - np_array = np.array(self.array, dtype=self.dtype) + np_array = np.array(self.value, dtype=self.dtype) array = from_array( np_array, name=self.name, @@ -100,7 +100,7 @@ def to_json( if info.round_trip: as_json = DaskJsonDict( type=cls.name, - array=as_json, + value=as_json, name=array.name, chunks=array.chunks, dtype=str(np_array.dtype), diff --git a/src/numpydantic/interface/interface.py b/src/numpydantic/interface/interface.py index bee85b6..42bb891 100644 --- a/src/numpydantic/interface/interface.py +++ b/src/numpydantic/interface/interface.py @@ -20,8 +20,8 @@ ShapeError, TooManyMatchesError, ) -from numpydantic.shape import check_shape from numpydantic.types import DtypeType, NDArrayType, ShapeType +from numpydantic.validation import validate_dtype, validate_shape T = TypeVar("T", bound=NDArrayType) U = TypeVar("U", bound="JsonDict") @@ -76,6 +76,21 @@ def match_by_name(self) -> Optional[Type["Interface"]]: class JsonDict(BaseModel): """ Representation of array when dumped with round_trip == True. + + .. admonition:: Developer's Note + + Any JsonDict that contains an actual array should be named ``value`` + rather than array (or any other name), and nothing but the + array data should be named ``value`` . + + During JSON serialization, it becomes ambiguous what contains an array + of data vs. an array of metadata. For the moment we would like to + reserve the ability to have lists of metadata, so until we rule that out, + we would like to be able to avoid iterating over every element of an array + in any context parameter transformation like relativizing/absolutizing paths. + To avoid that, it's good to agree on a single value name -- ``value`` -- + and avoid using it for anything else. + """ type: str @@ -274,25 +289,7 @@ def validate_dtype(self, dtype: DtypeType) -> bool: Validate the dtype of the given array, returning ``True`` if valid, ``False`` if not. 
""" - if self.dtype is Any: - return True - - if isinstance(self.dtype, tuple): - valid = dtype in self.dtype - elif self.dtype is np.str_: - valid = getattr(dtype, "type", None) in (np.str_, str) or dtype in ( - np.str_, - str, - ) - else: - # try to match as any subclass, if self.dtype is a class - try: - valid = issubclass(dtype, self.dtype) - except TypeError: - # expected, if dtype or self.dtype is not a class - valid = dtype == self.dtype - - return valid + return validate_dtype(dtype, self.dtype) def raise_for_dtype(self, valid: bool, dtype: DtypeType) -> None: """ @@ -326,7 +323,7 @@ def validate_shape(self, shape: Tuple[int, ...]) -> bool: if self.shape is Any: return True - return check_shape(shape, self.shape) + return validate_shape(shape, self.shape) def raise_for_shape(self, valid: bool, shape: Tuple[int, ...]) -> None: """ diff --git a/src/numpydantic/interface/numpy.py b/src/numpydantic/interface/numpy.py index ad97474..6c84232 100644 --- a/src/numpydantic/interface/numpy.py +++ b/src/numpydantic/interface/numpy.py @@ -27,13 +27,13 @@ class NumpyJsonDict(JsonDict): type: Literal["numpy"] dtype: str - array: list + value: list def to_array_input(self) -> ndarray: """ Construct a numpy array """ - return np.array(self.array, dtype=self.dtype) + return np.array(self.value, dtype=self.dtype) class NumpyInterface(Interface): @@ -99,6 +99,6 @@ def to_json( if info.round_trip: json_array = NumpyJsonDict( - type=cls.name, dtype=str(array.dtype), array=json_array + type=cls.name, dtype=str(array.dtype), value=json_array ) return json_array diff --git a/src/numpydantic/interface/zarr.py b/src/numpydantic/interface/zarr.py index 41cad03..5dc647e 100644 --- a/src/numpydantic/interface/zarr.py +++ b/src/numpydantic/interface/zarr.py @@ -63,7 +63,7 @@ class ZarrJsonDict(JsonDict): type: Literal["zarr"] file: Optional[str] = None path: Optional[str] = None - array: Optional[list] = None + value: Optional[list] = None def to_array_input(self) -> Union[ZarrArray, ZarrArrayPath]: """ @@ -73,7 +73,7 @@ def to_array_input(self) -> Union[ZarrArray, ZarrArrayPath]: if self.file: array = ZarrArrayPath(file=self.file, path=self.path) else: - array = zarr.array(self.array) + array = zarr.array(self.value) return array @@ -202,7 +202,7 @@ def to_json( as_json["info"]["hexdigest"] = array.hexdigest() if dump_array or not is_file: - as_json["array"] = array[:].tolist() + as_json["value"] = array[:].tolist() as_json = ZarrJsonDict(**as_json) else: diff --git a/src/numpydantic/ndarray.py b/src/numpydantic/ndarray.py index fb81f69..6969d44 100644 --- a/src/numpydantic/ndarray.py +++ b/src/numpydantic/ndarray.py @@ -13,7 +13,7 @@ """ -from typing import TYPE_CHECKING, Any, Tuple +from typing import TYPE_CHECKING, Any, Literal, Tuple, get_origin import numpy as np from pydantic import GetJsonSchemaHandler @@ -29,6 +29,7 @@ ) from numpydantic.serialization import jsonize_array from numpydantic.types import DtypeType, NDArrayType, ShapeType +from numpydantic.validation.dtype import is_union from numpydantic.vendor.nptyping.error import InvalidArgumentsError from numpydantic.vendor.nptyping.ndarray import NDArrayMeta as _NDArrayMeta from numpydantic.vendor.nptyping.nptyping_type import NPTypingType @@ -86,11 +87,18 @@ def __instancecheck__(self, instance: Any): except InterfaceError: return False + def _is_literal_like(cls, item: Any) -> bool: + """ + Changes from nptyping: + - doesn't just ducktype for literal but actually, yno, checks for being literal + """ + return get_origin(item) is Literal + def 
_get_shape(cls, dtype_candidate: Any) -> "Shape": """ Override of base method to use our local definition of shape """ - from numpydantic.shape import Shape + from numpydantic.validation.shape import Shape if dtype_candidate is Any or dtype_candidate is Shape: shape = Any @@ -120,7 +128,7 @@ def _get_dtype(cls, dtype_candidate: Any) -> DType: if dtype_candidate is Any: dtype = Any - elif is_dtype: + elif is_dtype or is_union(dtype_candidate): dtype = dtype_candidate elif issubclass(dtype_candidate, Structure): # pragma: no cover dtype = dtype_candidate diff --git a/src/numpydantic/schema.py b/src/numpydantic/schema.py index cafa1f4..bfea3aa 100644 --- a/src/numpydantic/schema.py +++ b/src/numpydantic/schema.py @@ -5,7 +5,7 @@ import hashlib import json -from typing import TYPE_CHECKING, Any, Callable, Optional +from typing import TYPE_CHECKING, Any, Callable, Optional, get_args import numpy as np from pydantic import BaseModel @@ -16,6 +16,7 @@ from numpydantic.interface import Interface from numpydantic.maps import np_to_python from numpydantic.types import DtypeType, NDArrayType, ShapeType +from numpydantic.validation.dtype import is_union from numpydantic.vendor.nptyping.structure import StructureMeta if TYPE_CHECKING: # pragma: no cover @@ -51,7 +52,6 @@ def _lol_dtype( """Get the innermost dtype schema to use in the generated pydantic schema""" if isinstance(dtype, StructureMeta): # pragma: no cover raise NotImplementedError("Structured dtypes are currently unsupported") - if isinstance(dtype, tuple): # if it's a meta-type that refers to a generic float/int, just make that if dtype in (dt.Float, dt.Number): @@ -66,7 +66,10 @@ def _lol_dtype( array_type = core_schema.union_schema( [_lol_dtype(t, _handler) for t in types_] ) - + elif is_union(dtype): + array_type = core_schema.union_schema( + [_lol_dtype(t, _handler) for t in get_args(dtype)] + ) else: try: python_type = np_to_python[dtype] @@ -110,7 +113,7 @@ def list_of_lists_schema(shape: "Shape", array_type: CoreSchema) -> ListSchema: array_type ( :class:`pydantic_core.CoreSchema` ): The pre-rendered pydantic core schema to use in the innermost list entry """ - from numpydantic.shape import _is_range + from numpydantic.validation.shape import _is_range shape_parts = [part.strip() for part in shape.__args__[0].split(",")] # labels, if present diff --git a/src/numpydantic/serialization.py b/src/numpydantic/serialization.py index 1f1edd0..07924eb 100644 --- a/src/numpydantic/serialization.py +++ b/src/numpydantic/serialization.py @@ -4,7 +4,7 @@ """ from pathlib import Path -from typing import Any, Callable, TypeVar, Union +from typing import Any, Callable, Iterable, TypeVar, Union from pydantic_core.core_schema import SerializationInfo @@ -16,6 +16,9 @@ def jsonize_array(value: Any, info: SerializationInfo) -> Union[list, dict]: """Use an interface class to render an array as JSON""" + # perf: keys to skip in generation - anything named "value" is array data. 
+ skip = ["value"] + interface_cls = Interface.match_output(value) array = interface_cls.to_json(value, info) if isinstance(array, JsonDict): @@ -25,19 +28,37 @@ def jsonize_array(value: Any, info: SerializationInfo) -> Union[list, dict]: if info.context.get("mark_interface", False): array = interface_cls.mark_json(array) + if isinstance(array, list): + return array + + # ---- Perf Barrier ------------------------------------------------------ + # put context args intended to **wrap** the array above + # put context args intended to **modify** the array below + # + # above, we assume that a list is **data** not to be modified. + # below, we must mark whenever the data is in the line of fire + # to avoid an expensive iteration. + if info.context.get("absolute_paths", False): - array = _absolutize_paths(array) + array = _absolutize_paths(array, skip) else: relative_to = info.context.get("relative_to", ".") - array = _relativize_paths(array, relative_to) + array = _relativize_paths(array, relative_to, skip) else: - # relativize paths by default - array = _relativize_paths(array, ".") + if isinstance(array, list): + return array + + # ---- Perf Barrier ------------------------------------------------------ + # same as above, ensure any keys that contain array values are skipped right now + + array = _relativize_paths(array, ".", skip) return array -def _relativize_paths(value: dict, relative_to: str = ".") -> dict: +def _relativize_paths( + value: dict, relative_to: str = ".", skip: Iterable = tuple() +) -> dict: """ Make paths relative to either the current directory or the provided ``relative_to`` directory, if provided in the context @@ -46,6 +67,8 @@ def _relativize_paths(value: dict, relative_to: str = ".") -> dict: # pdb.set_trace() def _r_path(v: Any) -> Any: + if not isinstance(v, (str, Path)): + return v try: path = Path(v) if not path.exists(): @@ -54,10 +77,10 @@ def _r_path(v: Any) -> Any: except (TypeError, ValueError): return v - return _walk_and_apply(value, _r_path) + return _walk_and_apply(value, _r_path, skip) -def _absolutize_paths(value: dict) -> dict: +def _absolutize_paths(value: dict, skip: Iterable = tuple()) -> dict: def _a_path(v: Any) -> Any: try: path = Path(v) @@ -67,23 +90,25 @@ def _a_path(v: Any) -> Any: except (TypeError, ValueError): return v - return _walk_and_apply(value, _a_path) + return _walk_and_apply(value, _a_path, skip) -def _walk_and_apply(value: T, f: Callable[[U], U]) -> T: +def _walk_and_apply(value: T, f: Callable[[U], U], skip: Iterable = tuple()) -> T: """ Walk an object, applying a function """ if isinstance(value, dict): for k, v in value.items(): + if k in skip: + continue if isinstance(v, dict): - _walk_and_apply(v, f) + _walk_and_apply(v, f, skip) elif isinstance(v, list): - value[k] = [_walk_and_apply(sub_v, f) for sub_v in v] + value[k] = [_walk_and_apply(sub_v, f, skip) for sub_v in v] else: value[k] = f(v) elif isinstance(value, list): - value = [_walk_and_apply(v, f) for v in value] + value = [_walk_and_apply(v, f, skip) for v in value] else: value = f(value) return value diff --git a/src/numpydantic/validation/__init__.py b/src/numpydantic/validation/__init__.py new file mode 100644 index 0000000..73d7374 --- /dev/null +++ b/src/numpydantic/validation/__init__.py @@ -0,0 +1,11 @@ +""" +Helper functions for validation +""" + +from numpydantic.validation.dtype import validate_dtype +from numpydantic.validation.shape import validate_shape + +__all__ = [ + "validate_dtype", + "validate_shape", +] diff --git 
a/src/numpydantic/validation/dtype.py b/src/numpydantic/validation/dtype.py new file mode 100644 index 0000000..bc7723c --- /dev/null +++ b/src/numpydantic/validation/dtype.py @@ -0,0 +1,55 @@ +""" +Helper functions for validation of dtype. + +For literal dtypes intended for use by end-users, see :mod:`numpydantic.dtype` +""" + +from types import UnionType +from typing import Any, Union, get_args, get_origin + +import numpy as np + +from numpydantic.types import DtypeType + + +def validate_dtype(dtype: Any, target: DtypeType) -> bool: + """ + Validate a dtype against the target dtype + + Args: + dtype: The dtype to validate + target (:class:`.DtypeType`): The target dtype + + Returns: + bool: ``True`` if valid, ``False`` otherwise + """ + if target is Any: + return True + + if isinstance(target, tuple): + valid = dtype in target + elif is_union(target): + valid = any( + [validate_dtype(dtype, target_dt) for target_dt in get_args(target)] + ) + elif target is np.str_: + valid = getattr(dtype, "type", None) in (np.str_, str) or dtype in ( + np.str_, + str, + ) + else: + # try to match as any subclass, if target is a class + try: + valid = issubclass(dtype, target) + except TypeError: + # expected, if dtype or target is not a class + valid = dtype == target + + return valid + + +def is_union(dtype: DtypeType) -> bool: + """ + Check if a dtype is a union + """ + return get_origin(dtype) in (Union, UnionType) diff --git a/src/numpydantic/shape.py b/src/numpydantic/validation/shape.py similarity index 98% rename from src/numpydantic/shape.py rename to src/numpydantic/validation/shape.py index 62a567f..e899ecd 100644 --- a/src/numpydantic/shape.py +++ b/src/numpydantic/validation/shape.py @@ -2,7 +2,7 @@ Declaration and validation functions for array shapes. Mostly a mildly modified version of nptyping's -:func:`npytping.shape_expression.check_shape` +:func:`npytping.shape_expression.validate_shape` and its internals to allow for extended syntax, including ranges of shapes. Modifications from nptyping: @@ -105,7 +105,7 @@ def validate_shape_expression(shape_expression: Union[ShapeExpression, Any]) -> @lru_cache -def check_shape(shape: ShapeTuple, target: "Shape") -> bool: +def validate_shape(shape: ShapeTuple, target: "Shape") -> bool: """ Check whether the given shape corresponds to the given shape_expression. :param shape: the shape in question. 
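
The new `numpydantic.validation` package exposes `validate_dtype` and `validate_shape` as its public helpers; a minimal sketch of the expected behavior (illustrative only, not part of the patch — it mirrors the union test cases added to `tests/conftest.py` below):

```python
# Illustrative sketch (not part of the patch): exercising the helpers
# exported by the new numpydantic.validation package.
from typing import Union

import numpy as np

from numpydantic import Shape
from numpydantic.validation import validate_dtype, validate_shape
from numpydantic.validation.dtype import is_union

# tuple targets are checked by membership, e.g. the generic dtypes in numpydantic.dtype
assert validate_dtype(np.float32, (np.float32, np.float64))

# union targets are unwrapped with get_args() and checked recursively
assert is_union(Union[np.uint32, np.float32])
assert validate_dtype(np.uint32, Union[np.uint32, np.float32])
assert not validate_dtype(np.float64, Union[np.uint32, np.float32])

# validate_shape is the renamed check_shape: a concrete shape tuple
# checked against a Shape expression
assert validate_shape((2, 3, 4), Shape["*, *, *"])
assert not validate_shape((2, 3), Shape["*, *, *"])
```
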
diff --git a/tests/conftest.py b/tests/conftest.py index c9035f4..076e95d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -80,6 +80,8 @@ class SubClass(BasicModel): FLOAT: TypeAlias = NDArray[Shape["*, *, *"], Float] STRING: TypeAlias = NDArray[Shape["*, *, *"], str] MODEL: TypeAlias = NDArray[Shape["*, *, *"], BasicModel] +UNION_PIPE: TypeAlias = NDArray[Shape["*, *, *"], np.uint32 | np.float32] +UNION_TYPE: TypeAlias = NDArray[Shape["*, *, *"], Union[np.uint32, np.float32]] @pytest.fixture( @@ -147,6 +149,16 @@ def shape_cases(request) -> ValidationCase: ValidationCase(annotation=MODEL, dtype=BadModel, passes=False), ValidationCase(annotation=MODEL, dtype=int, passes=False), ValidationCase(annotation=MODEL, dtype=SubClass, passes=True), + ValidationCase(annotation=UNION_PIPE, dtype=np.uint32, passes=True), + ValidationCase(annotation=UNION_PIPE, dtype=np.float32, passes=True), + ValidationCase(annotation=UNION_PIPE, dtype=np.uint64, passes=False), + ValidationCase(annotation=UNION_PIPE, dtype=np.float64, passes=False), + ValidationCase(annotation=UNION_PIPE, dtype=str, passes=False), + ValidationCase(annotation=UNION_TYPE, dtype=np.uint32, passes=True), + ValidationCase(annotation=UNION_TYPE, dtype=np.float32, passes=True), + ValidationCase(annotation=UNION_TYPE, dtype=np.uint64, passes=False), + ValidationCase(annotation=UNION_TYPE, dtype=np.float64, passes=False), + ValidationCase(annotation=UNION_TYPE, dtype=str, passes=False), ], ids=[ "float", @@ -174,6 +186,16 @@ def shape_cases(request) -> ValidationCase: "model-badmodel", "model-int", "model-subclass", + "union-pipe-uint32", + "union-pipe-float32", + "union-pipe-uint64", + "union-pipe-float64", + "union-pipe-str", + "union-type-uint32", + "union-type-float32", + "union-type-uint64", + "union-type-float64", + "union-type-str", ], ) def dtype_cases(request) -> ValidationCase: diff --git a/tests/test_interface/test_zarr.py b/tests/test_interface/test_zarr.py index ed5c252..6b21b20 100644 --- a/tests/test_interface/test_zarr.py +++ b/tests/test_interface/test_zarr.py @@ -151,7 +151,7 @@ def test_zarr_to_json(store, model_blank, roundtrip, dump_array): if roundtrip: if dump_array: - assert as_json["array"] == lol_array + assert as_json["value"] == lol_array else: if as_json.get("file", False): assert "array" not in as_json From e63d9268b1bb78f86a9426d9b3d3e66c7aa211f6 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 23 Sep 2024 23:30:40 -0700 Subject: [PATCH 2/8] python 3.9 compat --- src/numpydantic/validation/dtype.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/numpydantic/validation/dtype.py b/src/numpydantic/validation/dtype.py index bc7723c..d930fc0 100644 --- a/src/numpydantic/validation/dtype.py +++ b/src/numpydantic/validation/dtype.py @@ -3,14 +3,18 @@ For literal dtypes intended for use by end-users, see :mod:`numpydantic.dtype` """ - -from types import UnionType +import sys from typing import Any, Union, get_args, get_origin import numpy as np from numpydantic.types import DtypeType +if sys.version_info >= (3, 10): + from types import UnionType +else: + UnionType = None + def validate_dtype(dtype: Any, target: DtypeType) -> bool: """ @@ -52,4 +56,7 @@ def is_union(dtype: DtypeType) -> bool: """ Check if a dtype is a union """ - return get_origin(dtype) in (Union, UnionType) + if UnionType is None: + return get_origin(dtype) is Union + else: + return get_origin(dtype) in (Union, UnionType) From 46020b16d2dd57d39c2e6587121c7c6616b3ef67 Mon Sep 17 00:00:00 2001 From: 
sneakers-the-rat Date: Mon, 23 Sep 2024 23:32:39 -0700 Subject: [PATCH 3/8] need to set minimum python version in ruff lol --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0e6926b..7a4e9ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -126,7 +126,7 @@ markers = [ ] [tool.ruff] -target-version = "py311" +target-version = "py39" include = ["src/numpydantic/**/*.py", "pyproject.toml"] exclude = ["tests"] From e334232ac484ee5c3d5403cdca9b00d4511f102c Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 23 Sep 2024 23:33:20 -0700 Subject: [PATCH 4/8] lint --- src/numpydantic/validation/dtype.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/numpydantic/validation/dtype.py b/src/numpydantic/validation/dtype.py index d930fc0..5eeb124 100644 --- a/src/numpydantic/validation/dtype.py +++ b/src/numpydantic/validation/dtype.py @@ -3,6 +3,7 @@ For literal dtypes intended for use by end-users, see :mod:`numpydantic.dtype` """ + import sys from typing import Any, Union, get_args, get_origin From 402bf09cf1c0706a1714e55ce053a97f3998eecd Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 23 Sep 2024 23:42:00 -0700 Subject: [PATCH 5/8] python 39 compat --- tests/conftest.py | 181 +++++++++++++++++++++++++--------------------- 1 file changed, 99 insertions(+), 82 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 076e95d..96f8a7e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,10 +3,6 @@ import pytest from typing import Any, Tuple, Union, Type -if sys.version_info.minor >= 10: - from typing import TypeAlias -else: - from typing_extensions import TypeAlias from pydantic import BaseModel, computed_field, ConfigDict from numpydantic import NDArray, Shape from numpydantic.ndarray import NDArrayMeta @@ -15,6 +11,15 @@ from tests.fixtures import * +if sys.version_info.minor >= 10: + from typing import TypeAlias + + YES_PIPE = True +else: + from typing_extensions import TypeAlias + + YES_PIPE = False + def pytest_addoption(parser): parser.addoption( @@ -80,8 +85,9 @@ class SubClass(BasicModel): FLOAT: TypeAlias = NDArray[Shape["*, *, *"], Float] STRING: TypeAlias = NDArray[Shape["*, *, *"], str] MODEL: TypeAlias = NDArray[Shape["*, *, *"], BasicModel] -UNION_PIPE: TypeAlias = NDArray[Shape["*, *, *"], np.uint32 | np.float32] UNION_TYPE: TypeAlias = NDArray[Shape["*, *, *"], Union[np.uint32, np.float32]] +if YES_PIPE: + UNION_PIPE: TypeAlias = NDArray[Shape["*, *, *"], np.uint32 | np.float32] @pytest.fixture( @@ -121,82 +127,93 @@ def shape_cases(request) -> ValidationCase: return request.param -@pytest.fixture( - scope="module", - params=[ - ValidationCase(dtype=float, passes=True), - ValidationCase(dtype=int, passes=False), - ValidationCase(dtype=np.uint8, passes=False), - ValidationCase(annotation=NUMBER, dtype=int, passes=True), - ValidationCase(annotation=NUMBER, dtype=float, passes=True), - ValidationCase(annotation=NUMBER, dtype=np.uint8, passes=True), - ValidationCase(annotation=NUMBER, dtype=np.float16, passes=True), - ValidationCase(annotation=NUMBER, dtype=str, passes=False), - ValidationCase(annotation=INTEGER, dtype=int, passes=True), - ValidationCase(annotation=INTEGER, dtype=np.uint8, passes=True), - ValidationCase(annotation=INTEGER, dtype=float, passes=False), - ValidationCase(annotation=INTEGER, dtype=np.float32, passes=False), - ValidationCase(annotation=INTEGER, dtype=str, passes=False), - ValidationCase(annotation=FLOAT, dtype=float, passes=True), - 
ValidationCase(annotation=FLOAT, dtype=np.float32, passes=True), - ValidationCase(annotation=FLOAT, dtype=int, passes=False), - ValidationCase(annotation=FLOAT, dtype=np.uint8, passes=False), - ValidationCase(annotation=FLOAT, dtype=str, passes=False), - ValidationCase(annotation=STRING, dtype=str, passes=True), - ValidationCase(annotation=STRING, dtype=int, passes=False), - ValidationCase(annotation=STRING, dtype=float, passes=False), - ValidationCase(annotation=MODEL, dtype=BasicModel, passes=True), - ValidationCase(annotation=MODEL, dtype=BadModel, passes=False), - ValidationCase(annotation=MODEL, dtype=int, passes=False), - ValidationCase(annotation=MODEL, dtype=SubClass, passes=True), - ValidationCase(annotation=UNION_PIPE, dtype=np.uint32, passes=True), - ValidationCase(annotation=UNION_PIPE, dtype=np.float32, passes=True), - ValidationCase(annotation=UNION_PIPE, dtype=np.uint64, passes=False), - ValidationCase(annotation=UNION_PIPE, dtype=np.float64, passes=False), - ValidationCase(annotation=UNION_PIPE, dtype=str, passes=False), - ValidationCase(annotation=UNION_TYPE, dtype=np.uint32, passes=True), - ValidationCase(annotation=UNION_TYPE, dtype=np.float32, passes=True), - ValidationCase(annotation=UNION_TYPE, dtype=np.uint64, passes=False), - ValidationCase(annotation=UNION_TYPE, dtype=np.float64, passes=False), - ValidationCase(annotation=UNION_TYPE, dtype=str, passes=False), - ], - ids=[ - "float", - "int", - "uint8", - "number-int", - "number-float", - "number-uint8", - "number-float16", - "number-str", - "integer-int", - "integer-uint8", - "integer-float", - "integer-float32", - "integer-str", - "float-float", - "float-float32", - "float-int", - "float-uint8", - "float-str", - "str-str", - "str-int", - "str-float", - "model-model", - "model-badmodel", - "model-int", - "model-subclass", - "union-pipe-uint32", - "union-pipe-float32", - "union-pipe-uint64", - "union-pipe-float64", - "union-pipe-str", - "union-type-uint32", - "union-type-float32", - "union-type-uint64", - "union-type-float64", - "union-type-str", - ], -) +DTYPE_CASES = [ + ValidationCase(dtype=float, passes=True), + ValidationCase(dtype=int, passes=False), + ValidationCase(dtype=np.uint8, passes=False), + ValidationCase(annotation=NUMBER, dtype=int, passes=True), + ValidationCase(annotation=NUMBER, dtype=float, passes=True), + ValidationCase(annotation=NUMBER, dtype=np.uint8, passes=True), + ValidationCase(annotation=NUMBER, dtype=np.float16, passes=True), + ValidationCase(annotation=NUMBER, dtype=str, passes=False), + ValidationCase(annotation=INTEGER, dtype=int, passes=True), + ValidationCase(annotation=INTEGER, dtype=np.uint8, passes=True), + ValidationCase(annotation=INTEGER, dtype=float, passes=False), + ValidationCase(annotation=INTEGER, dtype=np.float32, passes=False), + ValidationCase(annotation=INTEGER, dtype=str, passes=False), + ValidationCase(annotation=FLOAT, dtype=float, passes=True), + ValidationCase(annotation=FLOAT, dtype=np.float32, passes=True), + ValidationCase(annotation=FLOAT, dtype=int, passes=False), + ValidationCase(annotation=FLOAT, dtype=np.uint8, passes=False), + ValidationCase(annotation=FLOAT, dtype=str, passes=False), + ValidationCase(annotation=STRING, dtype=str, passes=True), + ValidationCase(annotation=STRING, dtype=int, passes=False), + ValidationCase(annotation=STRING, dtype=float, passes=False), + ValidationCase(annotation=MODEL, dtype=BasicModel, passes=True), + ValidationCase(annotation=MODEL, dtype=BadModel, passes=False), + ValidationCase(annotation=MODEL, dtype=int, 
passes=False), + ValidationCase(annotation=MODEL, dtype=SubClass, passes=True), + ValidationCase(annotation=UNION_TYPE, dtype=np.uint32, passes=True), + ValidationCase(annotation=UNION_TYPE, dtype=np.float32, passes=True), + ValidationCase(annotation=UNION_TYPE, dtype=np.uint64, passes=False), + ValidationCase(annotation=UNION_TYPE, dtype=np.float64, passes=False), + ValidationCase(annotation=UNION_TYPE, dtype=str, passes=False), +] + +DTYPE_IDS = [ + "float", + "int", + "uint8", + "number-int", + "number-float", + "number-uint8", + "number-float16", + "number-str", + "integer-int", + "integer-uint8", + "integer-float", + "integer-float32", + "integer-str", + "float-float", + "float-float32", + "float-int", + "float-uint8", + "float-str", + "str-str", + "str-int", + "str-float", + "model-model", + "model-badmodel", + "model-int", + "model-subclass", + "union-type-uint32", + "union-type-float32", + "union-type-uint64", + "union-type-float64", + "union-type-str", +] + +if YES_PIPE: + DTYPE_CASES.extend( + [ + ValidationCase(annotation=UNION_PIPE, dtype=np.uint32, passes=True), + ValidationCase(annotation=UNION_PIPE, dtype=np.float32, passes=True), + ValidationCase(annotation=UNION_PIPE, dtype=np.uint64, passes=False), + ValidationCase(annotation=UNION_PIPE, dtype=np.float64, passes=False), + ValidationCase(annotation=UNION_PIPE, dtype=str, passes=False), + ] + ) + DTYPE_IDS.extend( + [ + "union-pipe-uint32", + "union-pipe-float32", + "union-pipe-uint64", + "union-pipe-float64", + "union-pipe-str", + ] + ) + + +@pytest.fixture(scope="module", params=DTYPE_CASES, ids=DTYPE_IDS) def dtype_cases(request) -> ValidationCase: return request.param From 37f3a3bddf2cfde163a9b98fc0b09d04b3e97795 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 23 Sep 2024 23:59:43 -0700 Subject: [PATCH 6/8] add dtype docs page --- docs/dtype.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++ docs/index.md | 1 + 2 files changed, 99 insertions(+) create mode 100644 docs/dtype.md diff --git a/docs/dtype.md b/docs/dtype.md new file mode 100644 index 0000000..f7be5ca --- /dev/null +++ b/docs/dtype.md @@ -0,0 +1,98 @@ +# dtype + +```{todo} +This section is under construction as of 1.6.1 + +Much of the details of dtypes are covered in [syntax](./syntax.md) +and in {mod}`numpydantic.dtype` , but this section will specifically +address how dtypes are handled both generically and by interfaces +as we expand custom dtype handling <3. + +For details of support and implementation until the docs have time for some love, +please see the tests, which are the source of truth for the functionality +of the library for now and forever. +``` + +Recall the general syntax: + +``` +NDArray[Shape, dtype] +``` + +These are the docs for what can do in `dtype`. + +## Scalar Dtypes + +Python builtin types and numpy types should be handled transparently, +with some exception for complex numbers and objects (described below). + +### Numbers + +#### Complex numbers + +```{todo} +Document limitations for complex numbers and strategies for serialization/validation +``` + +### Datetimes + +```{todo} +Datetimes are supported by every interface except :class:`.VideoInterface` , +with the caveat that HDF5 loses timezone information, and thus all timestamps should +be re-encoded to UTC before saving/loading. + +More generic datetime support is TODO. +``` + +### Objects + +```{todo} +Generic objects are supported by all interfaces except +:class:`.VideoInterface` , :class;`.HDF5Interface` , and :class:`.ZarrInterface` . 
+ +this might be expected, but there is also hope, TODO fill in serialization plans. +``` + +### Strings + +```{todo} +Strings are supported by all interfaces except :class:`.VideoInterface` . + +TODO is fill in the subtleties of how this works +``` + +## Generic Dtypes + +```{todo} +For now these are handled as tuples of dtypes, see the source of +{ref}`numpydantic.dtype.Float` . They should either be handled as Unions +or as a more prescribed meta-type. + +For now, use `int` and `float` to refer to the general concepts of +"any int" or "any float" even if this is a bit mismatched from the numpy usage. +``` + +## Extended Python Typing Universe + +### Union Types + +Union types can be used as expected. + +Union types are tested recursively -- if any item within a ``Union`` matches +the expected dtype at a given level of recursion, the dtype test passes. + +```python +class MyModel(BaseModel): + array: NDArray[Any, int | float] +``` + +## Compound Dtypes + +```{todo} +Compound dtypes are currently unsupported, +though the HDF5 interface supports indexing into compound dtypes +as separable dimensions/arrays using the third "field" parameter in +{class}`.hdf5.H5ArrayPath` . +``` + + diff --git a/docs/index.md b/docs/index.md index 9caaaf7..bced657 100644 --- a/docs/index.md +++ b/docs/index.md @@ -473,6 +473,7 @@ dumped = instance.model_dump_json(context={'zarr_dump_array': True}) design syntax +dtype serialization interfaces ``` From dfcd8479f0614c9a9cf71a1a629897d5e4d5e926 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Tue, 24 Sep 2024 00:15:56 -0700 Subject: [PATCH 7/8] test walk and apply --- src/numpydantic/serialization.py | 2 +- tests/test_serialization.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/numpydantic/serialization.py b/src/numpydantic/serialization.py index 07924eb..f901994 100644 --- a/src/numpydantic/serialization.py +++ b/src/numpydantic/serialization.py @@ -93,7 +93,7 @@ def _a_path(v: Any) -> Any: return _walk_and_apply(value, _a_path, skip) -def _walk_and_apply(value: T, f: Callable[[U], U], skip: Iterable = tuple()) -> T: +def _walk_and_apply(value: T, f: Callable[[U, bool], U], skip: Iterable = tuple()) -> T: """ Walk an object, applying a function """ diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 702dc1a..5d0b2d8 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -10,6 +10,8 @@ import numpy as np import json +from numpydantic.serialization import _walk_and_apply + pytestmark = pytest.mark.serialization @@ -93,3 +95,34 @@ def test_relative_to_path(hdf5_at_path, tmp_output_dir, model_blank): # shouldn't have absolutized subpath even if it's pathlike assert data["path"] == expected_dataset + + +def test_walk_and_apply(): + """ + Walk and apply should recursively apply a function to everything in a nesty structure + """ + test = { + "a": 1, + "b": 1, + "c": [ + {"a": 1, "b": {"a": 1, "b": 1}, "c": [1, 1, 1]}, + {"a": 1, "b": [1, 1, 1]}, + ], + } + + def _mult_2(v, skip: bool = False): + return v * 2 + + def _assert_2(v, skip: bool = False): + assert v == 2 + return v + + walked = _walk_and_apply(test, _mult_2) + _walk_and_apply(walked, _assert_2) + + assert walked["a"] == 2 + assert walked["c"][0]["a"] == 2 + assert walked["c"][0]["b"]["a"] == 2 + assert all([w == 2 for w in walked["c"][0]["c"]]) + assert walked["c"][1]["a"] == 2 + assert all([w == 2 for w in walked["c"][1]["b"]]) From 2e7031c58d88763b4ba2a1ba11cf02afa5b44f8a Mon Sep 17 00:00:00 
2001
From: sneakers-the-rat
Date: Tue, 24 Sep 2024 00:23:15 -0700
Subject: [PATCH 8/8] changelog, bump version

---
 docs/changelog.md | 27 +++++++++++++++++++++++++++
 pyproject.toml    |  2 +-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/docs/changelog.md b/docs/changelog.md
index af6375e..5874507 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -4,6 +4,33 @@
 
 ### 1.6.*
 
+#### 1.6.1 - 24-09-23 - Support Union Dtypes
+
+It's now possible to do this, like it always should have been:
+
+```python
+class MyModel(BaseModel):
+    array: NDArray[Any, int | float]
+```
+
+**Features**
+- Support for Union Dtypes
+
+**Structure**
+- New `validation` module containing the `shape` and `dtype` validation helpers,
+  to declutter the main namespace and group related code
+- Rename all serialized arrays within a container dict to `value` so they can be
+  identified by convention and skipped without a long iteration - see perf below.
+
+**Perf**
+- Avoid iterating over every item in an array trying to convert it to a path,
+  a several-orders-of-magnitude perf improvement over `1.6.0` (oops)
+
+**Docs**
+- New page for `dtype`: mostly stubs at the moment, but more explicit documentation
+  about what kinds of dtypes we support.
+
+
 #### 1.6.0 - 24-09-23 - Roundtrip JSON Serialization
 
 Roundtrip JSON serialization is here - with serialization to list of lists,
diff --git a/pyproject.toml b/pyproject.toml
index 0e6926b..7a4e9ac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "numpydantic"
-version = "1.6.0"
+version = "1.6.1"
 description = "Type and shape validation and serialization for arbitrary array types in pydantic models"
 authors = [
     {name = "sneakers-the-rat", email = "sneakers-the-rat@protonmail.com"},
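
Taken together, the headline changes in this series — union dtype support and the rename of serialized array data to `value` — should behave roughly as sketched below. Illustrative only: the exact metadata keys depend on the interface, shown here for the in-memory numpy interface.

```python
# Sketch of expected behavior after this series (not part of the patch);
# metadata keys other than "value" vary by interface.
import json
from typing import Union

import numpy as np
from pydantic import BaseModel

from numpydantic import NDArray, Shape


class MyModel(BaseModel):
    # union dtypes now validate; on Python 3.10+ this can also be spelled
    # np.uint32 | np.float32
    array: NDArray[Shape["*, *, *"], Union[np.uint32, np.float32]]


model = MyModel(array=np.zeros((2, 3, 4), dtype=np.uint32))

# round-trip dumps keep the raw array data under "value" (previously "array")
dumped = json.loads(model.model_dump_json(round_trip=True))
assert dumped["array"]["type"] == "numpy"
assert "value" in dumped["array"]
```
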