diff --git a/CHANGES.rst b/CHANGES.rst index a59642e7e..89ce2a332 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -31,6 +31,10 @@ - Deprecate ``ignore_implicit_conversion`` and "implicit conversion" [#1724] +- Add ``lazy_tree`` option to ``asdf.open`` and ``asdf.config`` + to allow lazy deserialization of ASDF tagged tree nodes to + custom objects. [#1733] + 3.2.0 (2024-04-05) ------------------ diff --git a/asdf/_asdf.py b/asdf/_asdf.py index c9acb4749..560615c5a 100644 --- a/asdf/_asdf.py +++ b/asdf/_asdf.py @@ -5,6 +5,7 @@ import pathlib import time import warnings +import weakref from packaging.version import Version @@ -12,7 +13,7 @@ from . import _display as display from . import _node_info as node_info from . import _version as version -from . import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil +from . import constants, generic_io, lazy_nodes, reference, schema, treeutil, util, versioning, yamlutil from ._block.manager import Manager as BlockManager from ._helpers import validate_version from .config import config_context, get_config @@ -173,6 +174,10 @@ def __init__( # custom_tree_to_tagged_tree or tagged_tree_to_custom_tree). self._tree_modification_context = treeutil._TreeModificationContext() + # A cache of tagged objects and their converted custom objects used when + # a file is read with "lazy_tree=True". Used by lazy_nodes. 
+ self._tagged_object_cache = lazy_nodes._TaggedObjectCache() + self._fd = None self._closed = False self._external_asdf_by_uri = {} @@ -531,6 +536,7 @@ def close(self): # as we're closing the file, also empty out the # tree so that references to array data can be released self._tree = AsdfObject() + self._tagged_object_cache.clear() for external in self._external_asdf_by_uri.values(): external.close() self._external_asdf_by_uri.clear() @@ -878,6 +884,7 @@ def _open_asdf( fd, validate_checksums=False, extensions=None, + lazy_tree=NotSet, _get_yaml_content=False, _force_raw_types=False, strict_extension_check=False, @@ -889,7 +896,7 @@ def _open_asdf( msg = "'strict_extension_check' and 'ignore_missing_extensions' are incompatible options" raise ValueError(msg) - with config_context(): + with config_context() as cfg: # validate_checksums (unlike memmap and lazy_load) is provided # here instead of in __init__ self._blocks._validate_checksums = validate_checksums @@ -970,7 +977,14 @@ def _open_asdf( self.close() raise - tree = yamlutil.tagged_tree_to_custom_tree(tree, self, _force_raw_types) + if lazy_tree is NotSet: + lazy_tree = cfg.lazy_tree + if lazy_tree and not _force_raw_types: + obj = AsdfObject() + obj.data = lazy_nodes.AsdfDictNode(tree, weakref.ref(self)) + tree = obj + else: + tree = yamlutil.tagged_tree_to_custom_tree(tree, self, _force_raw_types) if not (ignore_missing_extensions or _force_raw_types): self._check_extensions(tree, strict=strict_extension_check) @@ -988,6 +1002,7 @@ def _open_impl( mode="r", validate_checksums=False, extensions=None, + lazy_tree=NotSet, _get_yaml_content=False, _force_raw_types=False, strict_extension_check=False, @@ -1002,6 +1017,7 @@ def _open_impl( generic_file, validate_checksums=validate_checksums, extensions=extensions, + lazy_tree=lazy_tree, _get_yaml_content=_get_yaml_content, _force_raw_types=_force_raw_types, strict_extension_check=strict_extension_check, @@ -1604,6 +1620,7 @@ def open_asdf( 
_force_raw_types=False, copy_arrays=False, memmap=NotSet, + lazy_tree=NotSet, lazy_load=True, custom_schema=None, strict_extension_check=False, @@ -1665,6 +1682,15 @@ def open_asdf( Note: even if ``lazy_load`` is `False`, ``memmap`` is still taken into account. + lazy_tree : bool, optional + When `True` the ASDF tree will not be converted to custom objects + when the file is loaded. Instead, objects will be "lazily" converted + only when they are accessed. Note that the tree will not contain dict + and list instances for containers and instead return instances of classes + defined in `asdf.lazy_nodes`. Since objects are converted when they + are accessed, traversing the tree (like is done during `AsdfFile.info` + and `AsdfFile.search`) will result in nodes being converted. + custom_schema : str, optional Path to a custom schema file that will be used for a secondary validation pass. This can be used to ensure that particular ASDF @@ -1718,6 +1744,7 @@ def open_asdf( mode=mode, validate_checksums=validate_checksums, extensions=extensions, + lazy_tree=lazy_tree, _get_yaml_content=_get_yaml_content, _force_raw_types=_force_raw_types, strict_extension_check=strict_extension_check, diff --git a/asdf/_core/_converters/complex.py b/asdf/_core/_converters/complex.py index 52e2f6daa..b62e7d58c 100644 --- a/asdf/_core/_converters/complex.py +++ b/asdf/_core/_converters/complex.py @@ -13,7 +13,6 @@ class ComplexConverter(Converter): tags = ["tag:stsci.edu:asdf/core/complex-1.0.0"] - types = [*list(util._iter_subclasses(np.complexfloating)), complex] def to_yaml_tree(self, obj, tag, ctx): diff --git a/asdf/_core/_converters/integer.py b/asdf/_core/_converters/integer.py index 61ddf882d..22272cdc2 100644 --- a/asdf/_core/_converters/integer.py +++ b/asdf/_core/_converters/integer.py @@ -8,7 +8,6 @@ class IntegerConverter(Converter): "tag:stsci.edu:asdf/core/integer-1.0.0", "tag:stsci.edu:asdf/core/integer-1.1.0", ] - types = ["asdf.tags.core.integer.IntegerType"] def 
to_yaml_tree(self, obj, tag, ctx): diff --git a/asdf/_tests/conftest.py b/asdf/_tests/conftest.py index 31353ff24..1343f63ab 100644 --- a/asdf/_tests/conftest.py +++ b/asdf/_tests/conftest.py @@ -55,3 +55,10 @@ def httpserver(request): @pytest.fixture() def test_data_path(): return importlib.resources.files("asdf") / "_tests" / "data" + + +@pytest.fixture(params=[True, False], ids=["lazy", "not-lazy"]) +def with_lazy_tree(request): + with config.config_context() as cfg: + cfg.lazy_tree = request.param + yield diff --git a/asdf/_tests/core/_converters/test_complex.py b/asdf/_tests/core/_converters/test_complex.py index b1a504c0a..56600f18e 100644 --- a/asdf/_tests/core/_converters/test_complex.py +++ b/asdf/_tests/core/_converters/test_complex.py @@ -80,7 +80,7 @@ def test_valid_nan_complex(valid): pass -def test_roundtrip(): +def test_roundtrip(tmp_path): values = { "a": 0 + 0j, "b": 1 + 1j, @@ -88,8 +88,10 @@ def test_roundtrip(): "d": -1 - 1j, } - result = helpers.roundtrip_object(values) - - assert len(values) == len(result) - for key, value in values.items(): - assert result[key] == value + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"values": values}).write_to(fn) + with asdf.open(fn) as af: + result = af["values"] + assert len(values) == len(result) + for key, value in values.items(): + assert result[key] == value diff --git a/asdf/_tests/core/_converters/test_tree.py b/asdf/_tests/core/_converters/test_tree.py index bcd7238f0..4c40b36e5 100644 --- a/asdf/_tests/core/_converters/test_tree.py +++ b/asdf/_tests/core/_converters/test_tree.py @@ -60,7 +60,7 @@ def test_software(): assert result == software -def test_history_entry(): +def test_history_entry(tmp_path): history_entry = HistoryEntry( description="Some history happened here", time=datetime.datetime.now(), @@ -68,9 +68,10 @@ def test_history_entry(): extra="property", ) - result = helpers.roundtrip_object(history_entry) - - assert result == history_entry + fn = tmp_path / "test.asdf" + 
asdf.AsdfFile({"obj": history_entry}).write_to(fn) + with asdf.open(fn) as af: + assert af["obj"] == history_entry def test_subclass_metadata(): diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 339567e8d..ea830fca5 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -919,8 +919,9 @@ def test_inline_shape_mismatch(): """ buff = helpers.yaml_to_asdf(content) - with pytest.raises(ValueError, match=r"inline data doesn't match the given shape"), asdf.open(buff): - pass + with pytest.raises(ValueError, match=r"inline data doesn't match the given shape"): + with asdf.open(buff) as af: + af["arr"] def test_broadcasted_array(tmp_path): diff --git a/asdf/_tests/test_block_converter.py b/asdf/_tests/test_block_converter.py index 462f7c363..71e9bc1fb 100644 --- a/asdf/_tests/test_block_converter.py +++ b/asdf/_tests/test_block_converter.py @@ -125,8 +125,10 @@ def test_block_data_callback_converter(tmp_path): # id(arr) would change every time a = BlockDataCallback(lambda: np.zeros(3, dtype="uint8")) - b = helpers.roundtrip_object(a) - assert_array_equal(a.data, b.data) + tfn = tmp_path / "tmp.asdf" + asdf.AsdfFile({"obj": a}).write_to(tfn) + with asdf.open(tfn) as af: + assert_array_equal(a.data, af["obj"].data) # make a tree without the BlockData instance to avoid # the initial validate which will trigger block allocation diff --git a/asdf/_tests/test_extension.py b/asdf/_tests/test_extension.py index 788451918..fd7a0ec6d 100644 --- a/asdf/_tests/test_extension.py +++ b/asdf/_tests/test_extension.py @@ -5,6 +5,7 @@ from packaging.specifiers import SpecifierSet from yaml.representer import RepresenterError +import asdf from asdf import AsdfFile, config_context from asdf.exceptions import AsdfManifestURIMismatchWarning, AsdfWarning, ValidationError from asdf.extension import ( @@ -899,7 +900,7 @@ def from_yaml_tree(self, node, tag, ctx): 
config.add_extension(extension) -def test_reference_cycle(): +def test_reference_cycle(tmp_path, with_lazy_tree): class FractionWithInverse(fractions.Fraction): def __init__(self, *args, **kwargs): self._inverse = None @@ -940,9 +941,11 @@ class FractionWithInverseExtension: f2 = FractionWithInverse(5, 3) f1.inverse = f2 f2.inverse = f1 - - read_f1 = roundtrip_object(f1) - assert read_f1.inverse.inverse is read_f1 + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"obj": f1}).write_to(fn) + with asdf.open(fn) as af: + read_f1 = af["obj"] + assert read_f1.inverse.inverse is read_f1 def test_manifest_uri_id_mismatch_warning(tmp_path): diff --git a/asdf/_tests/test_lazy_nodes.py b/asdf/_tests/test_lazy_nodes.py new file mode 100644 index 000000000..3490c1a1d --- /dev/null +++ b/asdf/_tests/test_lazy_nodes.py @@ -0,0 +1,405 @@ +import collections +import copy +import gc +import weakref + +import numpy as np +import pytest + +import asdf +from asdf.lazy_nodes import AsdfDictNode, AsdfListNode, AsdfOrderedDictNode, _resolve_af_ref, _to_lazy_node + + +def test_slice_access(): + af = asdf.AsdfFile() + node = AsdfListNode([0, 1, 2], weakref.ref(af)) + assert node[0] == 0 + assert node[1] == 1 + assert node[2] == 2 + assert node[:2] == [0, 1] + assert isinstance(node[:2], AsdfListNode) + assert node[1:2] == [ + 1, + ] + assert isinstance(node[1:2], AsdfListNode) + assert node[:-1] == [0, 1] + assert isinstance(node[:-1], AsdfListNode) + assert node[::-1] == [2, 1, 0] + assert isinstance(node[::-1], AsdfListNode) + assert node[::2] == [0, 2] + assert isinstance(node[::2], AsdfListNode) + assert node[1::2] == [ + 1, + ] + assert isinstance(node[1::2], AsdfListNode) + + +def test_nested_node_conversion(): + tree = { + # lll = list in list in list, etc... 
+ "lll": [[[0]]], + "lld": [[{"a": 0}]], + "ldl": [{"a": [0]}], + "ldd": [{"a": {"a": [0]}}], + "dll": {"a": [[0]]}, + "dld": {"a": [{"a": 0}]}, + "ddl": {"a": {"a": [0]}}, + "ddd": {"a": {"a": {"a": 0}}}, + } + af = asdf.AsdfFile() + node = AsdfDictNode(tree, weakref.ref(af)) + for key in node: + obj = node[key] + for code in key: + if code == "l": + assert isinstance(obj, AsdfListNode) + obj = obj[0] + else: + assert isinstance(obj, AsdfDictNode) + obj = obj["a"] + + +def test_lazy_tree_ref(tmp_path): + fn = tmp_path / "test.asdf" + + arr = np.arange(42) + tree = { + "a": arr, + "b": {"c": arr}, + "d": [ + arr, + ], + } + + af = asdf.AsdfFile(tree) + af.write_to(fn) + + with asdf.open(fn, lazy_tree=True) as af: + assert isinstance(af.tree.data.tagged["a"], asdf.tagged.Tagged) + assert isinstance(af.tree.data.tagged["b"]["c"], asdf.tagged.Tagged) + assert isinstance(af.tree.data.tagged["d"][0], asdf.tagged.Tagged) + assert isinstance(af["b"], AsdfDictNode) + assert isinstance(af["d"], AsdfListNode) + np.testing.assert_array_equal(af["a"], arr) + assert af["a"] is af["b"]["c"] + assert af["a"] is af["d"][0] + + +def test_ordered_dict(): + tree = {"a": collections.OrderedDict({"b": [1, 2, collections.OrderedDict({"c": 3})]})} + + af = asdf.AsdfFile() + + node = AsdfDictNode(tree, weakref.ref(af)) + assert isinstance(node["a"], AsdfOrderedDictNode) + assert isinstance(node["a"]["b"], AsdfListNode) + assert isinstance(node["a"]["b"][2], AsdfOrderedDictNode) + + +@pytest.mark.parametrize( + "NodeClass,data,base", + [ + (AsdfDictNode, {"a": 1}, collections.abc.Mapping), + (AsdfListNode, [1, 2], collections.abc.Sequence), + (AsdfOrderedDictNode, {"a": 1}, collections.OrderedDict), + ], +) +def test_node_inheritance(NodeClass, data, base): + node = NodeClass(data) + assert isinstance(node, base) + + +@pytest.mark.parametrize( + "NodeClass,base", + [ + (AsdfDictNode, dict), + (AsdfListNode, list), + (AsdfOrderedDictNode, dict), + ], +) +def test_node_empty_init(NodeClass, 
base): + node = NodeClass() + assert type(node.tagged) == base + + +@pytest.mark.parametrize( + "node", + [ + AsdfDictNode({"a": 1, "b": 2}), + AsdfListNode([1, 2, 3]), + AsdfOrderedDictNode({"a": 1, "b": 2}), + ], +) +@pytest.mark.parametrize("copy_operation", [copy.copy, copy.deepcopy]) +def test_copy(node, copy_operation): + copied_node = copy_operation(node) + assert isinstance(copied_node, type(node)) + assert copied_node == node + + +@pytest.mark.parametrize( + "NodeClass,data", + [ + (AsdfDictNode, {1: "a", 2: "b"}), + (AsdfListNode, [1, 2]), + (AsdfOrderedDictNode, collections.OrderedDict({1: "a", 2: "b"})), + ], +) +def test_node_equality(NodeClass, data): + node = NodeClass(data) + assert node == node + assert not node != node + assert node == data + data.pop(1) + assert node != data + + +def test_cache_clear_on_close(tmp_path): + fn = tmp_path / "test.asdf" + + arr = np.arange(42) + tree = {"a": arr} + asdf.AsdfFile(tree).write_to(fn) + + with asdf.open(fn, lazy_tree=True) as af: + # grab a weakref to this array, it should fail + # to resolve after the with exits + ref = weakref.ref(af["a"]) + + gc.collect() + assert ref() is None + + +def test_access_after_del(tmp_path): + fn = tmp_path / "test.asdf" + + arr = np.arange(42) + tree = {"a": {"b": arr}} + asdf.AsdfFile(tree).write_to(fn) + + with asdf.open(fn, lazy_tree=True) as af: + d = af["a"] + + del af + + with pytest.raises(asdf.exceptions.AsdfLazyReferenceError, match="Failed to resolve"): + d["b"] + + +def test_lazy_tree_option(tmp_path): + fn = tmp_path / "test.asdf" + + arr = np.arange(42) + tree = {"a": {"b": arr}} + asdf.AsdfFile(tree).write_to(fn) + + with asdf.open(fn, lazy_tree=True) as af: + assert isinstance(af["a"], AsdfDictNode) + + with asdf.open(fn, lazy_tree=False) as af: + assert not isinstance(af["a"], AsdfDictNode) + + # test default (False) + with asdf.open(fn) as af: + assert not isinstance(af["a"], AsdfDictNode) + + with asdf.config_context() as cfg: + cfg.lazy_tree = True + 
with asdf.open(fn) as af: + assert isinstance(af["a"], AsdfDictNode) + cfg.lazy_tree = False + with asdf.open(fn) as af: + assert not isinstance(af["a"], AsdfDictNode) + + +def test_resolve_af_ref(): + with pytest.raises(asdf.exceptions.AsdfLazyReferenceError, match="Failed to resolve"): + _resolve_af_ref(None) + af = asdf.AsdfFile() + af_ref = weakref.ref(af) + assert _resolve_af_ref(af_ref) is af + del af + with pytest.raises(asdf.exceptions.AsdfLazyReferenceError, match="Failed to resolve"): + _resolve_af_ref(af_ref) + + +@pytest.mark.parametrize( + "NodeClass,data", + [ + (AsdfDictNode, {1: "a", 2: "b"}), + (AsdfListNode, [1, 2]), + (AsdfOrderedDictNode, collections.OrderedDict({1: "a", 2: "b"})), + (int, 1), # a non-wrappable class + ], +) +def test_to_lazy_node(NodeClass, data): + node = _to_lazy_node(data, None) + assert isinstance(node, NodeClass) + + +def test_lazy_node_treeutil_support(): + af = asdf.AsdfFile() + af_ref = weakref.ref(af) + tree = { + "ordered_dict": AsdfOrderedDictNode({"a": 1}, af_ref), + "dict": AsdfDictNode({"b": 2}, af_ref), + "list": AsdfListNode([3, 4], af_ref), + } + seen_ints = set() + + def callback(node): + if isinstance(node, int): + seen_ints.add(node) + + asdf.treeutil.walk_and_modify(tree, callback) + + assert seen_ints == set([1, 2, 3, 4]) + + +@pytest.fixture() +def cache_test_tree_path(tmp_path): + my_array = np.arange(3, dtype="uint8") + my_list = [my_array, my_array] + tree = {"a": my_list, "b": my_list} + af = asdf.AsdfFile(tree) + fn = tmp_path / "test.asdf" + af.write_to(fn) + return fn + + +def test_cache_resolves_ref(cache_test_tree_path): + with asdf.open(cache_test_tree_path, lazy_tree=True) as af: + # since 'a' and 'b' were the same list when the file was saved + # they should be the same list on read + assert af["a"] is af["b"] + # same for the arrays in the list + assert af["a"][0] is af["a"][1] + + +def test_cache_frees_deleted_object(cache_test_tree_path): + with asdf.open(cache_test_tree_path, 
lazy_tree=True) as af: + # load 1 of the 2 lists + l0 = af["a"] + # grab a weakref to the list (to not hold onto the list) + lref = weakref.ref(l0) + # now delete all references to the list (including the one in the tree) + del l0, af.tree["a"] + # trigger garbage collection + gc.collect() + # check that the weakref fails to resolve (so the list was freed) + assert lref() is None + # and we can no longer access 'a' + with pytest.raises(KeyError, match="'a'"): + af["a"] + # but can get 'b' + assert af["b"][0] is af["b"][1] + + +def test_cache_non_weakref(): + """ + Test that an object that cannot weak reference can be cached + """ + tagged_node = {} + obj = complex(1, 1) + cache_item = asdf.lazy_nodes._TaggedObjectCacheItem(tagged_node, obj) + del obj + gc.collect() + assert cache_item.custom_object == complex(1, 1) + + +@pytest.fixture(params=[True, False, None], ids=["lazy", "not-lazy", "undefined"]) +def lazy_test_class(request): + class Foo: + def __init__(self, data): + self.data = data + + tag_uri = "asdf://somewhere.org/tags/foo-1.0.0" + + class FooConverter: + tags = [tag_uri] + types = [Foo] + + def to_yaml_tree(self, obj, tag, ctx): + return obj.data + + def from_yaml_tree(self, node, tag, ctx): + return Foo(node) + + lazy = request.param + if lazy is not None: + FooConverter.lazy = lazy + # also set lazy on the class to pass it to the test + Foo.lazy = lazy + + class FooExtension: + extension_uri = "asdf://somewhere.org/extensions/minimum-1.0.0" + converters = [FooConverter()] + tags = [tag_uri] + + with asdf.config_context() as cfg: + cfg.add_extension(FooExtension()) + yield Foo + + +def test_lazy_converter(tmp_path, lazy_test_class): + obj = lazy_test_class({"a": 1}) + + fn = tmp_path / "test.asdf" + + af = asdf.AsdfFile({"obj": obj}) + af.write_to(fn) + + with asdf.open(fn, lazy_tree=True) as af: + if lazy_test_class.lazy is None or not lazy_test_class.lazy: + target_class = dict + else: + target_class = AsdfDictNode + assert 
isinstance(af["obj"].data, target_class) + + +@pytest.fixture() +def lazy_generator_class(request): + + class Foo: + def __init__(self, data=None): + self.data = data or {} + + tag_uri = "asdf://somewhere.org/tags/foo-1.0.0" + + class FooConverter: + tags = [tag_uri] + types = [Foo] + lazy = True + + def to_yaml_tree(self, obj, tag, ctx): + return obj.data + + def from_yaml_tree(self, node, tag, ctx): + obj = Foo() + yield obj + obj.data = node + + class FooExtension: + extension_uri = "asdf://somewhere.org/extensions/minimum-1.0.0" + converters = [FooConverter()] + tags = [tag_uri] + + with asdf.config_context() as cfg: + cfg.add_extension(FooExtension()) + yield Foo + + +def test_lazy_generator_converter(tmp_path, lazy_generator_class): + """ + Test that a converter that returns a generator is not lazy + (even if it's marked as lazy). + """ + obj = lazy_generator_class({"a": 1}) + + fn = tmp_path / "test.asdf" + + af = asdf.AsdfFile({"obj": obj}) + af.write_to(fn) + + with asdf.open(fn, lazy_tree=True) as af: + assert isinstance(af["obj"].data, dict) diff --git a/asdf/_tests/test_reference.py b/asdf/_tests/test_reference.py index 5f57f94a9..8cbfb16b0 100644 --- a/asdf/_tests/test_reference.py +++ b/asdf/_tests/test_reference.py @@ -225,7 +225,7 @@ def test_internal_reference(tmp_path): assert b"{$ref: ''}" in buff.getvalue() -def test_implicit_internal_reference(tmp_path): +def test_implicit_internal_reference(tmp_path, with_lazy_tree): target = {"foo": "bar"} nested_in_dict = {"target": target} nested_in_list = [target] diff --git a/asdf/_tests/test_schema.py b/asdf/_tests/test_schema.py index 831c35f1a..a72732701 100644 --- a/asdf/_tests/test_schema.py +++ b/asdf/_tests/test_schema.py @@ -453,8 +453,9 @@ class CustomExtension: # This should cause a warning but not an error because without explicitly # providing an extension, our custom type will not be recognized and will # simply be converted to a raw type. 
- with pytest.warns(AsdfConversionWarning, match=tag_uri), asdf.open(buff): - pass + with pytest.warns(AsdfConversionWarning, match=tag_uri): + with asdf.open(buff) as af: + af["custom"] buff.seek(0) with config_context() as cfg: diff --git a/asdf/_tests/test_types.py b/asdf/_tests/test_types.py index 71d87b316..825c5d718 100644 --- a/asdf/_tests/test_types.py +++ b/asdf/_tests/test_types.py @@ -5,7 +5,7 @@ from asdf.testing.helpers import yaml_to_asdf -def test_undefined_tag(): +def test_undefined_tag(with_lazy_tree): # This tests makes sure that ASDF still returns meaningful structured data # even when it encounters a schema tag that it does not specifically # implement as an extension @@ -26,6 +26,7 @@ def test_undefined_tag(): with pytest.warns(Warning) as warning: afile = asdf.open(buff) missing = afile.tree["undefined_data"] + missing[3] assert missing[0] == 5 assert missing[1] == {"message": "there is no tag"} @@ -37,11 +38,12 @@ def test_undefined_tag(): # filter out only AsdfConversionWarning warning = [w for w in warning if w.category == AsdfConversionWarning] assert len(warning) == 2 - for i, tag in enumerate(["also_undefined-1.3.0", "undefined_tag-1.0.0"]): - assert ( - str(warning[i].message) - == f"tag:nowhere.org:custom/{tag} is not recognized, converting to raw Python data structure" - ) + messages = {str(w.message) for w in warning} + match = { + f"tag:nowhere.org:custom/{tag} is not recognized, converting to raw Python data structure" + for tag in ("undefined_tag-1.0.0", "also_undefined-1.3.0") + } + assert messages == match # Make sure no warning occurs if explicitly ignored buff.seek(0) diff --git a/asdf/config.py b/asdf/config.py index 462c57829..3a5561f75 100644 --- a/asdf/config.py +++ b/asdf/config.py @@ -26,6 +26,7 @@ DEFAULT_ALL_ARRAY_COMPRESSION_KWARGS = None DEFAULT_DEFAULT_ARRAY_SAVE_BASE = True DEFAULT_CONVERT_UNKNOWN_NDARRAY_SUBCLASSES = True +DEFAULT_LAZY_TREE = False class AsdfConfig: @@ -49,6 +50,7 @@ def __init__(self): 
self._all_array_compression_kwargs = DEFAULT_ALL_ARRAY_COMPRESSION_KWARGS self._default_array_save_base = DEFAULT_DEFAULT_ARRAY_SAVE_BASE self._convert_unknown_ndarray_subclasses = DEFAULT_CONVERT_UNKNOWN_NDARRAY_SUBCLASSES + self._lazy_tree = DEFAULT_LAZY_TREE self._lock = threading.RLock() @@ -458,6 +460,24 @@ def convert_unknown_ndarray_subclasses(self): def convert_unknown_ndarray_subclasses(self, value): self._convert_unknown_ndarray_subclasses = value + @property + def lazy_tree(self): + """ + Get configuration that controls if ASDF tree contents + are lazily converted to custom objects or if all custom + objects are created when the file is opened. See the + ``lazy_tree`` argument for `asdf.open`. + + Returns + ------- + bool + """ + return self._lazy_tree + + @lazy_tree.setter + def lazy_tree(self, value): + self._lazy_tree = value + def __repr__(self): return ( "" ) diff --git a/asdf/exceptions.py b/asdf/exceptions.py index ca572fdcc..879aa25c2 100644 --- a/asdf/exceptions.py +++ b/asdf/exceptions.py @@ -3,6 +3,7 @@ __all__ = [ "AsdfConversionWarning", "AsdfDeprecationWarning", + "AsdfLazyReferenceError", "AsdfManifestURIMismatchWarning", "AsdfPackageVersionWarning", "AsdfProvisionalAPIWarning", @@ -63,3 +64,12 @@ class AsdfManifestURIMismatchWarning(AsdfWarning): A warning indicaing that an extension registered with a manifest contains a id that does not match the uri of the manifest. """ + + +class AsdfLazyReferenceError(ReferenceError): + """ + Indicates that the lazy tree node failed to resolve a reference + to an AsdfFile instance. This likely means the AsdfFile was garbage + collected and you may need to update your code to keep the AsdfFile + in memory (by keeping a reference). 
+ """ diff --git a/asdf/extension/_converter.py b/asdf/extension/_converter.py index 3696fffc3..fca64a936 100644 --- a/asdf/extension/_converter.py +++ b/asdf/extension/_converter.py @@ -34,6 +34,11 @@ class Converter(abc.ABC): and return a str, the selected tag (should be one of tags) or `None` which will trigger the result of ``to_yaml_tree`` to be used to look up the next converter for this object. + + The ``lazy`` attribute is optional. If ``True`` asdf will + pass "lazy" objects to the converter. If ``False`` (or not + defined) asdf will convert all child objects before calling + `from_yaml_tree`. """ @classmethod @@ -117,7 +122,10 @@ def from_yaml_tree(self, node, tag, ctx): For container types received by this method (dict or list), the children of the container will have already been converted - by prior calls to from_yaml_tree implementations. + by prior calls to from_yaml_tree implementations unless + ``lazy_tree`` was set to ``True`` for `asdf.open`. With a lazy + tree the container types will be `asdf.lazy_nodes` (which act + like dict or list but convert child objects when accessed). Note on circular references: trees that reference themselves among their descendants must be handled with care. Most @@ -204,6 +212,17 @@ def __init__(self, delegate, extension): msg = "Converter property 'types' must contain str or type values" raise TypeError(msg) + @property + def lazy(self): + """ + Boolean indicating if this Converter supports "lazy" node objects + + Returns + ------- + bool + """ + return getattr(self._delegate, "lazy", False) + @property def tags(self): """ diff --git a/asdf/lazy_nodes.py b/asdf/lazy_nodes.py new file mode 100644 index 000000000..13d056dd6 --- /dev/null +++ b/asdf/lazy_nodes.py @@ -0,0 +1,311 @@ +""" +Objects that act like dict, list, OrderedDict but allow +lazy conversion of tagged ASDF tree nodes to custom objects. +""" + +import collections +import inspect +import warnings +import weakref + +from . 
import tagged, yamlutil +from .exceptions import AsdfConversionWarning, AsdfLazyReferenceError +from .extension._serialization_context import BlockAccess + +__all__ = ["AsdfDictNode", "AsdfListNode", "AsdfOrderedDictNode"] + + +class _TaggedObjectCacheItem: + """ + A tagged node and a (weakref) to the converted custom object + """ + + def __init__(self, tagged_node, custom_object): + self.tagged_node = tagged_node + try: + self._custom_object_ref = weakref.ref(custom_object) + except TypeError: + # if a weakref is not possible, store the object + self._custom_object_ref = lambda obj=custom_object: obj + + @property + def custom_object(self): + return self._custom_object_ref() + + +class _TaggedObjectCache: + """ + A cache of tagged nodes and their corresponding custom objects. + + This is critical for trees that contain references/pointers to the + same object at multiple locations in the tree. + + Only weakrefs are kept to the custom objects to allow large items + deleted from the tree to be garbage collected. This means that an + item added to the cache may later fail to retrieve (if the weakref-ed + custom object was deleted). + """ + + def __init__(self): + # start with a clear cache + self.clear() + + def clear(self): + self._cache = {} + + def retrieve(self, tagged_node): + """ + Check the cache for a previously converted object. + + Parameters + ---------- + tagged_node : Tagged + The tagged representation of the custom object + + Returns + ------- + custom_object : None or the converted object + The custom object previously converted from the tagged_node or + ``None`` if the object hasn't been converted (or was previously + deleted from the tree). + """ + key = id(tagged_node) + if key not in self._cache: + return None + item = self._cache[key] + custom_object = item.custom_object + if custom_object is None: + del self._cache[key] + return custom_object + + def store(self, tagged_node, custom_object): + """ + Store a converted custom object in the cache.
+ + Parameters + ---------- + tagged_node : Tagged + The tagged representation of the custom object + + custom_object : converted object + The custom object (a weakref to this object will be kept in the cache). + """ + self._cache[id(tagged_node)] = _TaggedObjectCacheItem(tagged_node, custom_object) + + +def _resolve_af_ref(af_ref): + msg = "Failed to resolve AsdfFile reference" + if af_ref is None: + raise AsdfLazyReferenceError(msg) + af = af_ref() + if af is None: + raise AsdfLazyReferenceError(msg) + return af + + +def _to_lazy_node(node, af_ref): + """ + Convert an object to a _AsdfNode subclass. + If the object does not have a corresponding subclass + it will be returned unchanged. + """ + if isinstance(node, list): + return AsdfListNode(node, af_ref) + elif isinstance(node, collections.OrderedDict): + return AsdfOrderedDictNode(node, af_ref) + elif isinstance(node, dict): + return AsdfDictNode(node, af_ref) + return node + + +class _AsdfNode: + """ + The "lazy node" base class that handles object + conversion and wrapping and contains a weak reference + to the `asdf.AsdfFile` that triggered the creation of this + node (when the "lazy tree" was loaded). 
+ """ + + def __init__(self, data=None, af_ref=None): + self._af_ref = af_ref + self.data = data + + @property + def tagged(self): + """ + Return the tagged tree backing this node + """ + return self.data + + def _convert_and_cache(self, value, key): + """ + Convert ``value`` to either: + + - a custom object if ``value`` is `asdf.tagged.Tagged` + - an ``asdf.lazy_nodes.AsdfListNode`` if ``value`` is + a ``list`` + - an ``asdf.lazy_nodes.AsdfDictNode`` if ``value`` is + a ``dict`` + - an ``asdf.lazy_nodes.AsdfOrderedDictNode`` if ``value`` is + an ``OrderedDict`` + - otherwise return ``value`` unmodified + + After conversion the result (``obj``) will be stored in this + `asdf.lazy_nodes._AsdfNode` using the provided key and cached + in the corresponding `asdf.AsdfFile` instance (so other + references to ``value`` in the tree will return the same + ``obj``). + + Parameters + ---------- + value : + The object to convert from a Tagged to custom object + or wrap with an _AsdfNode or return unmodified. + + key : + The key under which the converted/wrapped object will + be stored. + + + Returns + ------- + obj : + The converted or wrapped object (or the value if no conversion + or wrapping is required).
+ """ + # if the value has already been wrapped, return it + if isinstance(value, _AsdfNode): + return value + if not isinstance(value, tagged.Tagged) and type(value) not in _base_type_to_node_map: + return value + af = _resolve_af_ref(self._af_ref) + # if the obj that will be returned from this value + # is already cached, use the cached obj + if (obj := af._tagged_object_cache.retrieve(value)) is not None: + self[key] = obj + return obj + # for Tagged instances, convert them to their custom obj + if isinstance(value, tagged.Tagged): + extension_manager = af.extension_manager + tag = value._tag + if not extension_manager.handles_tag(tag): + if not af._ignore_unrecognized_tag: + warnings.warn( + f"{tag} is not recognized, converting to raw Python data structure", + AsdfConversionWarning, + ) + obj = _to_lazy_node(value, self._af_ref) + else: + converter = extension_manager.get_converter_for_tag(tag) + if not getattr(converter, "lazy", False) or inspect.isgeneratorfunction( + converter._delegate.from_yaml_tree + ): + obj = yamlutil.tagged_tree_to_custom_tree(value, af) + else: + data = _to_lazy_node(value.data, self._af_ref) + sctx = af._create_serialization_context(BlockAccess.READ) + obj = converter.from_yaml_tree(data, tag, sctx) + sctx.assign_object(obj) + sctx.assign_blocks() + sctx._mark_extension_used(converter.extension) + else: + # for non-tagged objects, wrap in an _AsdfNode + node_type = _base_type_to_node_map[type(value)] + obj = node_type(value, self._af_ref) + # cache the converted/wrapped obj with the AsdfFile so other + # references to the same Tagged value will result in the + # same obj + af._tagged_object_cache.store(value, obj) + self[key] = obj + return obj + + +class AsdfListNode(_AsdfNode, collections.UserList): + """ + A class that acts like a ``list``.
The items in this ``list`` + will start out as tagged nodes which will only be converted to + custom objects the first time they are indexed (the custom object + will then be cached for later reuse). + + If sliced, this will return a new instance of `AsdfListNode` for + the sliced portion of the list. + """ + + def __init__(self, data=None, af_ref=None): + if data is None: + data = [] + _AsdfNode.__init__(self, data, af_ref) + collections.UserList.__init__(self, data) + + def __copy__(self): + return AsdfListNode(self.data.copy(), self._af_ref) + + def __eq__(self, other): + if self is other: + return True + return list(self) == list(other) + + def __ne__(self, other): + return not self.__eq__(other) + + def __getitem__(self, key): + # key might be an int or slice + value = super().__getitem__(key) + if isinstance(key, slice): + return AsdfListNode(value, self._af_ref) + return self._convert_and_cache(value, key) + + +class AsdfDictNode(_AsdfNode, collections.UserDict): + """ + A class that acts like a ``dict``. The values for this + ``dict`` will start out as tagged nodes which will only + be converted to custom objects the first time the corresponding + key is used (the custom object will then be cached for later + reuse). + """ + + def __init__(self, data=None, af_ref=None): + if data is None: + data = {} + _AsdfNode.__init__(self, data, af_ref) + collections.UserDict.__init__(self, data) + + def __copy__(self): + return AsdfDictNode(self.data.copy(), self._af_ref) + + def __eq__(self, other): + if self is other: + return True + return dict(self) == dict(other) + + def __ne__(self, other): + return not self.__eq__(other) + + def __getitem__(self, key): + return self._convert_and_cache(super().__getitem__(key), key) + + +class AsdfOrderedDictNode(AsdfDictNode, collections.OrderedDict): + """ + A class that acts like a ``collections.OrderedDict``. 
The values + for this ``OrderedDict`` will start out as tagged nodes which will only + be converted to custom objects the first time the corresponding + key is used (the custom object will then be cached for later + reuse). + """ + + def __init__(self, data=None, af_ref=None): + if data is None: + data = collections.OrderedDict() + AsdfDictNode.__init__(self, data, af_ref) + + def __copy__(self): + return AsdfOrderedDictNode(self.data.copy(), self._af_ref) + + +_base_type_to_node_map = { + dict: AsdfDictNode, + list: AsdfListNode, + collections.OrderedDict: AsdfOrderedDictNode, +} diff --git a/asdf/tags/core/__init__.py b/asdf/tags/core/__init__.py index f075e6fce..f1e13165a 100644 --- a/asdf/tags/core/__init__.py +++ b/asdf/tags/core/__init__.py @@ -1,3 +1,5 @@ +import collections + from .constant import Constant from .external_reference import ExternalArrayReference from .integer import IntegerType @@ -18,7 +20,10 @@ ] -class AsdfObject(dict): +# AsdfObject inherits both collections.UserDict and dict to allow it +# to pass an isinstance(..., dict) check and to allow it to be "lazy" +# loaded when "lazy_tree=True". +class AsdfObject(collections.UserDict, dict): pass diff --git a/asdf/treeutil.py b/asdf/treeutil.py index 4b562d039..a76fd924b 100644 --- a/asdf/treeutil.py +++ b/asdf/treeutil.py @@ -6,7 +6,7 @@ import warnings from contextlib import contextmanager -from . import tagged +from . 
import lazy_nodes, tagged from .exceptions import AsdfDeprecationWarning, AsdfWarning from .util import NotSet @@ -66,12 +66,12 @@ def recurse(tree): if tree_id in seen: return - if isinstance(tree, (list, tuple)): + if isinstance(tree, (list, tuple, lazy_nodes.AsdfListNode)): seen.add(tree_id) for val in tree: yield from recurse(val) seen.remove(tree_id) - elif isinstance(tree, dict): + elif isinstance(tree, (dict, lazy_nodes.AsdfDictNode)): seen.add(tree_id) for val in tree.values(): yield from recurse(val) @@ -293,7 +293,10 @@ def _handle_callback(node, json_id): return _handle_generator(result) def _handle_mapping(node, json_id): - result = node.__class__() + if isinstance(node, lazy_nodes.AsdfDictNode): + result = {} + else: + result = node.__class__() if isinstance(node, tagged.Tagged): result._tag = node._tag @@ -324,7 +327,10 @@ def _handle_mapping(node, json_id): del result[key] def _handle_mutable_sequence(node, json_id): - result = node.__class__() + if isinstance(node, lazy_nodes.AsdfListNode): + result = [] + else: + result = node.__class__() if isinstance(node, tagged.Tagged): result._tag = node._tag @@ -370,11 +376,11 @@ def _handle_immutable_sequence(node, json_id): return result def _handle_children(node, json_id): - if isinstance(node, dict): + if isinstance(node, (dict, lazy_nodes.AsdfDictNode)): result = _handle_mapping(node, json_id) elif isinstance(node, tuple): result = _handle_immutable_sequence(node, json_id) - elif isinstance(node, list): + elif isinstance(node, (list, lazy_nodes.AsdfListNode)): result = _handle_mutable_sequence(node, json_id) else: result = node @@ -397,7 +403,7 @@ def _recurse(node, json_id=None): # URIs. Ignore an id that is not a string, since it may # be an object defining an id property and not an id # itself (this is common in metaschemas). 
- if isinstance(node, dict) and "id" in node and isinstance(node["id"], str): + if isinstance(node, (dict, lazy_nodes.AsdfDictNode)) and "id" in node and isinstance(node["id"], str): json_id = node["id"] if postorder: @@ -444,10 +450,10 @@ def get_children(node): node has no children (either it is an empty container, or is a non-container type) """ - if isinstance(node, dict): + if isinstance(node, (dict, lazy_nodes.AsdfDictNode)): return list(node.items()) - if isinstance(node, (list, tuple)): + if isinstance(node, (list, tuple, lazy_nodes.AsdfListNode)): return list(enumerate(node)) return [] @@ -468,4 +474,4 @@ def is_container(node): bool True if node is a container, False otherwise """ - return isinstance(node, (dict, list, tuple)) + return isinstance(node, (dict, list, tuple, lazy_nodes.AsdfDictNode, lazy_nodes.AsdfListNode)) diff --git a/docs/asdf/config.rst b/docs/asdf/config.rst index d3f501727..39261a02f 100644 --- a/docs/asdf/config.rst +++ b/docs/asdf/config.rst @@ -45,6 +45,7 @@ the currently active config: io_block_size: -1 legacy_fill_schema_defaults: True validate_on_read: True + lazy_tree: False > The latter method, `~asdf.config_context`, returns a context manager that @@ -70,6 +71,7 @@ This allows for short-lived configuration changes that do not impact other code: io_block_size: -1 legacy_fill_schema_defaults: True validate_on_read: False + lazy_tree: False > >>> asdf.get_config() Special note to library maintainers diff --git a/docs/asdf/developer_api.rst b/docs/asdf/developer_api.rst index b04999733..7f0515d8e 100644 --- a/docs/asdf/developer_api.rst +++ b/docs/asdf/developer_api.rst @@ -32,3 +32,5 @@ to create their own custom ASDF types and extensions. :no-inheritance-diagram: .. automodapi:: asdf.testing.helpers + +.. 
automodapi:: asdf.lazy_nodes diff --git a/docs/asdf/extending/converters.rst b/docs/asdf/extending/converters.rst index c2bc60b57..6f98c8695 100644 --- a/docs/asdf/extending/converters.rst +++ b/docs/asdf/extending/converters.rst @@ -42,11 +42,13 @@ node is permitted to contain nested complex objects; these will in turn be passed to other ``to_yaml_tree`` methods in other Converters. `Converter.from_yaml_tree` - a method that accepts a simple node object from parsed YAML and -returns the appropriate complex Python object. Nested nodes in the received node -will have already been converted to complex objects by other calls to ``from_yaml_tree`` -methods, except where reference cycles are present -- see +returns the appropriate complex Python object. For a non-lazy-tree, nested +nodes in the received node will have already been converted to complex objects +by other calls to ``from_yaml_tree`` methods, except where reference cycles are present -- see :ref:`extending_converters_reference_cycles` for information on how to handle that -situation. +situation. For a ``lazy_tree`` (see `asdf.open`) the node will contain `asdf.lazy_nodes` +instances which act like dicts and lists but convert child objects only when they are +accessed. Additionally, the Converter interface includes a method that must be implemented when some logic is required to select the tag to assign to a ``to_yaml_tree`` result: @@ -54,6 +56,14 @@ when some logic is required to select the tag to assign to a ``to_yaml_tree`` re `Converter.select_tag` - an optional method that accepts a complex Python object and a list candidate tags and returns the tag that should be used to serialize the object. +`Converter.lazy` - a boolean attribute indicating if this converter accepts "lazy" objects +(those defined in `asdf.lazy_nodes`). This is mostly useful for container-like classes +(where the "lazy" objects can defer conversion of contained objects until they are accessed). 
+If a converter produces a generator, ``lazy`` should be set to ``False`` as asdf will need +to generate nodes further out the branch to fully resolve the object returned from the +generator. + + A simple example ================