add back validate_xml

tlambert03 · Jul 1, 2023 · 8eba2c7 · 8eba2c7
1 parent d1b4b52
commit 8eba2c7
Show file tree

Hide file tree

Showing 4 changed files with 199 additions and 82 deletions.
diff --git a/src/ome_types/__init__.py b/src/ome_types/__init__.py
@@ -5,7 +5,20 @@
 except PackageNotFoundError:
     __version__ = "unknown"
 
+from ome_types import model
 from ome_types._conversion import from_tiff, from_xml, to_dict, to_xml
 from ome_types.model import OME
+from ome_types.units import ureg
+from ome_types.validation import validate_xml
 
-__all__ = ["__version__", "OME", "from_xml", "from_tiff", "to_dict", "to_xml"]
+__all__ = [
+    "__version__",
+    "from_tiff",
+    "from_xml",
+    "model",
+    "OME",
+    "to_dict",
+    "to_xml",
+    "ureg",
+    "validate_xml",
+]
diff --git a/src/ome_types/validation.py b/src/ome_types/validation.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from contextlib import suppress
+from functools import lru_cache
+from pathlib import Path
+from typing import IO, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import xmlschema
+
+    XMLSourceType = str | bytes | Path | IO[str] | IO[bytes]
+
+NS_OME = r"{http://www.openmicroscopy.org/Schemas/OME/2016-06}"
+OME_2016_06_XSD = str(Path(__file__).parent / "ome-2016-06.xsd")
+
+
+class ValidationError(ValueError):
+    ...
+
+
+def validate_xml(xml: XMLSourceType, schema: Path | str | None = None) -> None:
+    """Validate XML against an XML Schema.
+
+    By default, will validate against the OME 2016-06 schema.
+    """
+    with suppress(ImportError):
+        return validate_xml_with_lxml(xml, schema)
+
+    with suppress(ImportError):
+        return validate_xml_with_xmlschema(xml, schema)
+
+    raise ImportError(  # pragma: no cover
+        "Validation requires either `lxml` or `xmlschema`. "
+        "Please pip install one of them."
+    ) from None
+
+
+def validate_xml_with_lxml(
+    xml: XMLSourceType, schema: Path | str | None = None
+) -> None:
+    """Validate XML against an XML Schema using lxml."""
+    from lxml import etree
+
+    tree = etree.parse(schema or OME_2016_06_XSD)  # noqa: S320
+    xmlschema = etree.XMLSchema(tree)
+
+    doc = etree.parse(xml)  # noqa: S320
+    if not xmlschema.validate(doc):
+        msg = f"Validation of {str(xml)[:20]!r} failed:"
+        for error in xmlschema.error_log:
+            print(dir(error))
+            msg += f"\n  - line {error.line}: {error.message}"
+        raise ValidationError(msg)
+
+
+def validate_xml_with_xmlschema(
+    xml: XMLSourceType, schema: Path | str | None = None
+) -> None:
+    """Validate XML against an XML Schema using xmlschema."""
+    from xmlschema.exceptions import XMLSchemaException
+
+    xmlschema = _get_XMLSchema(schema or OME_2016_06_XSD)
+    try:
+        xmlschema.validate(xml)
+    except XMLSchemaException as e:
+        raise ValidationError(str(e)) from None
+
+
+@lru_cache(maxsize=None)
+def _get_XMLSchema(schema: Path | str) -> xmlschema.XMLSchema:
+    import xmlschema
+
+    xml_schema = xmlschema.XMLSchema(schema)
+    # FIXME Hack to work around xmlschema poor support for keyrefs to
+    # substitution groups
+    ls_sgs = xml_schema.maps.substitution_groups[f"{NS_OME}LightSourceGroup"]
+    ls_id_maps = xml_schema.maps.identities[f"{NS_OME}LightSourceIDKey"]
+    ls_id_maps.elements = {e: None for e in ls_sgs}
+    return xml_schema
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -1,25 +1,22 @@
+from __future__ import annotations
+
 import pickle
 import re
 from functools import lru_cache
 from pathlib import Path
+from typing import TYPE_CHECKING, cast
 from xml.dom import minidom
 from xml.etree import ElementTree
 
 import pytest
 import xmlschema
 from pydantic import ValidationError
-from xmlschema.validators.exceptions import XMLSchemaValidationError
 
 import ome_types
 from ome_types import from_tiff, from_xml, model, to_xml
 
-ValidationErrors = [ValidationError, XMLSchemaValidationError]
-try:
-    from lxml.etree import XMLSchemaValidateError
-
-    ValidationErrors.append(XMLSchemaValidateError)
-except ImportError:
-    pass
+if TYPE_CHECKING:
+    from _pytest.mark.structures import ParameterSet
 
 TESTS = Path(__file__).parent
 DATA = TESTS / "data"
@@ -31,10 +28,7 @@
     "transformations-downgrade",
     "invalid_xml_annotation",
 }
-SHOULD_FAIL_ROUNDTRIP_LXML = {
-    "folders-simple-taxonomy",
-    "folders-larger-taxonomy",
-}
+
 SKIP_ROUNDTRIP = {
     # These have XMLAnnotations with extra namespaces and mixed content, which
     # the automated round-trip test code doesn't properly verify yet. So even
@@ -50,42 +44,19 @@
 OME_2016_06_XSD = str(Path(ome_types.__file__).parent / "ome-2016-06.xsd")
 
 
-@lru_cache(maxsize=None)
-def _get_schema() -> xmlschema.XMLSchema:
-    schema = xmlschema.XMLSchema(OME_2016_06_XSD)
-    # FIXME Hack to work around xmlschema poor support for keyrefs to
-    # substitution groups
-    ls_sgs = schema.maps.substitution_groups[f"{NS_OME}LightSourceGroup"]
-    ls_id_maps = schema.maps.identities[f"{NS_OME}LightSourceIDKey"]
-    ls_id_maps.elements = {e: None for e in ls_sgs}
-    return schema
-
-
-def mark_xfail(fname):
-    return pytest.param(
-        fname,
-        marks=pytest.mark.xfail(
-            strict=True, reason="Unexpected success. You fixed it!"
-        ),
-    )
-
-
-def mark_skip(fname):
-    return pytest.param(fname, marks=pytest.mark.skip)
-
-
-def true_stem(p):
+def true_stem(p: Path) -> str:
     return p.name.partition(".")[0]
 
 
 all_xml = list(DATA.glob("*.ome.xml"))
-xml_roundtrip = []
+xml_roundtrip: list[Path | ParameterSet] = []
 for f in all_xml:
     stem = true_stem(f)
     if stem in SHOULD_FAIL_ROUNDTRIP:
-        f = mark_xfail(f)
+        mrk = pytest.mark.xfail(strict=True, reason="Unexpected success. You fixed it!")
+        f = pytest.param(f, marks=mrk)  # type: ignore
     elif stem in SKIP_ROUNDTRIP:
-        f = mark_skip(f)
+        f = pytest.param(f, marks=pytest.mark.skip)  # type: ignore
     xml_roundtrip.append(f)
 
 
@@ -98,7 +69,7 @@ def test_from_valid_xml(valid_xml: Path, validate: bool) -> None:
 
 
 @pytest.mark.parametrize("validate", validate)
-def test_from_invalid_xml(invalid_xml: Path, validate) -> None:
+def test_from_invalid_xml(invalid_xml: Path, validate: bool) -> None:
     if validate:
         with pytest.raises(ValidationError):
             from_xml(invalid_xml, validate=validate)
@@ -118,45 +89,7 @@ def test_from_tiff(validate: bool) -> None:
     assert ome.images[0].pixels.channels[0].samples_per_pixel == 1
 
 
-def _sort_elements(element, recursive=True):
-    # Sort the child elements alphabetically by their tag name
-    sorted_children = sorted(element, key=lambda child: child.tag)
-
-    # Replace the existing child elements with the sorted ones
-    element[:] = sorted_children
-
-    # Recursively sort child elements for each subelement
-    if recursive:
-        for child in element:
-            _sort_elements(child)
-
-
-def _canonicalize(xml: str, strip_empty: bool) -> str:
-    schema = _get_schema()
-
-    d = schema.decode(xml, use_defaults=True)
-    # Strip extra whitespace in the schemaLocation value.
-    d["@xsi:schemaLocation"] = re.sub(r"\s+", " ", d["@xsi:schemaLocation"])
-    root = schema.encode(d, path=NS_OME + "OME", use_defaults=True)
-    # These are the tags that appear in the example files with empty
-    # content. Since our round-trip will drop empty elements, we'll need to
-    # strip them from the "original" documents before comparison.
-    if strip_empty:
-        for tag in ("Description", "LightPath", "Map"):
-            for e in root.findall(f".//{NS_OME}{tag}[.='']..."):
-                e.remove(e.find(f"{NS_OME}{tag}"))
-    # ET.canonicalize can't handle an empty namespace so we need to
-    # re-register the OME namespace with an actual name before calling
-    # tostring.
-    _sort_elements(root)
-    ElementTree.register_namespace("ome", URI_OME)
-    xml_out = ElementTree.tostring(root, "unicode")
-    xml_out = ElementTree.canonicalize(xml_out, strip_text=True)
-    xml_out = minidom.parseString(xml_out).toprettyxml(indent="  ")
-    return xml_out
-
-
-def test_roundtrip(valid_xml: Path):
+def test_roundtrip(valid_xml: Path) -> None:
     """Ensure we can losslessly round-trip XML through the model and back."""
     if true_stem(valid_xml) in SKIP_ROUNDTRIP:
         pytest.xfail("known issues with canonicalization")
@@ -173,7 +106,7 @@ def test_roundtrip(valid_xml: Path):
         raise AssertionError
 
 
-def test_roundtrip_inverse(valid_xml, tmp_path: Path):
+def test_roundtrip_inverse(valid_xml: Path, tmp_path: Path) -> None:
     """both variants have been touched by the model, here..."""
     ome1 = from_xml(valid_xml)
 
@@ -234,3 +167,66 @@ def test_refs() -> None:
 
 def test_with_ome_ns() -> None:
     assert from_xml(DATA / "ome_ns.ome.xml").experimenters
+
+
+# ########## Canonicalization utils ##########
+
+
+@lru_cache(maxsize=None)
+def _get_schema() -> xmlschema.XMLSchema:
+    schema = xmlschema.XMLSchema(OME_2016_06_XSD)
+    # FIXME Hack to work around xmlschema poor support for keyrefs to
+    # substitution groups
+    ls_sgs = schema.maps.substitution_groups[f"{NS_OME}LightSourceGroup"]
+    ls_id_maps = schema.maps.identities[f"{NS_OME}LightSourceIDKey"]
+    ls_id_maps.elements = {e: None for e in ls_sgs}
+    return schema
+
+
+def _sort_elements(element, recursive=True):
+    # Sort the child elements alphabetically by their tag name
+    sorted_children = sorted(element, key=lambda child: child.tag)
+
+    # Replace the existing child elements with the sorted ones
+    element[:] = sorted_children
+
+    # Recursively sort child elements for each subelement
+    if recursive:
+        for child in element:
+            _sort_elements(child)
+
+
+def _add_defaults(xml: str) -> ElementTree.Element:
+    # we use xmlschema to encode then decode the xml to add missing
+    # default values for the sake of canonicalization
+    # this is the ONLY remaining thing that xmlschema is used for
+    # (and only in testing).
+
+    schema = _get_schema()
+    d = cast(dict, schema.decode(xml, use_defaults=True))
+    # Strip extra whitespace in the schemaLocation value.
+    d["@xsi:schemaLocation"] = re.sub(r"\s+", " ", d["@xsi:schemaLocation"])
+    return schema.encode(d, path=NS_OME + "OME", use_defaults=True)  # type: ignore
+
+
+def _canonicalize(xml: str, strip_empty: bool) -> str:
+    root = _add_defaults(xml)
+
+    # These are the tags that appear in the example files with empty
+    # content. Since our round-trip will drop empty elements, we'll need to
+    # strip them from the "original" documents before comparison.
+    if strip_empty:
+        for tag in ("Description", "LightPath", "Map"):
+            for e in root.findall(f".//{NS_OME}{tag}[.='']..."):
+                e.remove(e.find(f"{NS_OME}{tag}"))  # type: ignore
+
+    # ET.canonicalize can't handle an empty namespace so we need to
+    # re-register the OME namespace with an actual name before calling
+    # tostring.
+    _sort_elements(root)
+
+    ElementTree.register_namespace("ome", URI_OME)
+    xml_out = ElementTree.tostring(root, "unicode")
+    xml_out = ElementTree.canonicalize(xml_out, strip_text=True)
+    xml_out = minidom.parseString(xml_out).toprettyxml(indent="  ")
+    return xml_out
diff --git a/tests/test_validation.py b/tests/test_validation.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+import pytest
+
+from ome_types import (
+    validate_xml,  # noqa: F401 ... just to make sure it's importable
+    validation,
+)
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+VALIDATORS: dict[str, Callable] = {
+    "lxml": validation.validate_xml_with_lxml,
+    "xmlschema": validation.validate_xml_with_xmlschema,
+}
+
+
+@pytest.mark.parametrize("backend", ["lxml", "xmlschema"])
+def test_validation_good(valid_xml: Path, backend: str) -> None:
+    VALIDATORS[backend](valid_xml)
+
+
+@pytest.mark.parametrize("backend", ["lxml", "xmlschema"])
+def test_validation_raises(invalid_xml: Path, backend: str) -> None:
+    with pytest.raises(validation.ValidationError):
+        VALIDATORS[backend](invalid_xml)