Skip to content

Commit

Permalink
add back validate_xml
Browse files Browse the repository at this point in the history
  • Loading branch information
tlambert03 committed Jul 1, 2023
1 parent d1b4b52 commit 8eba2c7
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 82 deletions.
15 changes: 14 additions & 1 deletion src/ome_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,20 @@
except PackageNotFoundError:
__version__ = "unknown"

from ome_types import model
from ome_types._conversion import from_tiff, from_xml, to_dict, to_xml
from ome_types.model import OME
from ome_types.units import ureg
from ome_types.validation import validate_xml

__all__ = ["__version__", "OME", "from_xml", "from_tiff", "to_dict", "to_xml"]
__all__ = [
"__version__",
"from_tiff",
"from_xml",
"model",
"OME",
"to_dict",
"to_xml",
"ureg",
"validate_xml",
]
79 changes: 79 additions & 0 deletions src/ome_types/validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from __future__ import annotations

from contextlib import suppress
from functools import lru_cache
from pathlib import Path
from typing import IO, TYPE_CHECKING

if TYPE_CHECKING:
import xmlschema

XMLSourceType = str | bytes | Path | IO[str] | IO[bytes]

NS_OME = r"{http://www.openmicroscopy.org/Schemas/OME/2016-06}"
OME_2016_06_XSD = str(Path(__file__).parent / "ome-2016-06.xsd")


class ValidationError(ValueError):
...


def validate_xml(xml: XMLSourceType, schema: Path | str | None = None) -> None:
"""Validate XML against an XML Schema.
By default, will validate against the OME 2016-06 schema.
"""
with suppress(ImportError):
return validate_xml_with_lxml(xml, schema)

with suppress(ImportError):
return validate_xml_with_xmlschema(xml, schema)

raise ImportError( # pragma: no cover
"Validation requires either `lxml` or `xmlschema`. "
"Please pip install one of them."
) from None


def validate_xml_with_lxml(
xml: XMLSourceType, schema: Path | str | None = None
) -> None:
"""Validate XML against an XML Schema using lxml."""
from lxml import etree

tree = etree.parse(schema or OME_2016_06_XSD) # noqa: S320
xmlschema = etree.XMLSchema(tree)

doc = etree.parse(xml) # noqa: S320
if not xmlschema.validate(doc):
msg = f"Validation of {str(xml)[:20]!r} failed:"
for error in xmlschema.error_log:
print(dir(error))
msg += f"\n - line {error.line}: {error.message}"
raise ValidationError(msg)


def validate_xml_with_xmlschema(
xml: XMLSourceType, schema: Path | str | None = None
) -> None:
"""Validate XML against an XML Schema using xmlschema."""
from xmlschema.exceptions import XMLSchemaException

xmlschema = _get_XMLSchema(schema or OME_2016_06_XSD)
try:
xmlschema.validate(xml)
except XMLSchemaException as e:
raise ValidationError(str(e)) from None


@lru_cache(maxsize=None)
def _get_XMLSchema(schema: Path | str) -> xmlschema.XMLSchema:
import xmlschema

xml_schema = xmlschema.XMLSchema(schema)
# FIXME Hack to work around xmlschema poor support for keyrefs to
# substitution groups
ls_sgs = xml_schema.maps.substitution_groups[f"{NS_OME}LightSourceGroup"]
ls_id_maps = xml_schema.maps.identities[f"{NS_OME}LightSourceIDKey"]
ls_id_maps.elements = {e: None for e in ls_sgs}
return xml_schema
158 changes: 77 additions & 81 deletions tests/test_model.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,22 @@
from __future__ import annotations

import pickle
import re
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, cast
from xml.dom import minidom
from xml.etree import ElementTree

import pytest
import xmlschema
from pydantic import ValidationError
from xmlschema.validators.exceptions import XMLSchemaValidationError

import ome_types
from ome_types import from_tiff, from_xml, model, to_xml

ValidationErrors = [ValidationError, XMLSchemaValidationError]
try:
from lxml.etree import XMLSchemaValidateError

ValidationErrors.append(XMLSchemaValidateError)
except ImportError:
pass
if TYPE_CHECKING:
from _pytest.mark.structures import ParameterSet

TESTS = Path(__file__).parent
DATA = TESTS / "data"
Expand All @@ -31,10 +28,7 @@
"transformations-downgrade",
"invalid_xml_annotation",
}
SHOULD_FAIL_ROUNDTRIP_LXML = {
"folders-simple-taxonomy",
"folders-larger-taxonomy",
}

SKIP_ROUNDTRIP = {
# These have XMLAnnotations with extra namespaces and mixed content, which
# the automated round-trip test code doesn't properly verify yet. So even
Expand All @@ -50,42 +44,19 @@
OME_2016_06_XSD = str(Path(ome_types.__file__).parent / "ome-2016-06.xsd")


@lru_cache(maxsize=None)
def _get_schema() -> xmlschema.XMLSchema:
schema = xmlschema.XMLSchema(OME_2016_06_XSD)
# FIXME Hack to work around xmlschema poor support for keyrefs to
# substitution groups
ls_sgs = schema.maps.substitution_groups[f"{NS_OME}LightSourceGroup"]
ls_id_maps = schema.maps.identities[f"{NS_OME}LightSourceIDKey"]
ls_id_maps.elements = {e: None for e in ls_sgs}
return schema


def mark_xfail(fname):
return pytest.param(
fname,
marks=pytest.mark.xfail(
strict=True, reason="Unexpected success. You fixed it!"
),
)


def mark_skip(fname):
return pytest.param(fname, marks=pytest.mark.skip)


def true_stem(p):
def true_stem(p: Path) -> str:
return p.name.partition(".")[0]


all_xml = list(DATA.glob("*.ome.xml"))
xml_roundtrip = []
xml_roundtrip: list[Path | ParameterSet] = []
for f in all_xml:
stem = true_stem(f)
if stem in SHOULD_FAIL_ROUNDTRIP:
f = mark_xfail(f)
mrk = pytest.mark.xfail(strict=True, reason="Unexpected success. You fixed it!")
f = pytest.param(f, marks=mrk) # type: ignore
elif stem in SKIP_ROUNDTRIP:
f = mark_skip(f)
f = pytest.param(f, marks=pytest.mark.skip) # type: ignore
xml_roundtrip.append(f)


Expand All @@ -98,7 +69,7 @@ def test_from_valid_xml(valid_xml: Path, validate: bool) -> None:


@pytest.mark.parametrize("validate", validate)
def test_from_invalid_xml(invalid_xml: Path, validate) -> None:
def test_from_invalid_xml(invalid_xml: Path, validate: bool) -> None:
if validate:
with pytest.raises(ValidationError):
from_xml(invalid_xml, validate=validate)
Expand All @@ -118,45 +89,7 @@ def test_from_tiff(validate: bool) -> None:
assert ome.images[0].pixels.channels[0].samples_per_pixel == 1


def _sort_elements(element, recursive=True):
# Sort the child elements alphabetically by their tag name
sorted_children = sorted(element, key=lambda child: child.tag)

# Replace the existing child elements with the sorted ones
element[:] = sorted_children

# Recursively sort child elements for each subelement
if recursive:
for child in element:
_sort_elements(child)


def _canonicalize(xml: str, strip_empty: bool) -> str:
schema = _get_schema()

d = schema.decode(xml, use_defaults=True)
# Strip extra whitespace in the schemaLocation value.
d["@xsi:schemaLocation"] = re.sub(r"\s+", " ", d["@xsi:schemaLocation"])
root = schema.encode(d, path=NS_OME + "OME", use_defaults=True)
# These are the tags that appear in the example files with empty
# content. Since our round-trip will drop empty elements, we'll need to
# strip them from the "original" documents before comparison.
if strip_empty:
for tag in ("Description", "LightPath", "Map"):
for e in root.findall(f".//{NS_OME}{tag}[.='']..."):
e.remove(e.find(f"{NS_OME}{tag}"))
# ET.canonicalize can't handle an empty namespace so we need to
# re-register the OME namespace with an actual name before calling
# tostring.
_sort_elements(root)
ElementTree.register_namespace("ome", URI_OME)
xml_out = ElementTree.tostring(root, "unicode")
xml_out = ElementTree.canonicalize(xml_out, strip_text=True)
xml_out = minidom.parseString(xml_out).toprettyxml(indent=" ")
return xml_out


def test_roundtrip(valid_xml: Path):
def test_roundtrip(valid_xml: Path) -> None:
"""Ensure we can losslessly round-trip XML through the model and back."""
if true_stem(valid_xml) in SKIP_ROUNDTRIP:
pytest.xfail("known issues with canonicalization")
Expand All @@ -173,7 +106,7 @@ def test_roundtrip(valid_xml: Path):
raise AssertionError


def test_roundtrip_inverse(valid_xml, tmp_path: Path):
def test_roundtrip_inverse(valid_xml: Path, tmp_path: Path) -> None:
"""both variants have been touched by the model, here..."""
ome1 = from_xml(valid_xml)

Expand Down Expand Up @@ -234,3 +167,66 @@ def test_refs() -> None:

def test_with_ome_ns() -> None:
assert from_xml(DATA / "ome_ns.ome.xml").experimenters


# ########## Canonicalization utils ##########


@lru_cache(maxsize=None)
def _get_schema() -> xmlschema.XMLSchema:
schema = xmlschema.XMLSchema(OME_2016_06_XSD)
# FIXME Hack to work around xmlschema poor support for keyrefs to
# substitution groups
ls_sgs = schema.maps.substitution_groups[f"{NS_OME}LightSourceGroup"]
ls_id_maps = schema.maps.identities[f"{NS_OME}LightSourceIDKey"]
ls_id_maps.elements = {e: None for e in ls_sgs}
return schema


def _sort_elements(element, recursive=True):
# Sort the child elements alphabetically by their tag name
sorted_children = sorted(element, key=lambda child: child.tag)

# Replace the existing child elements with the sorted ones
element[:] = sorted_children

# Recursively sort child elements for each subelement
if recursive:
for child in element:
_sort_elements(child)


def _add_defaults(xml: str) -> ElementTree.Element:
# we use xmlschema to encode then decode the xml to add missing
# default values for the sake of canonicalization
# this is the ONLY remaining thing that xmlschema is used for
# (and only in testing).

schema = _get_schema()
d = cast(dict, schema.decode(xml, use_defaults=True))
# Strip extra whitespace in the schemaLocation value.
d["@xsi:schemaLocation"] = re.sub(r"\s+", " ", d["@xsi:schemaLocation"])
return schema.encode(d, path=NS_OME + "OME", use_defaults=True) # type: ignore


def _canonicalize(xml: str, strip_empty: bool) -> str:
root = _add_defaults(xml)

# These are the tags that appear in the example files with empty
# content. Since our round-trip will drop empty elements, we'll need to
# strip them from the "original" documents before comparison.
if strip_empty:
for tag in ("Description", "LightPath", "Map"):
for e in root.findall(f".//{NS_OME}{tag}[.='']..."):
e.remove(e.find(f"{NS_OME}{tag}")) # type: ignore

# ET.canonicalize can't handle an empty namespace so we need to
# re-register the OME namespace with an actual name before calling
# tostring.
_sort_elements(root)

ElementTree.register_namespace("ome", URI_OME)
xml_out = ElementTree.tostring(root, "unicode")
xml_out = ElementTree.canonicalize(xml_out, strip_text=True)
xml_out = minidom.parseString(xml_out).toprettyxml(indent=" ")
return xml_out
29 changes: 29 additions & 0 deletions tests/test_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Callable

import pytest

from ome_types import (
validate_xml, # noqa: F401 ... just to make sure it's importable
validation,
)

if TYPE_CHECKING:
from pathlib import Path

VALIDATORS: dict[str, Callable] = {
"lxml": validation.validate_xml_with_lxml,
"xmlschema": validation.validate_xml_with_xmlschema,
}


@pytest.mark.parametrize("backend", ["lxml", "xmlschema"])
def test_validation_good(valid_xml: Path, backend: str) -> None:
VALIDATORS[backend](valid_xml)


@pytest.mark.parametrize("backend", ["lxml", "xmlschema"])
def test_validation_raises(invalid_xml: Path, backend: str) -> None:
with pytest.raises(validation.ValidationError):
VALIDATORS[backend](invalid_xml)

0 comments on commit 8eba2c7

Please sign in to comment.