diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9e20ea53..2e3de94c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -39,6 +39,7 @@ repos:
     - id: check-merge-conflict
     - id: check-symlinks # Symlinks that don't point to anything?
    - id: check-yaml # Check Yaml file syntax
+      args: [--allow-multiple-documents]
     - id: debug-statements # Avoid commiting debug/breakpoints
     - id: end-of-file-fixer # Normalise on exactly one newline
     - id: fix-byte-order-marker # No UTF-8 byte order marks
diff --git a/eo3/__init__.py b/eo3/__init__.py
index 209b3d08..c107b4a8 100644
--- a/eo3/__init__.py
+++ b/eo3/__init__.py
@@ -1,20 +1,15 @@
 from ._version import get_versions
-from .assemble import IncompleteDatasetError
-from .images import GridSpec, ValidDataMethod
-from .model import Eo3DatasetDocBase
-from .properties import Eo3DictBase
+from .fields import Range
+from .model import DatasetMetadata
 
-REPO_URL = "https://github.com/GeoscienceAustralia/eo-datasets.git"
+REPO_URL = "https://github.com/opendatacube/eo3.git"
 
 __version__ = get_versions()["version"]
 del get_versions
 
 __all__ = (
-    "Eo3DatasetDocBase",
-    "Eo3DictBase",
-    "GridSpec",
-    "IncompleteDatasetError",
+    "DatasetMetadata",
+    "Range",
     "REPO_URL",
-    "ValidDataMethod",
     "__version__",
 )
diff --git a/eo3/assemble.py b/eo3/assemble.py
deleted file mode 100644
index 90671b30..00000000
--- a/eo3/assemble.py
+++ /dev/null
@@ -1,146 +0,0 @@
-"""
-API for easily writing an ODC Dataset
-"""
-from pathlib import PosixPath
-from urllib.parse import urlsplit
-
-from eo3.uris import uri_resolve
-from eo3.validation_msg import ValidationMessage
-
-
-class AssemblyError(Exception):
-    pass
-
-
-class IncompleteDatasetError(Exception):
-    """
-    Raised when a dataset is missing essential things and so cannot be written.
-
-    (such as mandatory metadata)
-    """
-
-    def __init__(self, validation: ValidationMessage) -> None:
-        self.validation = validation
-
-
-class IncompleteDatasetWarning(UserWarning):
-    """A non-critical warning for invalid or incomplete metadata"""
-
-    def __init__(self, validation: ValidationMessage) -> None:
-        self.validation = validation
-
-    def __str__(self) -> str:
-        return str(self.validation)
-
-
-def _validate_property_name(name: str):
-    """
-    >>> _validate_property_name('eo:gsd')
-    >>> _validate_property_name('thumbnail:full_resolution')
-    >>> _validate_property_name('full resolution')
-    Traceback (most recent call last):
-    ...
-    ValueError: Not a valid property name 'full resolution' (must be alphanumeric with colons or underscores)
-    >>> _validate_property_name('Mr Sprinkles')
-    Traceback (most recent call last):
-    ...
-    ValueError: Not a valid property name 'Mr Sprinkles' (must be alphanumeric with colons or underscores)
-    """
-    if not name.replace(":", "").isidentifier():
-        raise ValueError(
-            f"Not a valid property name {name!r} "
-            "(must be alphanumeric with colons or underscores)"
-        )
-
-
-def _default_metadata_path(dataset_url: str):
-    """
-    The default metadata path for a given dataset location url.
- - By default, we put a sibling file with extension 'odc-metadata.yaml': - >>> _default_metadata_path('file:///tmp/ls7_nbar_20120403_c1/esri-scene.stac-item.json') - 'file:///tmp/ls7_nbar_20120403_c1/esri-scene.odc-metadata.yaml' - >>> _default_metadata_path('s3://deafrica-data/jaxa/alos_palsar_mosaic/2017/N05E040/N05E040_2017.tif') - 's3://deafrica-data/jaxa/alos_palsar_mosaic/2017/N05E040/N05E040_2017.odc-metadata.yaml' - >>> _default_metadata_path('file:///tmp/ls7_nbar_20120403_c1/my-dataset.tar.gz') - 'file:///tmp/ls7_nbar_20120403_c1/my-dataset.odc-metadata.yaml' - - Or, if a directory, we place one inside: - >>> _default_metadata_path('file:///tmp/ls7_nbar_20120403_c1/') - 'file:///tmp/ls7_nbar_20120403_c1/odc-metadata.yaml' - - If a tar/zip file, place it alongside. - >>> _default_metadata_path('tar:///g/data/v10/somewhere/my-dataset.tar!/') - 'file:///g/data/v10/somewhere/my-dataset.odc-metadata.yaml' - >>> _default_metadata_path('zip:///g/data/v10/landsat-dataset.zip!') - 'file:///g/data/v10/landsat-dataset.odc-metadata.yaml' - - Unless it's already a metadata path: - >>> _default_metadata_path('file:///tmp/ls7_nbar_20120403_c1/odc-metadata.yaml') - 'file:///tmp/ls7_nbar_20120403_c1/odc-metadata.yaml' - """ - # Already a metadata url? - if dataset_url.endswith("odc-metadata.yaml"): - return dataset_url - - # If a tar URL, convert to file before proceding. - u = urlsplit(dataset_url) - path = PosixPath(u.path) - if u.scheme in ("tar", "zip"): - dataset_url = f"file://{path.as_posix()}" - - # A directory, place a default name inside. - if dataset_url.endswith("/"): - return f"{dataset_url}odc-metadata.yaml" - - # Otherwise a sibling file to the dataset file. - base_url, file_name = dataset_url.rsplit("/", maxsplit=1) - file_stem = file_name.split(".")[0] - return uri_resolve(dataset_url, f"{base_url}/{file_stem}.odc-metadata.yaml") - - -def relative_url(base: str, offset: str, allow_absolute=False): - """ - >>> relative_url('file:///tmp/dataset/odc-metadata.yaml', 'file:///tmp/dataset/my-image.tif') - 'my-image.tif' - >>> relative_url('file:///tmp/dataset/odc-metadata.yaml', 'file:///tmp/dataset/images/my-image.tif') - 'images/my-image.tif' - >>> relative_url( - ... 'https://example.test/dataset/odc-metadata.yaml', - ... 'https://example.test/dataset/images/my-image.tif' - ... ) - 'images/my-image.tif' - >>> # Outside the base directory - >>> relative_url('https://example.test/dataset/odc-metadata.yaml', 'https://example.test/my-image.tif') - Traceback (most recent call last): - ... - ValueError: Absolute paths are not allowed, and file 'https://example.test/my-image.tif' is outside location \ -'https://example.test/dataset/odc-metadata.yaml' - >>> # Matching paths, different hosts. - >>> relative_url('https://example.test/odc-metadata.yaml', 'https://example2.test/my-image.tif') - Traceback (most recent call last): - ... 
- ValueError: Absolute paths are not allowed, and file 'https://example2.test/my-image.tif' is outside location \ -'https://example.test/odc-metadata.yaml' - """ - base_parts = urlsplit(base) - offset_parts = urlsplit(offset) - if not allow_absolute: - if (base_parts.hostname, base_parts.scheme) != ( - offset_parts.hostname, - offset_parts.scheme, - ): - raise ValueError( - f"Absolute paths are not allowed, and file {offset!r} is outside location {base!r}" - ) - - base_dir, _ = base_parts.path.rsplit("/", 1) - try: - return PosixPath(offset_parts.path).relative_to(base_dir).as_posix() - except ValueError: - if not allow_absolute: - raise ValueError( - f"Absolute paths are not allowed, and file {offset!r} is outside location {base!r}" - ) - # We can't make it relative, return the absolute. - return offset diff --git a/eo3/documents.py b/eo3/documents.py deleted file mode 100644 index c971c678..00000000 --- a/eo3/documents.py +++ /dev/null @@ -1,295 +0,0 @@ -""" -Common methods for UI code. -""" - -import gzip -import json -import os -import posixpath -from pathlib import Path, PurePath -from typing import Dict, Generator, Tuple -from urllib.parse import urlparse - -from boltons import iterutils - -from eo3 import serialise - -_DOCUMENT_EXTENSIONS = (".yaml", ".yml", ".json") -_COMPRESSION_EXTENSIONS = ("", ".gz") - -# Both compressed (*.gz) and uncompressed. -_ALL_SUPPORTED_EXTENSIONS = tuple( - doc_type + compression_type - for doc_type in _DOCUMENT_EXTENSIONS - for compression_type in _COMPRESSION_EXTENSIONS -) - -DEFAULT_SYSTEM_NAMES = ("odc-metadata", "agdc-md") - - -def is_supported_document_type(path): - """ - Does a document path look like a supported type? - :type path: pathlib.Path - :rtype: bool - >>> from pathlib import Path - >>> is_supported_document_type(Path('/tmp/something.yaml')) - True - >>> is_supported_document_type(Path('/tmp/something.YML')) - True - >>> is_supported_document_type(Path('/tmp/something.yaml.gz')) - True - >>> is_supported_document_type(Path('/tmp/something.tif')) - False - >>> is_supported_document_type(Path('/tmp/something.tif.gz')) - False - """ - return any( - [str(path).lower().endswith(suffix) for suffix in _ALL_SUPPORTED_EXTENSIONS] - ) - - -def find_metadata_path(dataset_path, system_names=None): - """ - Find a metadata path for a given input/dataset path. - - :type dataset_path: pathlib.Path - :rtype: Path - """ - - if system_names is None: - system_names = DEFAULT_SYSTEM_NAMES - # They may have given us a metadata file directly. - if dataset_path.is_file() and is_supported_document_type(dataset_path): - return dataset_path - - for system_name in system_names: - # Otherwise there may be a sibling file with appended suffix '.ga-md.yaml'. - expected_name = dataset_path.parent.joinpath( - f"{dataset_path.stem}.{system_name}" - ) - found = _find_any_metadata_suffix(expected_name) - if found: - return found - - if dataset_path.is_dir(): - # Eo3-style. - for m in dataset_path.glob("*.odc-metadata.*"): - return m - - for system_name in "agdc", "ga": - # Otherwise if it's a directory, there may be an 'ga-metadata.yaml' file describing all contained datasets. - expected_name = dataset_path.joinpath(system_name + "-metadata") - found = _find_any_metadata_suffix(expected_name) - if found: - return found - - return None - - -def _find_any_metadata_suffix(path): - """ - Find any metadata files that exist with the given file name/path. 
- (supported suffixes are tried on the name) - :type path: pathlib.Path - """ - existing_paths = list( - filter(is_supported_document_type, path.parent.glob(path.name + "*")) - ) - if not existing_paths: - return None - - if len(existing_paths) > 1: - raise ValueError(f"Multiple matched metadata files: {existing_paths!r}") - - return existing_paths[0] - - -def find_and_read_documents( - *paths: Path, system_names=None -) -> Generator[Tuple[Path, Dict], None, None]: - # TODO EODATASETS: default system_names no longer include 'ga-md' - # Scan all paths immediately so we can fail fast if some are wrong. - metadata_paths = [ - (path, find_metadata_path(path, system_names=system_names)) for path in paths - ] - - missing_paths = [path for (path, md) in metadata_paths if md is None] - if missing_paths: - raise ValueError( - f"No metadata found for input path{'s' if len(missing_paths) > 1 else ''}: " - f"{', '.join(map(str, missing_paths))}" - ) - - for input_path, metadata_path in metadata_paths: - yield from read_documents(metadata_path) - - -def read_documents(*paths: Path) -> Generator[Tuple[Path, Dict], None, None]: - """ - Read & parse documents from the filesystem (yaml or json). - - Note that a single yaml file can contain multiple documents. - """ - for path in paths: - suffix = path.suffix.lower() - - # If compressed, open as gzip stream. - opener = open - if suffix == ".gz": - suffix = path.suffixes[-2].lower() - opener = gzip.open - - with opener(str(path), "r") as f: - if suffix in (".yaml", ".yml"): - for parsed_doc in serialise.loads_yaml(f): - yield path, parsed_doc - elif suffix == ".json": - yield path, json.load(f) - else: - raise ValueError( - "Unknown document type for {}; expected one of {!r}.".format( - path.name, _ALL_SUPPORTED_EXTENSIONS - ) - ) - - -def docpath_set(doc, path, value): - """ - Set a value in a document using a path (sequence of keys). - - (It's designed to mirror `boltons.iterutils.get_path()` and related methods) - - >>> d = {'a': 1} - >>> docpath_set(d, ['a'], 2) - >>> d - {'a': 2} - >>> d = {'a':{'b':{'c': 1}}} - >>> docpath_set(d, ['a', 'b', 'c'], 2) - >>> d - {'a': {'b': {'c': 2}}} - >>> d = {} - >>> docpath_set(d, ['a'], 2) - >>> d - {'a': 2} - >>> d = {} - >>> docpath_set(d, ['a', 'b'], 2) - Traceback (most recent call last): - ... - KeyError: 'a' - >>> d - {} - >>> docpath_set(d, [], 2) - Traceback (most recent call last): - ... - ValueError: Cannot set a value to an empty path - """ - if not path: - raise ValueError("Cannot set a value to an empty path") - - d = doc - for part in path[:-1]: - d = d[part] - - d[path[-1]] = value - - -def make_paths_relative( - doc: Dict, base_directory: PurePath, allow_paths_outside_base=False -): - """ - Find all pathlib.Path values in a document structure and make them relative to the given path. - - >>> from copy import deepcopy - >>> base = PurePath('/tmp/basket') - >>> doc = {'id': 1, 'fruits': [{'apple': PurePath('/tmp/basket/fruits/apple.txt')}]} - >>> make_paths_relative(doc, base) - >>> doc - {'id': 1, 'fruits': [{'apple': 'fruits/apple.txt'}]} - >>> # No change if repeated. (relative paths still relative) - >>> previous = deepcopy(doc) - >>> make_paths_relative(doc, base) - >>> doc == previous - True - >>> # Relative pathlibs also become relative strings for consistency. 
- >>> doc = {'villains': PurePath('the-baron.txt')} - >>> make_paths_relative(doc, base) - >>> doc - {'villains': 'the-baron.txt'} - """ - for doc_path, value in iterutils.research( - doc, lambda p, k, v: isinstance(v, PurePath) - ): - value: PurePath - value = relative_path( - value, base_directory, allow_paths_outside_base=allow_paths_outside_base - ) - docpath_set(doc, doc_path, value.as_posix()) - - -def relative_url(value: str, base: str, allow_paths_outside_base=False) -> str: - """ - Make a single url relative to the base url if it is inside it. - - By default, will throw a ValueError if not able to make it relative to the path. - - - >>> relative_url('file:///g/data/v10/0/2015/blue.jpg', 'file:///g/data/v10/0/2015/odc-metadata.yaml') - 'blue.jpg' - >>> relative_url('https://example.test/2015/images/blue.jpg', 'https://example.test/2015/odc-metadata.yaml') - 'images/blue.jpg' - >>> relative_url('file:///g/data/v10/0/2018/blue.jpg', 'file:///g/data/v10/0/2015/odc-metadata.yaml') - Traceback (most recent call last): - ... - ValueError: Path 'file:///g/data/v10/0/2018/blue.jpg' is outside path 'file:///g/data/v10/0/2015/odc-metadata.yaml'\ - (allow_paths_outside_base=False) - """ - - if not value: - return value - - if not value.startswith(base) and not value.startswith(os.path.dirname(base)): - if not allow_paths_outside_base: - raise ValueError( - f"Path {value!r} is outside path {base!r} " - f"(allow_paths_outside_base={allow_paths_outside_base})" - ) - return value - - return _make_relurl(value, base) - - -def _make_relurl(target: str, base: str) -> str: - base = urlparse(base) - target = urlparse(target) - if base.netloc != target.netloc: - raise ValueError("target and base netlocs do not match") - base_dir = "." + posixpath.dirname(base.path) - target = "." + target.path - return posixpath.relpath(target, start=base_dir) - - -def relative_path( - value: PurePath, base_directory: PurePath, allow_paths_outside_base=False -) -> PurePath: - """ - Make a single path relative to the base directory if it is inside it. - - By default, will throw a ValueError if not able to make it relative to the path. 
- - >>> val = PurePath('/tmp/minimal-pkg/loch_ness_sightings_2019-07-04_blue.tif') - >>> base = PurePath('/tmp/minimal-pkg') - >>> relative_path(val, base).as_posix() - 'loch_ness_sightings_2019-07-04_blue.tif' - """ - if not value or not value.is_absolute(): - return value - - if base_directory not in value.parents: - if not allow_paths_outside_base: - raise ValueError( - f"Path {value.as_posix()!r} is outside path {base_directory.as_posix()!r} " - f"(allow_paths_outside_base={allow_paths_outside_base})" - ) - return value - return value.relative_to(base_directory) diff --git a/eo3/eo3_core.py b/eo3/eo3_core.py index 776fbf2a..efa849e4 100644 --- a/eo3/eo3_core.py +++ b/eo3/eo3_core.py @@ -1,6 +1,6 @@ """ Tools for working with EO3 metadata """ -# TODO CORE: copied from datacube.index.eo3 +import warnings from functools import reduce from typing import Any, Dict, Iterable, Optional, Tuple, Union from uuid import UUID @@ -16,10 +16,9 @@ polygon, ) -EO3_SCHEMA = "https://schemas.opendatacube.org/dataset" +from eo3.schema import ODC_DATASET_SCHEMA_URL -# This is should become eo3.models.GridDoc class EO3Grid: def __init__(self, grid: Dict[str, Any]) -> None: shape = grid.get("shape") @@ -28,6 +27,7 @@ def __init__(self, grid: Dict[str, Any]) -> None: if len(shape) != 2: raise ValueError("Grid shape must be two dimensional") self.shape: Tuple[int, int] = tuple(int(x) for x in shape) + xform = grid.get("transform") if xform is None: raise ValueError("Each grid must have a transform") @@ -35,11 +35,18 @@ def __init__(self, grid: Dict[str, Any]) -> None: raise ValueError("Grid transform must have 6 or 9 elements.") for elem in xform: if type(elem) not in (int, float): - raise ValueError("All grid transform elements must be numbers") + raise ValueError( + f"All grid transform elements must be numbers, got {type(elem)}" + ) if len(xform) == 9 and list(xform[6:]) != [0, 0, 1]: raise ValueError("Grid transform must be a valid Affine matrix") self.transform = Affine(*xform[:6]) + crs = grid.get("crs") + if crs is not None: + check_crs_epsg(crs) + self.crs = crs + def points(self, ring: bool = False) -> CoordList: ny, nx = (float(dim) for dim in self.shape) pts = [(0.0, 0.0), (nx, 0.0), (nx, ny), (0.0, ny)] @@ -52,6 +59,8 @@ def ref_points(self) -> Dict[str, Dict[str, float]]: return {n: dict(x=x, y=y) for n, (x, y) in zip(nn, self.points())} def polygon(self, crs: Optional[SomeCRS] = None) -> Geometry: + # use grid's own CRS if it was provided + crs = self.crs if self.crs is not None else crs return polygon(self.points(ring=True), crs=crs) @@ -66,7 +75,11 @@ def eo3_lonlat_bbox( return lonlat_bounds(valid_data, resolution=resolution) all_grids_extent = reduce( - lambda x, y: x.union(y), (grid.polygon(crs) for grid in grids) + lambda x, y: x.union(y), + ( + grid.polygon(grid.crs) if grid.crs is not None else grid.polygon(crs) + for grid in grids + ), ) return lonlat_bounds(all_grids_extent, resolution=resolution) @@ -119,6 +132,7 @@ def eo3_grid_spatial( crs = doc.get("crs", None) if crs is None or not gridspecs: raise ValueError("Input must have crs and grids.") + check_crs_epsg(crs) grids = {name: EO3Grid(grid_spec) for name, grid_spec in gridspecs.items()} grid = grids.get(grid_name) if not grid: @@ -154,7 +168,10 @@ def eo3_grid_spatial( def add_eo3_parts( doc: Dict[str, Any], resolution: Optional[float] = None ) -> Dict[str, Any]: - """Add spatial keys the DB requires to eo3 metadata""" + """Add spatial keys the DB required by eo3 metadata""" + # don't attempt to recalculate gs info if it already 
exists + if doc.get("grid_spatial"): + return doc return dict(**doc, **eo3_grid_spatial(doc, resolution=resolution)) @@ -174,7 +191,7 @@ def is_doc_eo3(doc: Dict[str, Any]) -> bool: if schema is None: return False - if schema == EO3_SCHEMA: + if schema == ODC_DATASET_SCHEMA_URL: return True # Otherwise it has an unknown schema. @@ -205,45 +222,57 @@ def is_doc_geo(doc: Dict[str, Any], check_eo3: bool = True) -> bool: def prep_eo3( - doc: Dict[str, Any], auto_skip: bool = False, resolution: Optional[float] = None + doc: Dict[str, Any], + resolution: Optional[float] = None, # can we remove this? + remap_lineage=True, ) -> Dict[str, Any]: """Modify spatial and lineage sections of eo3 metadata :param doc: input document - :param auto_skip: If true check if dataset is EO3 and if not - silently return input dataset without modifications + :param remap_lineage: If True (default) disambiguate lineage classifiers so that + source_id and classifier form a unique index (for indexes that DON'T + support external_lineage). + If False, leave lineage in the same format. """ if doc is None: return None - if auto_skip: - if not is_doc_eo3(doc): - return doc - def stringify(u: Optional[Union[str, UUID]]) -> Optional[str]: return u if isinstance(u, str) else str(u) if u else None doc["id"] = stringify(doc.get("id", None)) doc = add_eo3_parts(doc, resolution=resolution) - lineage = doc.pop("lineage", {}) - - def remap_lineage(name, uuids) -> Dict[str, Any]: - """Turn name, [uuid] -> {name: {id: uuid}}""" - if len(uuids) == 0: - return {} - if isinstance(uuids, dict) or isinstance(uuids[0], dict): - raise ValueError("Embedded lineage not supported for eo3 metadata types") - if len(uuids) == 1: - return {name: {"id": stringify(uuids[0])}} - - out = {} - for idx, uuid in enumerate(uuids, start=1): - out[name + str(idx)] = {"id": stringify(uuid)} - return out - - sources = {} - for name, uuids in lineage.items(): - sources.update(remap_lineage(name, uuids)) - - doc["lineage"] = dict(source_datasets=sources) + if remap_lineage: + lineage = doc.pop("lineage", {}) + + def lineage_remap(name, uuids) -> Dict[str, Any]: + """Turn name, [uuid] -> {name: {id: uuid}}""" + if len(uuids) == 0: + return {} + if isinstance(uuids, dict) or isinstance(uuids[0], dict): + raise ValueError( + "Embedded lineage not supported for eo3 metadata types" + ) + if len(uuids) == 1: + return {name: {"id": stringify(uuids[0])}} + + out = {} + for idx, uuid in enumerate(uuids, start=1): + out[name + str(idx)] = {"id": stringify(uuid)} + return out + + sources = {} + for name, uuids in lineage.items(): + sources.update(lineage_remap(name, uuids)) + + doc["lineage"] = dict(source_datasets=sources) return doc + + +def check_crs_epsg(crs): + """Check if CRS is WKT when it could be provided as EPSG (preferred)""" + crs = CRS(crs) + if crs.epsg is not None and not str(crs).startswith("EPSG"): + warnings.warn( + f"Prefer an EPSG code to a WKT when possible. (Can change CRS to 'epsg:{crs.epsg}')" + ) diff --git a/eo3/fields.py b/eo3/fields.py new file mode 100644 index 00000000..2b7a755c --- /dev/null +++ b/eo3/fields.py @@ -0,0 +1,245 @@ +# Core TODO: copied over from datacube.model.fields +"""Non-db specific implementation of metadata search fields. + +This allows extraction of fields of interest from dataset metadata document. 
+""" +import decimal +from collections import namedtuple +from typing import Any, Dict, List, Mapping + +import toolz # type: ignore[import] + +from eo3.utils import parse_time + +Range = namedtuple("Range", ("begin", "end")) + +# Allowed values for field 'type' (specified in a metadata type docuemnt) +_AVAILABLE_TYPE_NAMES = ( + "numeric-range", + "double-range", + "integer-range", + "datetime-range", + "string", + "numeric", + "double", + "integer", + "datetime", + "object", + # For backwards compatibility (alias for numeric-range) + "float-range", +) + +_TYPE_PARSERS = { + "string": str, + "double": float, + "integer": int, + "numeric": decimal.Decimal, + "datetime": parse_time, + "object": lambda x: x, +} + + +class Expression: + # No properties at the moment. These are built and returned by the + # DB driver (from Field methods), so they're mostly an opaque token. + + # A simple equals implementation for comparison in test code. + def __eq__(self, other) -> bool: + if self.__class__ != other.__class__: + return False + return self.__dict__ == other.__dict__ + + def evaluate(self, ctx): + raise NotImplementedError() + + +class SimpleEqualsExpression(Expression): + def __init__(self, field, value): + self.field = field + self.value = value + + def evaluate(self, ctx): + return self.field.extract(ctx) == self.value + + +class Field: + """ + A searchable field within a dataset/storage metadata document. + """ + + # type of field. + # If type is not specified, the field is a string + # This should always be one of _AVAILABLE_TYPE_NAMES + type_name = "string" + + def __init__(self, name: str, description: str): + self.name = name + + self.description = description + + # Does selecting this affect the output rows? + # (eg. Does this join other tables that aren't 1:1 with datasets.) + self.affects_row_selection = False + + if self.type_name not in _AVAILABLE_TYPE_NAMES: + raise ValueError(f"Invalid type name {self.type_name!r}") + + def __eq__(self, value) -> Expression: # type: ignore + """ + Is this field equal to a value? + + this returns an Expression object (hence type ignore above) + """ + raise NotImplementedError("equals expression") + + def between(self, low, high) -> Expression: + """ + Is this field in a range? 
+ """ + raise NotImplementedError("between expression") + + +class SimpleField(Field): + def __init__(self, offset, converter, type_name, name="", description=""): + self.offset = offset + self._converter = converter + self.type_name = type_name + super().__init__(name, description) + + def __eq__(self, value) -> Expression: # type: ignore[override] + return SimpleEqualsExpression(self, value) + + def extract(self, doc): + v = toolz.get_in(self.offset, doc, default=None) + if v is None: + return None + return self._converter(v) + + +class RangeField(Field): + def __init__( + self, min_offset, max_offset, base_converter, type_name, name="", description="" + ): + self.type_name = type_name + self._converter = base_converter + self.min_offset = min_offset + self.max_offset = max_offset + super().__init__(name, description) + + def extract(self, doc): + def extract_raw(paths): + vv = [toolz.get_in(p, doc, default=None) for p in paths] + return [self._converter(v) for v in vv if v is not None] + + v_min = extract_raw(self.min_offset) + v_max = extract_raw(self.max_offset) + + v_min = None if len(v_min) == 0 else min(v_min) + v_max = None if len(v_max) == 0 else max(v_max) + + if v_min is None and v_max is None: + return None + + return Range(v_min, v_max) + + +def parse_search_field(doc, name=""): + _type = doc.get("type", "string") + + if _type in _TYPE_PARSERS: + offset = doc.get("offset", None) + if offset is None: + raise ValueError("Missing offset") + + return SimpleField( + offset, + _TYPE_PARSERS[_type], + _type, + name=name, + description=doc.get("description", ""), + ) + + if not _type.endswith("-range"): + raise ValueError("Unsupported search field type: " + str(_type)) + + raw_type = _type.split("-")[0] + + if ( + raw_type == "float" + ): # float-range is supposed to be supported, but not just float? 
+ raw_type = "numeric" + _type = "numeric-range" + + if raw_type not in _TYPE_PARSERS: + raise ValueError("Unsupported search field type: " + str(_type)) + + min_offset = doc.get("min_offset", None) + max_offset = doc.get("max_offset", None) + + if min_offset is None or max_offset is None: + raise ValueError("Need to specify both min_offset and max_offset") + + return RangeField( + min_offset, + max_offset, + _TYPE_PARSERS[raw_type], + _type, + name=name, + description=doc.get("description", ""), + ) + + +def get_search_fields(metadata_definition: Mapping[str, Any]) -> Dict[str, Field]: + """Construct search fields dictionary not tied to any specific db implementation.""" + fields = toolz.get_in(["dataset", "search_fields"], metadata_definition, {}) + return {n: parse_search_field(doc, name=n) for n, doc in fields.items()} + + +def parse_offset_field(name="", offset=[]): + field_types = { + "id": "string", + "label": "string", + "format": "string", + "sources": "object", + "creation_dt": "datetime", + "grid_spatial": "object", + "measurements": "object", + } + + if name in field_types: + _type = field_types[name] + return SimpleField(offset, _TYPE_PARSERS[_type], _type, name=name) + + +def get_system_fields(metadata_definition: Mapping[str, Any]) -> Dict[str, Field]: + """Construct system fields dictionary not tied to any specific db implementation.""" + fields = metadata_definition.get("dataset") + return { + name: parse_offset_field(name, offset) + for name, offset in fields.items() + if name != "search_fields" + } + + +def get_all_fields(metadata_definition: Mapping[str, Any]) -> Dict[str, Field]: + """Construct dictionary of all fields""" + search_fields = { + name: field for name, field in get_search_fields(metadata_definition).items() + } + system_offsets = { + name: field for name, field in get_system_fields(metadata_definition).items() + } + return dict(**system_offsets, **search_fields) + + +def all_field_offsets(metadata_definition: Mapping[str, Any]) -> Dict[str, List[Any]]: + """Get a mapping of all field names -> offset""" + all_fields = get_all_fields(metadata_definition) + return { + name: ( + [field.offset] + if hasattr(field, "offset") + else field.min_offset + field.max_offset + ) + for name, field in all_fields.items() + } diff --git a/eo3/images.py b/eo3/images.py deleted file mode 100644 index 112b215a..00000000 --- a/eo3/images.py +++ /dev/null @@ -1,1330 +0,0 @@ -import math -import os -import string -import sys -import tempfile -from collections import defaultdict -from enum import Enum, auto -from pathlib import Path, PurePath -from typing import ( - Dict, - Generator, - Iterable, - List, - Mapping, - Optional, - Sequence, - Set, - Tuple, - Union, -) - -import attr -import numpy -import rasterio -import rasterio.features -import shapely -import shapely.affinity -import shapely.ops -import xarray -from affine import Affine -from rasterio import DatasetReader -from rasterio.coords import BoundingBox -from rasterio.crs import CRS -from rasterio.enums import Resampling -from rasterio.io import DatasetWriter, MemoryFile -from rasterio.shutil import copy as rio_copy -from rasterio.warp import calculate_default_transform, reproject -from scipy.ndimage import binary_fill_holes -from shapely.geometry import box -from shapely.geometry.base import CAP_STYLE, JOIN_STYLE, BaseGeometry - -from eo3.model import Eo3DatasetDocBase, GridDoc, MeasurementDoc -from eo3.properties import FileFormat - -DEFAULT_OVERVIEWS = (8, 16, 32) - -try: - import h5py -except ImportError: - h5py = 
None - - -class ValidDataMethod(Enum): - """ - How to calculate the valid data geometry for an image? - """ - - #: Vectorize the full valid pixel mask as-is. - #: - #: In some circumstances this can be very slow. - #: `filled` may be safer. - #: - thorough = auto() - - #: Fill holes in the valid pixel mask before vectorizing. - #: - #: (Potentially much faster than ``thorough`` if there's many small - #: nodata holes, as they will create many tiny polygons. - #: *slightly* slower if no holes exist.) - filled = auto() - - #: Take convex-hull of valid pixel mask before vectorizing. - #: - #: This is much slower than ``filled``, but will work in cases where - #: you have a lot of internal geometry that aren't holes. - #: Such as SLC-Off Landsat 7 data. - #: - #: Requires 'scikit-image' dependency. - convex_hull = auto() - - #: Use the image file bounds, ignoring actual pixel values. - bounds = auto() - - -@attr.s(auto_attribs=True, slots=True, hash=True, frozen=True) -class GridSpec: - """ - The grid spec defines the coordinates/transform and size of pixels of a - measurment. - - The easiest way to create one is use the ``GridSpec.from_*()`` class methods, such as - ``GridSpec.from_path(my_image_path)``. - - To create one manually: - - >>> from eo3 import GridSpec - >>> from affine import Affine - >>> from rasterio.crs import CRS - >>> g = GridSpec(shape=(7721, 7621), - ... transform=Affine(30.0, 0.0, 241485.0, 0.0, -30.0, -2281485.0), - ... crs=CRS.from_epsg(32656)) - >>> # Numbers copied from equivalent rio dataset.bounds call. - >>> g.bounds - BoundingBox(left=241485.0, bottom=-2513115.0, right=470115.0, top=-2281485.0) - >>> g.resolution_yx - (30.0, 30.0) - """ - - #: - shape: Tuple[int, int] - #: - transform: Affine - #: - crs: CRS = attr.ib( - metadata=dict(doc_exclude=True), default=None, hash=False, eq=False - ) - - @classmethod - def from_dataset_doc(cls, ds: Eo3DatasetDocBase, grid="default") -> "GridSpec": - """ - Create from an existing parsed metadata document - - :param grid: Grid name to read, if not the default. - """ - g = ds.grids[grid] - - if ds.crs.startswith("epsg:"): - crs = CRS.from_epsg(ds.crs[5:]) - else: - crs = CRS.from_wkt(ds.crs) - - return GridSpec(g.shape, g.transform, crs=crs) - - @classmethod - def from_rio(cls, dataset: rasterio.DatasetReader) -> "GridSpec": - """Create from an open rasterio dataset""" - return cls(shape=dataset.shape, transform=dataset.transform, crs=dataset.crs) - - @property - def resolution_yx(self): - return abs(self.transform[4]), abs(self.transform[0]) - - @classmethod - def from_odc_xarray(cls, dataset: xarray.Dataset) -> "GridSpec": - """Create from an ODC xarray""" - shape = {v.shape for v in dataset.data_vars.values()}.pop() - return cls( - shape=shape, - transform=dataset.geobox.transform, - crs=CRS.from_wkt(dataset.geobox.crs.crs_str), - ) - - @classmethod - def from_path(cls, path: str) -> "GridSpec": - """Create from the spec of a (rio-readable) filesystem path or url""" - with rasterio.open(path) as rio: - return GridSpec.from_rio(rio) - - @property - def bounds(self): - """ - Get bounding box. - """ - return BoundingBox( - *(self.transform * (0, self.shape[0])) - + (self.transform * (self.shape[1], 0)) - ) - - -def generate_tiles( - samples: int, lines: int, xtile: int = None, ytile: int = None -) -> Generator[Tuple[Tuple[int, int], Tuple[int, int]], None, None]: - """ - Generates a list of tile indices for a 2D array. - - :param samples: - An integer expressing the total number of samples in an array. 
- - :param lines: - An integer expressing the total number of lines in an array. - - :param xtile: - (Optional) The desired size of the tile in the x-direction. - Default is all samples - - :param ytile: - (Optional) The desired size of the tile in the y-direction. - Default is min(100, lines) lines. - - :return: - Each tuple in the generator contains - ((ystart,yend),(xstart,xend)). - - >>> import pprint - >>> tiles = generate_tiles(1624, 1567, xtile=1000, ytile=400) - >>> pprint.pprint(list(tiles)) - [((0, 400), (0, 1000)), - ((0, 400), (1000, 1624)), - ((400, 800), (0, 1000)), - ((400, 800), (1000, 1624)), - ((800, 1200), (0, 1000)), - ((800, 1200), (1000, 1624)), - ((1200, 1567), (0, 1000)), - ((1200, 1567), (1000, 1624))] - """ - - def create_tiles(samples, lines, xstart, ystart): - """ - Creates a generator object for the tiles. - """ - for ystep in ystart: - if ystep + ytile < lines: - yend = ystep + ytile - else: - yend = lines - for xstep in xstart: - if xstep + xtile < samples: - xend = xstep + xtile - else: - xend = samples - yield ((ystep, yend), (xstep, xend)) - - # check for default or out of bounds - if xtile is None or xtile < 0: - xtile = samples - if ytile is None or ytile < 0: - ytile = min(100, lines) - - xstart = numpy.arange(0, samples, xtile) - ystart = numpy.arange(0, lines, ytile) - - tiles = create_tiles(samples, lines, xstart, ystart) - - return tiles - - -def _common_suffix(names: Iterable[str]) -> str: - return os.path.commonprefix([s[::-1] for s in names])[::-1] - - -def _find_a_common_name( - group_of_names: Sequence[str], all_possible_names: Set[str] = None -) -> Optional[str]: - """ - If we have a list of band names, can we find a nice name for the group of them? - - (used when naming the grid for a set of bands) - - >>> _find_a_common_name(['nbar_blue', 'nbar_red']) - 'nbar' - >>> _find_a_common_name(['nbar_band08', 'nbart_band08']) - 'band08' - >>> _find_a_common_name(['nbar:band08', 'nbart:band08']) - 'band08' - >>> _find_a_common_name(['panchromatic']) - 'panchromatic' - >>> _find_a_common_name(['nbar_panchromatic']) - 'nbar_panchromatic' - >>> # It's ok to find nothing. - >>> _find_a_common_name(['nbar_blue', 'nbar_red', 'qa']) - >>> _find_a_common_name(['a', 'b']) - >>> # If a name is taken by non-group memebers, it shouldn't be chosen - >>> # (There's an 'nbar' prefix outside of the group, so shouldn't be found) - >>> all_names = {'nbar_blue', 'nbar_red', 'nbar_green', 'nbart_blue'} - >>> _find_a_common_name(['nbar_blue', 'nbar_red'], all_possible_names=all_names) - >>> _find_a_common_name(['nbar_blue', 'nbar_red', 'nbar_green'], all_possible_names=all_names) - 'nbar' - """ - options = [] - - non_group_names = (all_possible_names or set()).difference(group_of_names) - - # If all measurements have a common prefix (like 'nbar_') it makes a nice grid name. - prefix = os.path.commonprefix(group_of_names) - if not any(name.startswith(prefix) for name in non_group_names): - options.append(prefix) - - suffix = _common_suffix(group_of_names) - if not any(name.endswith(suffix) for name in non_group_names): - options.append(suffix) - - if not options: - return None - - options = [s.strip("_:") for s in options] - # Pick the longest candidate. 
- options.sort(key=len, reverse=True) - return options[0] or None - - -@attr.s(auto_attribs=True, slots=True) -class _MeasurementLocation: - path: Union[Path, str] - layer: str = None - - -_Measurements = Dict[str, _MeasurementLocation] - - -class MeasurementBundler: - """ - Incrementally record the information for a set of measurements/images to group into grids, - calculate geometry etc, suitable for metadata. - """ - - def __init__(self): - # The measurements grouped by their grid. - # (value is band_name->Path) - self._measurements_per_grid: Dict[GridSpec, _Measurements] = defaultdict(dict) - # Valid data mask per grid, in pixel coordinates. - self.mask_by_grid: Dict[GridSpec, numpy.ndarray] = {} - - def record_image( - self, - name: str, - grid: GridSpec, - path: Union[PurePath, str], - img: numpy.ndarray, - layer: Optional[str] = None, - nodata: Optional[Union[float, int]] = None, - expand_valid_data=True, - ): - for measurements in self._measurements_per_grid.values(): - if name in measurements: - raise ValueError( - f"Duplicate addition of band called {name!r}. " - f"Original at {measurements[name]} and now {path}" - ) - - self._measurements_per_grid[grid][name] = _MeasurementLocation(path, layer) - if expand_valid_data: - self._expand_valid_data_mask(grid, img, nodata) - - def _expand_valid_data_mask( - self, grid: GridSpec, img: numpy.ndarray, nodata: Union[float, int] - ): - if nodata is None: - nodata = float("nan") if numpy.issubdtype(img.dtype, numpy.floating) else 0 - - if math.isnan(nodata): - valid_values = numpy.isfinite(img) - else: - valid_values = img != nodata - - mask = self.mask_by_grid.get(grid) - if mask is None: - mask = valid_values - else: - mask |= valid_values - self.mask_by_grid[grid] = mask - - def _as_named_grids(self) -> Dict[str, Tuple[GridSpec, _Measurements]]: - """Get our grids with sensible (hopefully!), names.""" - - # Order grids from most to fewest measurements. - # PyCharm's typing seems to get confused by the sorted() call. - # noinspection PyTypeChecker - grids_by_frequency: List[Tuple[GridSpec, _Measurements]] = sorted( - self._measurements_per_grid.items(), key=lambda k: len(k[1]), reverse=True - ) - - # The largest group is the default. - default_grid = grids_by_frequency.pop(0) - - named_grids = {"default": default_grid} - - # No other grids? Nothing to do! - if not grids_by_frequency: - return named_grids - - # First try to name them via common prefixes, suffixes etc. - all_measurement_names = set(self.iter_names()) - for grid, measurements in grids_by_frequency: - if len(measurements) == 1: - grid_name = "_".join(measurements.keys()) - else: - grid_name = _find_a_common_name( - list(measurements.keys()), all_possible_names=all_measurement_names - ) - if not grid_name: - # Nothing useful found! - break - - if grid_name in named_grids: - # Clash of names! This strategy wont work. - break - - named_grids[grid_name] = (grid, measurements) - else: - # We finished without a clash. - return named_grids - - # Otherwise, try resolution names: - named_grids = {"default": default_grid} - for grid, measurements in grids_by_frequency: - res_y, res_x = grid.resolution_yx - if res_x > 1: - res_x = int(res_x) - grid_name = f"{res_x}" - if grid_name in named_grids: - # Clash of names! This strategy wont work. - break - - named_grids[grid_name] = (grid, measurements) - else: - # We finished without a clash. - return named_grids - - # No strategies worked! - # Enumerated, alphabetical letter names. Grid 'a', Grid 'b', etc... 
- grid_names = list(string.ascii_letters) - if len(grids_by_frequency) > len(grid_names): - raise NotImplementedError( - f"More than {len(grid_names)} grids that cannot be named!" - ) - return { - "default": default_grid, - **{ - grid_names[i]: (grid, measurements) - for i, (grid, measurements) in enumerate(grids_by_frequency) - }, - } - - def as_geo_docs(self) -> Tuple[CRS, Dict[str, GridDoc], Dict[str, MeasurementDoc]]: - """Calculate combined geo information for metadata docs""" - - if not self._measurements_per_grid: - return None, None, None - - grid_docs: Dict[str, GridDoc] = {} - measurement_docs: Dict[str, MeasurementDoc] = {} - crs = None - for grid_name, (grid, measurements) in self._as_named_grids().items(): - # Validate assumption: All grids should have same CRS - if crs is None: - crs = grid.crs - # CRS equality is tricky. This may not work. - # We're assuming a group of measurements specify their CRS - # the same way if they are the same. - elif grid.crs != crs: - raise ValueError( - f"Measurements have different CRSes in the same dataset:\n" - f"\t{crs.to_string()!r}\n" - f"\t{grid.crs.to_string()!r}\n" - ) - - grid_docs[grid_name] = GridDoc(grid.shape, grid.transform) - - for measurement_name, measurement_path in measurements.items(): - # No measurement groups in the doc: we replace with underscores. - measurement_name = measurement_name.replace(":", "_") - - measurement_docs[measurement_name] = MeasurementDoc( - path=measurement_path.path, - layer=measurement_path.layer, - grid=grid_name if grid_name != "default" else None, - ) - return crs, grid_docs, measurement_docs - - def consume_and_get_valid_data( - self, valid_data_method: ValidDataMethod = ValidDataMethod.thorough - ) -> BaseGeometry: - """ - Consume the stored grids and produce the valid data for them. - - (they are consumed in order to to minimise peak memory usage) - - :param valid_data_method: How to calculate the valid-data polygon? 
- """ - - geoms = [] - - while self.mask_by_grid: - grid, mask = self.mask_by_grid.popitem() - - if valid_data_method is ValidDataMethod.bounds: - geom = box(*grid.bounds) - elif valid_data_method is ValidDataMethod.filled: - mask = mask.astype("uint8") - binary_fill_holes(mask, output=mask) - geom = _grid_to_poly(grid, mask) - elif valid_data_method is ValidDataMethod.convex_hull: - # Requires optional dependency scikit-image - from skimage import morphology as morph - - geom = _grid_to_poly( - grid, morph.convex_hull_image(mask).astype("uint8") - ) - elif valid_data_method is ValidDataMethod.thorough: - geom = _grid_to_poly(grid, mask.astype("uint8")) - else: - raise NotImplementedError( - f"Unexpected valid data method: {valid_data_method}" - ) - geoms.append(geom) - return shapely.ops.unary_union(geoms) - - def iter_names(self) -> Generator[str, None, None]: - """All known measurement names""" - for grid, measurements in self._measurements_per_grid.items(): - for band_name, _ in measurements.items(): - yield band_name - - def iter_paths(self) -> Generator[Tuple[GridSpec, str, Path], None, None]: - """All current measurement paths on disk""" - for grid, measurements in self._measurements_per_grid.items(): - for band_name, meas_path in measurements.items(): - yield grid, band_name, meas_path.path - - -def _valid_shape(shape: BaseGeometry) -> BaseGeometry: - if shape.is_valid: - return shape - return shape.buffer(0) - - -def _grid_to_poly(grid: GridSpec, mask: numpy.ndarray) -> BaseGeometry: - shape = shapely.ops.unary_union( - [ - _valid_shape(shapely.geometry.shape(shape)) - for shape, val in rasterio.features.shapes(mask) - if val == 1 - ] - ) - shape_y, shape_x = mask.shape - del mask - # convex hull - geom = shape.convex_hull - # buffer by 1 pixel - geom = geom.buffer(1, cap_style=CAP_STYLE.square, join_style=JOIN_STYLE.bevel) - # simplify with 1 pixel radius - geom = geom.simplify(1) - # intersect with image bounding box - geom = geom.intersection(shapely.geometry.box(0, 0, shape_x, shape_y)) - # transform from pixel space into CRS space - geom = shapely.affinity.affine_transform( - geom, - ( - grid.transform.a, - grid.transform.b, - grid.transform.d, - grid.transform.e, - grid.transform.xoff, - grid.transform.yoff, - ), - ) - return geom - - -@attr.s(auto_attribs=True) -class WriteResult: - # path: Path - - # The value to put in 'odc:file_format' metadata field. - file_format: FileFormat - - # size_bytes: int - - -class FileWrite: - """ - Write COGs from arrays / files. - - This code is derived from the old eugl packaging code and can probably be improved. 
- """ - - PREDICTOR_DEFAULTS = { - "int8": 2, - "uint8": 2, - "int16": 2, - "uint16": 2, - "int32": 2, - "uint32": 2, - "int64": 2, - "uint64": 2, - "float32": 3, - "float64": 3, - } - - def __init__( - self, - gdal_options: Dict = None, - overview_blocksize: Optional[int] = None, - ) -> None: - super().__init__() - self.options = gdal_options or {} - self.overview_blocksize = overview_blocksize - - @classmethod - def from_existing( - cls, - shape: Tuple[int, int], - overviews: bool = True, - blocksize_yx: Optional[Tuple[int, int]] = None, - overview_blocksize: Optional[int] = None, - compress="deflate", - zlevel=4, - ) -> "FileWrite": - """Returns write_img options according to the source imagery provided - :param overviews: - (boolean) sets overview flags in gdal config options - :param blockxsize: - (int) override the derived base blockxsize in cogtif conversion - :param blockysize: - (int) override the derived base blockysize in cogtif conversion - - """ - options = {"compress": compress, "zlevel": zlevel} - - y_size, x_size = blocksize_yx or (512, 512) - # Do not set block sizes for small imagery - if shape[0] < y_size and shape[1] < x_size: - pass - else: - options["blockxsize"] = x_size - options["blockysize"] = y_size - options["tiled"] = "yes" - - if overviews: - options["copy_src_overviews"] = "yes" - - return FileWrite(options, overview_blocksize=overview_blocksize) - - def write_from_ndarray( - self, - array: numpy.ndarray, - out_filename: Path, - geobox: GridSpec = None, - nodata: int = None, - overview_resampling=Resampling.nearest, - overviews: Optional[Tuple[int, ...]] = DEFAULT_OVERVIEWS, - tags: Optional[Mapping[str, str]] = None, - ) -> WriteResult: - """ - Writes a 2D/3D image to disk using rasterio. - - :param array: - A 2D/3D NumPy array. - - :param out_filename: - A string containing the output file name. - - :param geobox: - An instance of a GriddedGeoBox object. - - :param nodata: - A value representing the no data value for the array. - - :param overview_resampling: - If levels is set, build overviews using a resampling method - from `rasterio.enums.Resampling` - Default is `Resampling.nearest`. - - :param tags: - File tags. - - :notes: - If array is an instance of a `h5py.Dataset`, then the output - file will include blocksizes based on the `h5py.Dataset's` - chunks. To override the blocksizes, specify them using the - `options` keyword. Eg {'blockxsize': 512, 'blockysize': 512}. - """ - if out_filename.exists(): - # Sanity check. Our measurements should have different names... - raise RuntimeError( - f"measurement output file already exists? {out_filename}" - ) - - if tags is None: - tags = {} - - dtype = array.dtype.name - - # Check for excluded datatypes - excluded_dtypes = ["int64", "int8", "uint64"] - if dtype in excluded_dtypes: - raise TypeError(f"Datatype not supported: {dtype}") - - # convert any bools to uin8 - if dtype == "bool": - array = numpy.uint8(array) - dtype = "uint8" - - ndims = array.ndim - shape = array.shape - - # Get the (z, y, x) dimensions (assuming BSQ interleave) - if ndims == 2: - samples = shape[1] - lines = shape[0] - bands = 1 - elif ndims == 3: - samples = shape[2] - lines = shape[1] - bands = shape[0] - else: - raise IndexError(f"Input array is not of 2 or 3 dimensions. 
Got {ndims}") - - transform = None - projection = None - if geobox is not None: - transform = geobox.transform - projection = geobox.crs - - rio_args = { - "count": bands, - "width": samples, - "height": lines, - "crs": projection, - "transform": transform, - "dtype": dtype, - "driver": "GTiff", - "predictor": self.PREDICTOR_DEFAULTS[dtype], - } - if nodata is not None: - rio_args["nodata"] = nodata - - if h5py is not None and isinstance(array, h5py.Dataset): - # TODO: if array is 3D get x & y chunks - if array.chunks[1] == array.shape[1]: - # GDAL doesn't like tiled or blocksize options to be set - # the same length as the columns (probably true for rows as well) - array = array[:] - else: - y_tile, x_tile = array.chunks - tiles = generate_tiles(samples, lines, x_tile, y_tile) - - if "tiled" in self.options: - rio_args["blockxsize"] = self.options.get("blockxsize", x_tile) - rio_args["blockysize"] = self.options.get("blockysize", y_tile) - - # the user can override any derived blocksizes by supplying `options` - # handle case where no options are provided - for key in self.options: - rio_args[key] = self.options[key] - - # Write to temp directory first so we can add levels afterwards with gdal. - with tempfile.TemporaryDirectory( - dir=out_filename.parent, prefix=".band_write" - ) as tmpdir: - unstructured_image = Path(tmpdir) / out_filename.name - """ - This is a wrapper around rasterio writing tiles to - enable writing to a temporary location before rearranging - the overviews within the file by gdal when required - """ - with rasterio.open(unstructured_image, "w", **rio_args) as outds: - if bands == 1: - if h5py is not None and isinstance(array, h5py.Dataset): - for tile in tiles: - idx = ( - slice(tile[0][0], tile[0][1]), - slice(tile[1][0], tile[1][1]), - ) - outds.write(array[idx], 1, window=tile) - else: - outds.write(array, 1) - else: - if h5py is not None and isinstance(array, h5py.Dataset): - for tile in tiles: - idx = ( - slice(tile[0][0], tile[0][1]), - slice(tile[1][0], tile[1][1]), - ) - subs = array[:, idx[0], idx[1]] - for i in range(bands): - outds.write(subs[i], i + 1, window=tile) - else: - for i in range(bands): - outds.write(array[i], i + 1) - if tags is not None: - outds.update_tags(**tags) - - # overviews/pyramids to disk - if overviews: - outds.build_overviews(overviews, overview_resampling) - - if overviews: - # Move the overviews to the start of the file, as required to be COG-compliant. - with rasterio.Env( - GDAL_TIFF_OVR_BLOCKSIZE=self.overview_blocksize or 512 - ): - rio_copy( - unstructured_image, - out_filename, - **{"copy_src_overviews": True, **rio_args}, - ) - else: - unstructured_image.rename(out_filename) - - return WriteResult(file_format=FileFormat.GeoTIFF) - - def create_thumbnail( - self, - rgb: Tuple[Path, Path, Path], - out: Path, - out_scale=10, - resampling=Resampling.average, - static_stretch: Tuple[int, int] = None, - percentile_stretch: Tuple[int, int] = (2, 98), - compress_quality: int = 85, - input_geobox: GridSpec = None, - ): - """ - Generate a thumbnail jpg image using the given three paths as red,green, blue. - - A linear stretch is performed on the colour. By default this is a dynamic 2% stretch - (the 2% and 98% percentile values of the input). The static_stretch parameter will - override this with a static range of values. - - If the input image has a valid no data value, the no data will - be set to 0 in the output image. - - Any non-contiguous data across the colour domain, will be set to - zero. 
- """ - # No aux.xml file with our jpeg. - with rasterio.Env(GDAL_PAM_ENABLED=False): - with tempfile.TemporaryDirectory( - dir=out.parent, prefix=".thumbgen-" - ) as tmpdir: - tmp_quicklook_path = Path(tmpdir) / "quicklook.tif" - - # We write an intensity-scaled, reprojected version of the dataset at full res. - # Then write a scaled JPEG verison. (TODO: can we do it in one step?) - ql_grid = _write_quicklook( - rgb, - tmp_quicklook_path, - resampling, - static_range=static_stretch, - percentile_range=percentile_stretch, - input_geobox=input_geobox, - ) - out_crs = ql_grid.crs - - # Scale and write as JPEG to the output. - ( - thumb_transform, - thumb_width, - thumb_height, - ) = calculate_default_transform( - out_crs, - out_crs, - ql_grid.shape[1], - ql_grid.shape[0], - *ql_grid.bounds, - dst_width=ql_grid.shape[1] // out_scale, - dst_height=ql_grid.shape[0] // out_scale, - ) - thumb_args = dict( - driver="JPEG", - quality=compress_quality, - height=thumb_height, - width=thumb_width, - count=3, - dtype="uint8", - nodata=0, - transform=thumb_transform, - crs=out_crs, - ) - with rasterio.open(tmp_quicklook_path, "r") as ql_ds: - ql_ds: DatasetReader - with rasterio.open(out, "w", **thumb_args) as thumb_ds: - thumb_ds: DatasetWriter - for index in thumb_ds.indexes: - thumb_ds.write( - ql_ds.read( - index, - out_shape=(thumb_height, thumb_width), - resampling=resampling, - ), - index, - ) - - def create_thumbnail_from_numpy( - self, - rgb: Tuple[numpy.array, numpy.array, numpy.array], - out_scale=10, - resampling=Resampling.average, - static_stretch: Tuple[int, int] = None, - percentile_stretch: Tuple[int, int] = (2, 98), - compress_quality: int = 85, - input_geobox: GridSpec = None, - nodata: int = -999, - ): - """ - Generate a thumbnail as numpy arrays. - - Unlike the default `create_thumbnail` function, this is done entirely in-memory. It will likely require more - memory but does not touch the filesystem. - - A linear stretch is performed on the colour. By default this is a dynamic 2% stretch - (the 2% and 98% percentile values of the input). The static_stretch parameter will - override this with a static range of values. - - Any non-contiguous data across the colour domain, will be set to zero. - """ - ql_grid, numpy_array_list, ql_write_args = _write_to_numpy_array( - rgb, - resampling, - static_range=static_stretch, - percentile_range=percentile_stretch, - input_geobox=input_geobox, - nodata=nodata, - ) - out_crs = ql_grid.crs - - # Scale and write as JPEG to the output. 
- ( - thumb_transform, - thumb_width, - thumb_height, - ) = calculate_default_transform( - out_crs, - out_crs, - ql_grid.shape[1], - ql_grid.shape[0], - *ql_grid.bounds, - dst_width=ql_grid.shape[1] // out_scale, - dst_height=ql_grid.shape[0] // out_scale, - ) - thumb_args = dict( - driver="JPEG", - quality=compress_quality, - height=thumb_height, - width=thumb_width, - count=3, - dtype="uint8", - nodata=0, - transform=thumb_transform, - crs=out_crs, - ) - - with MemoryFile() as mem_tif_file: - with mem_tif_file.open(**ql_write_args) as dataset: - for i, data in enumerate(numpy_array_list): - dataset.write(data, i + 1) - - with MemoryFile() as mem_jpg_file: - with mem_jpg_file.open(**thumb_args) as thumbnail: - for index in thumbnail.indexes: - thumbnail.write( # write the data from temp_tif to temp_jpg - dataset.read( - index, - out_shape=(thumb_height, thumb_width), - resampling=Resampling.average, - ), - index, - ) - - return_bytes = mem_jpg_file.read() - - return return_bytes - - def create_thumbnail_singleband( - self, - in_file: Path, - out_file: Path, - bit: int = None, - lookup_table: Dict[int, Tuple[int, int, int]] = None, - ): - """ - Write out a JPG thumbnail from a singleband image. - This takes in a path to a valid raster dataset and writes - out a file with only the values of the bit (integer) as white - """ - if bit is not None and lookup_table is not None: - raise ValueError( - "Please set either bit or lookup_table, and not both of them" - ) - if bit is None and lookup_table is None: - raise ValueError( - "Please set either bit or lookup_table, you haven't set either of them" - ) - - with rasterio.open(in_file) as dataset: - data = dataset.read() - out_data, stretch = self._filter_singleband_data(data, bit, lookup_table) - - meta = dataset.meta - meta["driver"] = "GTiff" - - with tempfile.TemporaryDirectory() as temp_dir: - if bit: - # Only use one file, three times - temp_file = Path(temp_dir) / "temp.tif" - - with rasterio.open(temp_file, "w", **meta) as tmpdataset: - tmpdataset.write(out_data) - self.create_thumbnail( - (temp_file, temp_file, temp_file), - out_file, - static_stretch=stretch, - ) - else: - # Use three different files - temp_files = tuple(Path(temp_dir) / f"temp_{i}.tif" for i in range(3)) - - for i in range(3): - with rasterio.open(temp_files[i], "w", **meta) as tmpdataset: - tmpdataset.write(out_data[i]) - self.create_thumbnail(temp_files, out_file, static_stretch=stretch) - - def create_thumbnail_singleband_from_numpy( - self, - input_data: numpy.array, - bit: int = None, - lookup_table: Dict[int, Tuple[int, int, int]] = None, - input_geobox: GridSpec = None, - nodata: int = -999, - ) -> bytes: - """ - Output a thumbnail ready bytes from the input numpy array. - This takes a valid raster data (numpy arrary) and return - out bytes with only the values of the bit (integer) as white. 
- """ - if bit is not None and lookup_table is not None: - raise ValueError( - "Please set either bit or lookup_table, and not both of them" - ) - if bit is None and lookup_table is None: - raise ValueError( - "Please set either bit or lookup_table, you haven't set either of them" - ) - - out_data, stretch = self._filter_singleband_data(input_data, bit, lookup_table) - - if bit: - rgb = [out_data, out_data, out_data] - else: - rgb = out_data - - return self.create_thumbnail_from_numpy( - rgb=rgb, - static_stretch=stretch, - input_geobox=input_geobox, - nodata=nodata, - ) - - def _filter_singleband_data( - self, - data: numpy.array, - bit: int = None, - lookup_table: Dict[int, Tuple[int, int, int]] = None, - ): - """ - Apply bit or lookup_table to filter the numpy array - and generate the thumbnail content. - """ - if bit is not None: - out_data = numpy.copy(data) - out_data[data != bit] = 0 - stretch = (0, bit) - if lookup_table is not None: - out_data = [ - numpy.full_like(data, 0), - numpy.full_like(data, 0), - numpy.full_like(data, 0), - ] - stretch = (0, 255) - - for value, rgb in lookup_table.items(): - for index in range(3): - out_data[index][data == value] = rgb[index] - return out_data, stretch - - -def _write_to_numpy_array( - rgb: Sequence[numpy.array], - resampling: Resampling, - static_range: Tuple[int, int], - percentile_range: Tuple[int, int] = (2, 98), - input_geobox: GridSpec = None, - nodata: int = -999, -) -> GridSpec: - """ - Write an intensity-scaled wgs84 image using the given files as bands. - """ - if input_geobox is None: - raise NotImplementedError("generating geobox from numpy is't yet supported") - - out_crs = CRS.from_epsg(4326) - ( - reprojected_transform, - reprojected_width, - reprojected_height, - ) = calculate_default_transform( - input_geobox.crs, - out_crs, - input_geobox.shape[1], - input_geobox.shape[0], - *input_geobox.bounds, - ) - reproj_grid = GridSpec( - (reprojected_height, reprojected_width), reprojected_transform, crs=out_crs - ) - ql_write_args = dict( - driver="GTiff", - dtype="uint8", - count=len(rgb), - width=reproj_grid.shape[1], - height=reproj_grid.shape[0], - transform=reproj_grid.transform, - crs=reproj_grid.crs, - nodata=0, - tiled="yes", - ) - - # Only set blocksize on larger imagery; enables reduced resolution processing - if reproj_grid.shape[0] > 512: - ql_write_args["blockysize"] = 512 - if reproj_grid.shape[1] > 512: - ql_write_args["blockxsize"] = 512 - - # Calculate combined nodata mask - valid_data_mask = numpy.ones(input_geobox.shape, dtype="bool") - calculated_range = read_valid_mask_and_value_range( - valid_data_mask, _iter_arrays(rgb, nodata=nodata), percentile_range - ) - - output_list = [] - - for band_no, (image, nodata) in enumerate( - _iter_arrays(rgb, nodata=nodata), start=1 - ): - reprojected_data = numpy.zeros(reproj_grid.shape, dtype=numpy.uint8) - reproject( - rescale_intensity( - image, - image_null_mask=~valid_data_mask, - in_range=(static_range or calculated_range), - out_range=(1, 255), - out_dtype=numpy.uint8, - ), - reprojected_data, - src_crs=input_geobox.crs, - src_transform=input_geobox.transform, - src_nodata=0, - dst_crs=reproj_grid.crs, - dst_nodata=0, - dst_transform=reproj_grid.transform, - resampling=resampling, - num_threads=2, - ) - output_list.append(reprojected_data) - del reprojected_data - - return reproj_grid, output_list, ql_write_args - - -def _write_quicklook( - rgb: Sequence[Path], - dest_path: Path, - resampling: Resampling, - static_range: Tuple[int, int], - percentile_range: 
Tuple[int, int] = (2, 98), - input_geobox: GridSpec = None, -) -> GridSpec: - """ - Write an intensity-scaled wgs84 image using the given files as bands. - """ - if input_geobox is None: - with rasterio.open(rgb[0]) as ds: - input_geobox = GridSpec.from_rio(ds) - - out_crs = CRS.from_epsg(4326) - ( - reprojected_transform, - reprojected_width, - reprojected_height, - ) = calculate_default_transform( - input_geobox.crs, - out_crs, - input_geobox.shape[1], - input_geobox.shape[0], - *input_geobox.bounds, - ) - reproj_grid = GridSpec( - (reprojected_height, reprojected_width), reprojected_transform, crs=out_crs - ) - ql_write_args = dict( - driver="GTiff", - dtype="uint8", - count=len(rgb), - width=reproj_grid.shape[1], - height=reproj_grid.shape[0], - transform=reproj_grid.transform, - crs=reproj_grid.crs, - nodata=0, - tiled="yes", - ) - - # Only set blocksize on larger imagery; enables reduced resolution processing - if reproj_grid.shape[0] > 512: - ql_write_args["blockysize"] = 512 - if reproj_grid.shape[1] > 512: - ql_write_args["blockxsize"] = 512 - - with rasterio.open(dest_path, "w", **ql_write_args) as ql_ds: - ql_ds: DatasetWriter - - # Calculate combined nodata mask - valid_data_mask = numpy.ones(input_geobox.shape, dtype="bool") - calculated_range = read_valid_mask_and_value_range( - valid_data_mask, _iter_images(rgb), percentile_range - ) - - for band_no, (image, nodata) in enumerate(_iter_images(rgb), start=1): - reprojected_data = numpy.zeros(reproj_grid.shape, dtype=numpy.uint8) - reproject( - rescale_intensity( - image, - image_null_mask=~valid_data_mask, - in_range=(static_range or calculated_range), - out_range=(1, 255), - out_dtype=numpy.uint8, - ), - reprojected_data, - src_crs=input_geobox.crs, - src_transform=input_geobox.transform, - src_nodata=0, - dst_crs=reproj_grid.crs, - dst_nodata=0, - dst_transform=reproj_grid.transform, - resampling=resampling, - num_threads=2, - ) - ql_ds.write(reprojected_data, band_no) - del reprojected_data - - return reproj_grid - - -LazyImages = Iterable[Tuple[numpy.ndarray, int]] - - -def _iter_images(rgb: Sequence[Path]) -> LazyImages: - """ - Lazily load a series of single-band images from a path. - - Yields the image array and nodata value. - """ - for path in rgb: - with rasterio.open(path) as ds: - ds: DatasetReader - if ds.count != 1: - raise NotImplementedError( - "multi-band measurement files aren't yet supported" - ) - yield ds.read(1), ds.nodata - - -def _iter_arrays(rgb: Sequence[numpy.array], nodata: int) -> LazyImages: - """ - Lazily load a series of single-band images from a path. - - Yields the image array and nodata value. - """ - for data in rgb: - yield data, nodata - - -def read_valid_mask_and_value_range( - valid_data_mask: numpy.ndarray, - images: LazyImages, - calculate_percentiles: Optional[Tuple[int, int]] = None, -) -> Optional[Tuple[int, int]]: - """ - Read the given images, filling in a valid data mask and optional pixel percentiles. - """ - calculated_range = (-sys.maxsize - 1, sys.maxsize) - for array, nodata in images: - valid_data_mask &= array != nodata - - if calculate_percentiles is not None: - the_data = array[valid_data_mask] - # Check if there's a non-empty array first - if the_data.any(): - # Numpy changed the 'interpolation' method, but we need to still support the - # older Python 3.6 module at NCI. 
- if numpy.__version__ < "1.22": - low, high = numpy.percentile( - the_data, calculate_percentiles, interpolation="nearest" - ) - else: - low, high = numpy.percentile( - the_data, calculate_percentiles, method="nearest" - ) - calculated_range = ( - max(low, calculated_range[0]), - min(high, calculated_range[1]), - ) - - return calculated_range - - -def rescale_intensity( - image: numpy.ndarray, - in_range: Tuple[int, int], - out_range: Optional[Tuple[int, int]] = None, - image_nodata: int = None, - image_null_mask: numpy.ndarray = None, - out_dtype=numpy.uint8, - out_nodata=0, -) -> numpy.ndarray: - """ - Based on scikit-image's rescale_intensity, but does fewer copies/allocations of the array. - - (and it saves us bringing in the entire dependency for one small method) - """ - if image_null_mask is None: - if image_nodata is None: - raise ValueError("Must specify either a null mask or a nodata val") - image_null_mask = image == image_nodata - - imin, imax = in_range - omin, omax = out_range or (numpy.iinfo(out_dtype).min, numpy.iinfo(out_dtype).max) - - # The intermediate calculation will need floats. - # We'll convert to it immediately to avoid modifying the input array - image = image.astype(numpy.float64) - - numpy.clip(image, imin, imax, out=image) - image -= imin - image /= float(imax - imin) - image *= omax - omin - image += omin - image = image.astype(out_dtype) - image[image_null_mask] = out_nodata - return image diff --git a/eo3/metadata/default-eo3-type.yaml b/eo3/metadata/default-eo3-type.yaml new file mode 100644 index 00000000..4e617fe7 --- /dev/null +++ b/eo3/metadata/default-eo3-type.yaml @@ -0,0 +1,104 @@ +--- +# Metadata Type +name: eo3 +description: Default EO3 with no custom fields +dataset: + id: + - id + label: + - label + format: + - properties + - odc:file_format + sources: + - lineage + - source_datasets + creation_dt: + - properties + - odc:processing_datetime + grid_spatial: + - grid_spatial + - projection + measurements: + - measurements + search_fields: + lat: + type: double-range + max_offset: + - - extent + - lat + - end + min_offset: + - - extent + - lat + - begin + description: Latitude range + lon: + type: double-range + max_offset: + - - extent + - lon + - end + min_offset: + - - extent + - lon + - begin + description: Longitude range + time: + type: datetime-range + max_offset: + - - properties + - dtr:end_datetime + - - properties + - datetime + min_offset: + - - properties + - dtr:start_datetime + - - properties + - datetime + description: Acquisition time range + crs_raw: + offset: + - crs + indexed: false + description: The raw CRS string as it appears in metadata + platform: + offset: + - properties + - eo:platform + indexed: false + description: Platform code + instrument: + offset: + - properties + - eo:instrument + indexed: false + description: Instrument name + cloud_cover: + type: double + offset: + - properties + - eo:cloud_cover + indexed: false + description: Cloud cover percentage [0, 100] + region_code: + offset: + - properties + - odc:region_code + description: "Spatial reference code from the provider. For Landsat region_code + is a scene path row:\n '{:03d}{:03d}.format(path,row)'.\nFor Sentinel + it is MGRS code. 
In general it is a unique string identifier that datasets + covering roughly the same spatial region share.\n" + product_family: + offset: + - properties + - odc:product_family + indexed: false + description: Product family code + dataset_maturity: + offset: + - properties + - dea:dataset_maturity + indexed: false + description: One of - final|interim|nrt (near real time) +... diff --git a/eo3/metadata/validate.py b/eo3/metadata/validate.py index 71360ec3..e2245726 100644 --- a/eo3/metadata/validate.py +++ b/eo3/metadata/validate.py @@ -2,7 +2,7 @@ from attr import define -from eo3 import serialise +from eo3 import schema from eo3.validation_msg import ValidationMessage, ValidationMessages @@ -158,7 +158,7 @@ def validate_metadata_type(doc: Dict) -> ValidationMessages: yield ValidationMessage.error("no_type_name", "Metadata type must have a name.") return # Validate it against ODC's schema (will be refused by ODC otherwise) - for error in serialise.METADATA_TYPE_SCHEMA.iter_errors(doc): + for error in schema.METADATA_TYPE_SCHEMA.iter_errors(doc): displayable_path = ".".join(map(str, error.absolute_path)) context = f"Error in {name}: ({displayable_path}) " if displayable_path else "" yield ValidationMessage.error("document_schema", f"{context}{error.message} ") diff --git a/eo3/model.py b/eo3/model.py index da951247..ac2c598e 100644 --- a/eo3/model.py +++ b/eo3/model.py @@ -1,22 +1,48 @@ +import warnings from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union -from uuid import UUID +from typing import Mapping, Optional -import affine import attr -from odc.geo import CoordList, Geometry, SomeCRS +import toolz +from odc.geo import CRS, Geometry from odc.geo.geom import polygon -from ruamel.yaml.comments import CommentedMap -from shapely.geometry.base import BaseGeometry +from pyproj.exceptions import CRSError +from ruamel.yaml.timestamp import TimeStamp as RuamelTimeStamp -from eo3.properties import Eo3DictBase, Eo3InterfaceBase +from eo3 import validate +from eo3.eo3_core import EO3Grid, prep_eo3 +from eo3.fields import Range, all_field_offsets, get_search_fields, get_system_fields +from eo3.metadata.validate import validate_metadata_type +from eo3.utils import default_utc, parse_time, read_file +from eo3.validation_msg import ContextualMessager, ValidationMessages DEA_URI_PREFIX = "https://collections.dea.ga.gov.au" -ODC_DATASET_SCHEMA_URL = "https://schemas.opendatacube.org/dataset" +DEFAULT_METADATA_TYPE = read_file( + Path(__file__).parent / "metadata" / "default-eo3-type.yaml" +) -# Either a local filesystem path or a string URI. -# (the URI can use any scheme supported by rasterio, such as tar:// or https:// or ...) -Location = Union[Path, str] + +def datetime_type(value): + # Ruamel's TimeZone class can become invalid from the .replace(utc) call. + # (I think it no longer matches the internal ._yaml fields.) + # Convert to a regular datetime. + if isinstance(value, RuamelTimeStamp): + value = value.isoformat() + else: + value = parse_time(value) + + # Store all dates with a timezone. + # yaml standard says all dates default to UTC. 
+    # (and ruamel normalises timezones to UTC itself)
+    return default_utc(value)
+
+
+BASE_NORMALISERS = {
+    "datetime": datetime_type,
+    "dtr:end_datetime": datetime_type,
+    "dtr:start_datetime": datetime_type,
+    "odc:processing_datetime": datetime_type,
+}
@@ -33,31 +59,6 @@ class ProductDoc:
     href: str = None
-@attr.s(auto_attribs=True, slots=True, hash=True)
-class GridDoc:
-    """The grid describing a measurement/band's pixels"""
-
-    shape: Tuple[int, int]
-    transform: affine.Affine
-    crs: Optional[str] = None
-
-    def points(self, ring: bool = False) -> CoordList:
-        ny, nx = (float(dim) for dim in self.shape)
-        pts = [(0.0, 0.0), (nx, 0.0), (nx, ny), (0.0, ny)]
-        if ring:
-            pts += pts[:1]
-        return [self.transform * pt for pt in pts]
-
-    def ref_points(self) -> Dict[str, Dict[str, float]]:
-        nn = ["ul", "ur", "lr", "ll"]
-        return {n: dict(x=x, y=y) for n, (x, y) in zip(nn, self.points())}
-
-    def polygon(self, crs: Optional[SomeCRS] = None) -> Geometry:
-        if not crs:
-            crs = self.crs
-        return polygon(self.points(ring=True), crs=crs)
-
-
 @attr.s(auto_attribs=True, slots=True)
 class MeasurementDoc:
     """
@@ -87,55 +88,298 @@ class AccessoryDoc:
     name: str = attr.ib(metadata=dict(doc_exclude=True), default=None)
-@attr.s(auto_attribs=True, slots=True)
-class Eo3DatasetDocBase(Eo3InterfaceBase):
+class DatasetMetadata:
     """
-    A minimally-validated EO3 dataset document
+    A representation of an EO3 dataset document that allows for easy metadata access and validation.
+
+    :param raw_dict: The document describing the dataset as a dictionary. Can also provide a path to the dictionary
+    file via the `from_path` class method.
+
+    :param mdt_definition: The metadata type definition dictionary. Dataset fields are accessed based on the offsets
+    defined in the metadata type definition. If no metadata type definition is provided, it will default to the simple
+    eo3 metadata type with no custom fields. It can be updated later using the `metadata_type` property.
-    Includes :class:`.Eo3InterfaceBase` methods for metadata access::
+    :param normalisers: A mapping of property normalisation functions, for any type or semantic normalisation that isn't
+    enforced by the dataset schema. By default it only normalises datetime strings to datetime.datetime objects
+    with a UTC timezone if no timezone is specified.
-        >>> p = Eo3DatasetDocBase()
-        >>> p.processed = '2018-04-03'
-        >>> p.properties['odc:processing_datetime']
-        datetime.datetime(2018, 4, 3, 0, 0, tzinfo=datetime.timezone.utc)
+    :param legacy_lineage: False if dataset uses external lineage.
+    DatasetMetadata also allows access to the raw document, the raw properties dictionary, and dataset properties
+    not defined within the metadata type, such as locations, geometry, grids, measurements, and accessories.
+
+    Validation against the schema and the metadata type definition is conducted by default, as is geometry validation
+    via the call to `prep_eo3`, which adds/modifies metadata sections required for an eo3 dataset.
     """
-    #: Dataset UUID
-    id: UUID = None
-    #: Human-readable identifier for the dataset
-    label: str = None
-    #: The product name (local) and/or url (global)
-    product: ProductDoc = None
-    #: Location(s) where this dataset is stored.
- #: - #: (ODC supports multiple locations when the same dataset is stored in multiple places) - #: - #: They are fully qualified URIs (``file://...`, ``https://...``, ``s3://...``) - #: - #: All other paths in the document (measurements, accessories) are relative to the - #: chosen location. - #: - #: If not supplied, the directory from which the metadata was read is treated as the root for the data. - locations: List[str] = None - - #: CRS string. Eg. ``epsg:3577`` - crs: str = None - #: Shapely geometry of the valid data coverage - #: - #: (it must contain all non-empty pixels of the image) - geometry: BaseGeometry = None - #: Grid specifications for measurements - grids: Dict[str, GridDoc] = None - #: Raw properties - properties: Eo3DictBase = attr.ib(factory=Eo3DictBase) - #: Loadable measurements of the dataset - measurements: Dict[str, MeasurementDoc] = None - #: References to accessory files - #: - #: Such as thumbnails, checksums, other kinds of metadata files. - #: - #: (any files included in the dataset that are not measurements) - accessories: Dict[str, AccessoryDoc] = attr.ib(factory=CommentedMap) - #: Links to source dataset uuids - lineage: Dict[str, List[UUID]] = attr.ib(factory=CommentedMap) + def __init__( + self, + raw_dict, + mdt_definition: Mapping = DEFAULT_METADATA_TYPE, + normalisers: Mapping = BASE_NORMALISERS, + legacy_lineage=True, + ): + try: + self.__dict__["_doc"] = prep_eo3(raw_dict, remap_lineage=legacy_lineage) + except CRSError: + raise validate.InvalidDatasetError( + f"invalid_crs: CRS {raw_dict.get('crs')} is not a valid CRS" + ) + except ValueError as e: + raise validate.InvalidDatasetError(f"incomplete_geometry: {e}") + + self.__dict__["_normalisers"] = normalisers + for key, val in self._doc["properties"].items(): + self._doc["properties"][key] = self.normalise(key, val) + + self.__dict__["_mdt_definition"] = mdt_definition + + # The user-configurable search fields for this dataset type. + self.__dict__["_search_fields"] = { + name: field for name, field in get_search_fields(mdt_definition).items() + } + # The field offsets that the datacube itself understands: id, format, sources etc. + # (See the metadata-type-schema.yaml or the comments in default-metadata-types.yaml) + self.__dict__["_system_offsets"] = { + name: field for name, field in get_system_fields(mdt_definition).items() + } + + self.__dict__["_all_offsets"] = all_field_offsets(mdt_definition) + + self.__dict__["_msg"] = ContextualMessager( + { + "type": mdt_definition.get("name"), + } + ) + + validate.handle_validation_messages(self.validate_base()) + + def __getattr__(self, name): + if name in self.fields.keys(): + return self.fields[name] + else: + raise AttributeError( + "Unknown field {!r}. Expected one of {!r}".format( + name, list(self.fields.keys()) + ) + ) + + def __setattr__(self, name, val): + offset = self._all_offsets.get(name) + if offset is None: + # check for a @property.setter first + if hasattr(self, name): + super().__setattr__(name, val) + return + raise AttributeError( + "Unknown field offset {!r}. 
Expected one of {!r}".format( + name, list(self._all_offsets.keys()) + ) + ) + + def _set_range_offset(name, val, offset, doc): + """Helper function for updating a field that expects a range""" + is_range = isinstance(val, Range) + # time can be a range or a single datetime + if name == "time": + if is_range: + doc = toolz.assoc_in( + doc, + ["properties", "dtr:start_datetime"], + self.normalise("dtr:start_datetime", val.begin), + ) + doc = toolz.assoc_in( + doc, + ["properties", "dtr:end_datetime"], + self.normalise("dtr:end_datetime", val.end), + ) + else: + doc = toolz.assoc_in( + doc, ["properties", "datetime"], self.normalise("datetime", val) + ) + # for all other range fields, value must be range + else: + if not is_range: + raise TypeError(f"The {name} field expects a Range value") + # this assumes that offsets are in min, max order + # and that there aren't multiple possible offsets for each + doc = toolz.assoc_in( + doc, offset[0], self.normalise(offset[0], val.begin) + ) + doc = toolz.assoc_in(doc, offset[1], self.normalise(offset[0], val.end)) + return doc + + # handle if there are multiple offsets + if len(offset) > 1: + self._doc = _set_range_offset(name, val, offset, self._doc) + # otherwise it's a simple field + else: + self._doc = toolz.assoc_in(self._doc, *offset, self.normalise(*offset, val)) + + def __dir__(self): + return list(self.fields) + + @property + def doc(self): + return self._doc + + @property + def search_fields(self): + return { + name: field.extract(self.doc) for name, field in self._search_fields.items() + } + + @property + def system_fields(self): + return { + name: field.extract(self.doc) + for name, field in self._system_offsets.items() + } + + @property + def fields(self): + return dict(**self.system_fields, **self.search_fields) + + @property + def properties(self): + return self.doc.get("properties") + + @property + def metadata_type(self): + return self._mdt_definition + + @metadata_type.setter + def metadata_type(self, val: Mapping): + validate.handle_validation_messages(validate_metadata_type(val)) + self._mdt_definition = val + self._search_fields = { + name: field for name, field in get_search_fields(val).items() + } + self._system_offsets = { + name: field for name, field in get_system_fields(val).items() + } + self._all_offsets = all_field_offsets(val) + self._msg.context["type"] = val.get("name") + + # Additional metadata not included in the metadata type + @property + def locations(self): + if self.doc.get("location"): + warnings.warn( + "`location` is deprecated and will be removed in a future release. Use `locations` instead." 
+ ) + return [self.doc.get("location")] + return self.doc.get("locations", None) + + @property + def product(self): + return ProductDoc(**self.doc.get("product")) + + @property + def geometry(self): + from shapely.geometry import shape + + return shape(self.doc.get("geometry")) + + @property + def grids(self): + return {key: EO3Grid(doc) for key, doc in self.doc.get("grids").items()} + + @property + def measurements(self): + return { + key: MeasurementDoc(**doc) + for key, doc in self.doc.get("measurements").items() + } + + @property + def accessories(self): + return { + key: AccessoryDoc(**doc) for key, doc in self.doc.get("accessories").items() + } + + @property + def crs(self) -> str: + # get doc crs as an actual CRS + return CRS(self._doc.get("crs")) + + # Core TODO: copied from datacube.model.Dataset + @property + def extent(self): + def xytuple(obj): + return obj["x"], obj["y"] + + projection = self.grid_spatial + valid_data = projection.get("valid_data") + geo_ref_points = projection.get("geo_ref_points") + if valid_data: + return Geometry(valid_data, crs=self.crs) + elif geo_ref_points: + return polygon( + [ + xytuple(geo_ref_points[key]) + for key in ("ll", "ul", "ur", "lr", "ll") + ], + crs=self.crs, + ) + + return None + + # Validation and other methods + def without_lineage(self): + return toolz.assoc(self._doc, "lineage", {}) + + def normalise(self, key, val): + """If property name is present in the normalisation mapping, apply the + normalisation function""" + # for easy dealing with offsets, such as when used in __setattr__ + if key[0] == "properties": + key = key[1] + normalise = self._normalisers.get(key, None) + if normalise: + return normalise(val) + return val + + def validate_to_product(self, product_definition: Mapping): + # Core TODO: replaces datacube.index.hl.check_dataset_consistent and check_consistent + self._msg.context["product"] = product_definition.get("name") + yield from validate.validate_ds_to_product( + self._doc, product_definition, self._msg + ) + + def validate_to_schema(self) -> ValidationMessages: + # don't error if properties 'extent' or 'grid_spatial' are present + doc = toolz.dissoc(self._doc, "extent", "grid_spatial") + yield from validate.validate_ds_to_schema(doc, self._msg) + + def validate_to_mdtype(self) -> ValidationMessages: + yield from validate.validate_ds_to_metadata_type( + self._doc, self._mdt_definition, self._msg + ) + + def validate_measurements(self) -> ValidationMessages: + """Check that measurement paths and grid references are valid""" + for name, measurement in self.measurements.items(): + grid_name = measurement.grid + if grid_name != "default" or self.grids: + if grid_name not in self.grids: + yield self._msg.error( + "invalid_grid_ref", + f"Measurement {name!r} refers to unknown grid {grid_name!r}", + ) + yield from validate.validate_measurement_path( + name, measurement.path, self._msg + ) + + def validate_base(self) -> ValidationMessages: + """Basic validations that can be done with information present at initialisation""" + yield from self.validate_to_schema() + yield from self.validate_to_mdtype() + # measurements are not mandatory + if self.measurements: + yield from self.validate_measurements() + + @classmethod + def from_path(cls, path): + # Create DatasetMetadata from filepath + return cls(read_file(path)) diff --git a/eo3/names.py b/eo3/names.py deleted file mode 100644 index 0febe048..00000000 --- a/eo3/names.py +++ /dev/null @@ -1,64 +0,0 @@ -from pathlib import Path -from urllib.parse import unquote, urlparse - 
-from eo3.model import Location -from eo3.uris import is_url, is_vsipath, normalise_path, register_scheme - -# Needed when packaging zip or tar files. -register_scheme("zip", "tar") - - -def _strip_major_version(version: str) -> str: - """ - >>> _strip_major_version('1.2.3') - '2.3' - >>> _strip_major_version('01.02.03') - '02.03' - >>> _strip_major_version('30.40') - '40' - >>> _strip_major_version('40') - '' - """ - return ".".join(version.split(".")[1:]) - - -class MissingRequiredFields(ValueError): - ... - - -def resolve_location(path: Location) -> str: - """ - Make sure a dataset location is a URL, suitable to be - the dataset_location in datacube indexing. - - Users may specify a pathlib.Path(), and we'll convert it as needed. - """ - if isinstance(path, str): - if not is_url(path) and not is_vsipath(path): - raise ValueError( - "A string location is expected to be a URL or VSI path. " - "Perhaps you want to give it as a local pathlib.Path()?" - ) - return path - - path = normalise_path(path) - if ".tar" in path.suffixes: - return f"tar:{path}!/" - elif ".zip" in path.suffixes: - return f"zip:{path}!/" - else: - uri = unquote(path.as_uri()) - # Base paths specified as directories must end in a slash, - # so they will be url joined as subfolders. (pathlib strips them) - if path.is_dir(): - return f"{uri}/" - return uri - - -def _as_path(url: str) -> Path: - """Try to convert the given URL to a local Path""" - parts = urlparse(url) - if not parts.scheme == "file": - raise ValueError(f"Expected a filesystem path, got a URL! {url!r}") - - return Path(parts.path) diff --git a/eo3/product/validate.py b/eo3/product/validate.py index aae97e61..4540ff99 100644 --- a/eo3/product/validate.py +++ b/eo3/product/validate.py @@ -6,8 +6,8 @@ from odc.geo import CRS from pyproj.exceptions import CRSError -from eo3 import serialise -from eo3.utils import _is_nan +from eo3 import schema +from eo3.utils.utils import _is_nan from eo3.validation_msg import ValidationMessage, ValidationMessages @@ -18,7 +18,7 @@ def validate_product(doc: Dict) -> ValidationMessages: # Validate it against ODC's product schema. 
has_doc_errors = False - for error in serialise.PRODUCT_SCHEMA.iter_errors(doc): + for error in schema.PRODUCT_SCHEMA.iter_errors(doc): has_doc_errors = True displayable_path = ".".join(map(str, error.absolute_path)) context = f"({displayable_path}) " if displayable_path else "" @@ -79,13 +79,13 @@ def validate_product(doc: Dict) -> ValidationMessages: def validate_product_metadata(template: Dict, name: str) -> ValidationMessages: for key, value in template.items(): if key == "product": - for prod_key, prod_val in template["product"].items(): + for prod_key, prod_val in value.items(): if prod_key == "name": - if template["product"]["name"] != name: + if prod_val != name: yield ValidationMessage.error( "product_name_mismatch", "If specified, metadata::product::name must match the product name " - f"(Expected {name}, got {template['product']['name']})", + f"(Expected {name}, got {prod_val})", ) else: yield ValidationMessage.warning( @@ -98,7 +98,7 @@ def validate_product_metadata(template: Dict, name: str) -> ValidationMessages: f"Only the name field is permitted in metadata::product::name ({prod_key})", ) elif key == "properties": - for prop_key, prop_val in template["properties"].items(): + for prop_key, prop_val in value.items(): if isinstance(prop_val, dict): yield ValidationMessage.error( "nested_metadata", @@ -386,7 +386,7 @@ def numpy_value_fits_dtype(value, dtype): if _is_nan(value): return np.issubdtype(dtype, np.floating) else: - return np.all(np.array([value], dtype=dtype) == [value]) + return np.all(np.array([value]).astype(dtype) == [value]) def _find_duplicates(values: Iterable[str]) -> Generator[str, None, None]: diff --git a/eo3/properties.py b/eo3/properties.py index cfda89a5..71cc93b1 100644 --- a/eo3/properties.py +++ b/eo3/properties.py @@ -1,6 +1,5 @@ import collections.abc import warnings -from abc import abstractmethod from collections import defaultdict from datetime import datetime from enum import Enum, EnumMeta @@ -9,8 +8,7 @@ import ciso8601 from ruamel.yaml.timestamp import TimeStamp as RuamelTimeStamp -from eo3.utils import _is_nan, default_utc -from eo3.validation_msg import ContextualMessager, ValidationMessage, ValidationMessages +from eo3.utils import default_utc class FileFormat(Enum): @@ -61,6 +59,15 @@ def datetime_type(value): return default_utc(value) +def degrees_type(value): + value = float(value) + + if not (-360.0 <= value <= 360.0): + raise ValueError("Expected degrees between -360,+360") + + return value + + def of_enum_type( vals: Union[EnumMeta, Tuple[str, ...]] = None, lower=False, upper=False, strict=True ) -> Callable[[str], str]: @@ -87,31 +94,37 @@ def normalise(v: str): return normalise -def percent_type(value): - value = float(value) - - if not (0.0 <= value <= 100.0): - raise ValueError("Expected percent between 0,100") +def producer_check(value): + if "." not in value: + warnings.warn( + "Property 'odc:producer' is expected to be a domain name, " + "eg 'usgs.gov' or 'ga.gov.au'" + ) return value -def degrees_type(value): - value = float(value) - - if not (-360.0 <= value <= 360.0): - raise ValueError("Expected degrees between -360,+360") - - return value +def normalise_platforms(value: Union[str, list, set]): + """ + >>> normalise_platforms('LANDSAT_8') + 'landsat-8' + >>> # Multiple can be comma-separated. They're normalised independently and sorted. + >>> normalise_platforms('LANDSAT_8,Landsat-5,landsat-7') + 'landsat-5,landsat-7,landsat-8' + >>> # Can be given as a list. 
+ >>> normalise_platforms(['sentinel-2b','SENTINEL-2a']) + 'sentinel-2a,sentinel-2b' + >>> # Deduplicated too + >>> normalise_platforms('landsat-5,landsat-5,LANDSAT-5') + 'landsat-5' + """ + if not isinstance(value, (list, set, tuple)): + value = value.split(",") + platforms = sorted({s.strip().lower().replace("_", "-") for s in value if s}) + if not platforms: + return None -def identifier_type(v: str): - v = v.replace("-", "_") - if not v.isidentifier() or not v.islower(): - warnings.warn( - f"{v!r} is expected to be an identifier " - "(alphanumeric with underscores, typically lowercase)" - ) - return v + return ",".join(platforms) # The primitive types allowed as stac values. @@ -142,8 +155,7 @@ class Eo3DictBase(collections.abc.MutableMapping): the input dictionary on creation, but you can disable this with `normalise_input=False`. """ - # Every property we know about. Subclasses should extend this mapping. - # TODO: Really need to add at least dataset maturity and region code + # Every property we know about. Subclasses should extend this mapping. KNOWN_PROPERTIES: Mapping[str, Optional[NormaliseValueFn]] = { "datetime": datetime_type, "dtr:end_datetime": datetime_type, @@ -151,8 +163,23 @@ class Eo3DictBase(collections.abc.MutableMapping): "odc:file_format": of_enum_type(FileFormat, strict=False), "odc:processing_datetime": datetime_type, "odc:product": None, + "dea:dataset_maturity": of_enum_type(("final", "interim", "nrt"), lower=True), + "odc:region_code": None, + "odc:producer": producer_check, + # Common STAC properties + "eo:gsd": None, + "eo:instrument": None, + "eo:platform": normalise_platforms, + "eo:constellation": None, + "eo:off_nadir": float, + "eo:azimuth": float, + "eo:sun_azimuth": degrees_type, + "eo:sun_elevation": degrees_type, } + # Required properties whose presence will be enforced. + REQUIRED_PROPERTIES = ["datetime", "odc:processing_datetime"] + def __init__(self, properties: Mapping = None, normalise_input=True) -> None: if properties is None: properties = {} @@ -213,7 +240,7 @@ def normalise_and_set(self, key, value, allow_override=True, expect_override=Fal :argument expect_override: We expect to overwrite a property, so don't produce a warning or error. """ if key not in self.KNOWN_PROPERTIES: - warnings.warn(f"Unknown Stac property {key!r}. ") + warnings.warn(f"Unknown Stac property {key!r}.") if value is not None: normalise = self.KNOWN_PROPERTIES.get(key) @@ -243,151 +270,17 @@ def normalise_and_set(self, key, value, allow_override=True, expect_override=Fal def nested(self): return nest_properties(self._props) - def validate_eo3_properties(self, msg: ContextualMessager) -> ValidationMessages: - for name, value in self.items(): - yield from self.validate_eo3_property(name, value, msg) - - # ODC requires this - if not self.get("odc:file_format"): - yield msg.error( - "global_file_format", - "Property 'odc:file_format' is empty", - hint="Usually 'GeoTIFF'", + def validate_properties(self): + # Enforce presence of properties identified as required + missing_required = [] + for prop in self.REQUIRED_PROPERTIES: + if self._props.get(prop) is None: + missing_required.append(prop) + if missing_required: + raise KeyError( + f"The following required properties are missing or None: {', '.join(missing_required)}" ) - def validate_eo3_property( - self, name, value, msg: ContextualMessager - ) -> ValidationMessages: - # Everything has already been through normalise_and_set above, so - # most of these errors are untriggerable? 
- if name in self.KNOWN_PROPERTIES: - normaliser = self.KNOWN_PROPERTIES.get(name) - if normaliser and value is not None: - try: - normalised_value = normaliser(value) - # A normaliser can return two values, the latter adding extra extracted fields. - if isinstance(normalised_value, tuple): - normalised_value = normalised_value[0] - - # It's okay for datetimes to be strings - # .. since ODC's own loader does that. - if isinstance(normalised_value, datetime) and isinstance( - value, str - ): - value = ciso8601.parse_datetime(value) - - # Special case for dates, as "no timezone" and "utc timezone" are treated identical. - if isinstance(value, datetime): - value = default_utc(value) - - if not isinstance(value, type(normalised_value)): - yield msg.warning( - "property_type", - f"Value {value} expected to be " - f"{type(normalised_value).__name__!r} (got {type(value).__name__!r})", - ) - elif normalised_value != value: - if _is_nan(normalised_value) and _is_nan(value): - # Both are NaNs, ignore. - pass - else: - yield ValidationMessage.warning( - "property_formatting", - f"Property {value!r} expected to be {normalised_value!r}", - ) - except ValueError as e: - yield msg.error("invalid_property", f"{name!r}: {e.args[0]}") - if name == "odc:producer": - # We use domain name to avoid arguing about naming conventions ('ga' vs 'geoscience-australia' vs ...) - if "." not in self["odc:producer"]: - yield msg.warning( - "producer_domain", - "Property 'odc:producer' should be the organisation's domain name. Eg. 'ga.gov.au'", - ) - class PropertyOverrideWarning(UserWarning): """A warning that a property was set twice with different values.""" - - -class Eo3InterfaceBase: - """ - These are convenience properties for common metadata fields. They are available - on DatasetAssemblers and within other naming APIs. - - (This is abstract. If you want one of these of your own, you probably want to create - an :class:`eo3.DatasetDoc`) - - """ - - @property - @abstractmethod - def properties(self) -> Eo3DictBase: - raise NotImplementedError - - @property - def product_name(self) -> Optional[str]: - """ - The ODC product name - """ - return self.properties.get("odc:product") - - @product_name.setter - def product_name(self, value: str): - self.properties["odc:product"] = value - - @property - def datetime_range(self) -> Tuple[datetime, datetime]: - """ - An optional date range for the dataset. - - The ``datetime`` is still mandatory when this is set. - - This field is a shorthand for reading/setting the datetime-range - stac 0.6 extension properties: ``dtr:start_datetime`` and ``dtr:end_datetime`` - """ - return ( - self.properties.get("dtr:start_datetime"), - self.properties.get("dtr:end_datetime"), - ) - - @datetime_range.setter - def datetime_range(self, val: Tuple[datetime, datetime]): - # TODO: string type conversion, better validation/errors - start, end = val - self.properties["dtr:start_datetime"] = start - self.properties["dtr:end_datetime"] = end - - @property - def processed(self) -> datetime: - """When the dataset was created (Defaults to UTC if not specified) - - Shorthand for the ``odc:processing_datetime`` field - """ - return self.properties.get("odc:processing_datetime") - - @processed.setter - def processed(self, value: Union[str, datetime]): - self.properties["odc:processing_datetime"] = value - - def processed_now(self): - """ - Shorthand for when the dataset was processed right now on the current system. 
- """ - self.properties["odc:processing_datetime"] = datetime.utcnow() - - # Note that giving a method the name 'datetime' will override the 'datetime' type - # for class-level declarations (ie, for any types on functions!) - # So we make an alias: - from datetime import datetime as datetime_ - - @property - def datetime(self) -> datetime_: - """ - The searchable date and time of the assets. (Default to UTC if not specified) - """ - return self.properties.get("datetime") - - @datetime.setter - def datetime(self, val: datetime_): - self.properties["datetime"] = val diff --git a/eo3/schema/__init__.py b/eo3/schema/__init__.py new file mode 100644 index 00000000..4ffe8114 --- /dev/null +++ b/eo3/schema/__init__.py @@ -0,0 +1,10 @@ +from .schema import DATASET_SCHEMA, METADATA_TYPE_SCHEMA, PRODUCT_SCHEMA + +ODC_DATASET_SCHEMA_URL = "https://schemas.opendatacube.org/dataset" + +__all__ = ( + "DATASET_SCHEMA", + "PRODUCT_SCHEMA", + "METADATA_TYPE_SCHEMA", + "ODC_DATASET_SCHEMA_URL", +) diff --git a/eo3/schema/dataset.schema.yaml b/eo3/schema/dataset.schema.yaml index 636028ff..a4895693 100644 --- a/eo3/schema/dataset.schema.yaml +++ b/eo3/schema/dataset.schema.yaml @@ -23,6 +23,7 @@ properties: # Should match name field in product schema. (alphanumeric plus underscore and hyphen) pattern: '^\w+$' href: + # Optional but recommended type: string format: url required: @@ -59,7 +60,7 @@ properties: minItems: 6 maxItems: 9 crs: - # Optional - defaults to dataset crs, described abobe. + # Optional - defaults to dataset crs, described above. # Either an epsg code ('epsg:1234') (preferred!) or a WKT string if no EPSG is possible. type: string required: @@ -81,6 +82,15 @@ properties: # TODO: "string" type is problematic as they're currently parsed directly into datetime objects... # type: string format: date-time + + dea:dataset_maturity: + type: string + nullable: true + enum: + - final + - interim + - nrt + - null required: - odc:processing_datetime - datetime diff --git a/eo3/schema/ingestor-config-type-schema.yaml b/eo3/schema/ingestor-config-type-schema.yaml deleted file mode 100644 index 0102c39e..00000000 --- a/eo3/schema/ingestor-config-type-schema.yaml +++ /dev/null @@ -1,172 +0,0 @@ -"$schema": "http://json-schema.org/draft-04/schema#" -# TODO CORE Copied from datacube/models/schema -description: Schema for ingestor configuration. 
-type: object -properties: - source_type: - type: string - output_type: - type: string - description: - type: string - location: - type: string - file_path_template: - type: string - global_attributes: - type: object - properties: - title: - type: string - summary: - type: string - source: - type: string - history: - type: string - institution: - type: string - instrument: - type: string - cdm_data_type: - type: string - keywords: - type: string - keywords_vocabulary: - type: string - platform: - type: string - product_version: - type: [number, string] - publisher_email: - type: string - publisher_name: - type: string - publisher_url: - type: string - product_suite: - type: string - project: - type: string - coverage_content_type: - type: string - references: - type: string - license: - type: string - naming_authority: - type: string - acknowkledgment: - type: string - ingestion_bounds: - type: object - properties: - left: - type: number - bottom: - type: number - right: - type: number - top: - type: number - storage: - "$ref": "#/definitions/storage" - measurements: - type: array - additionalProperties: true - items: - "$ref": "#/definitions/measurement" -required: - - output_type - - location - - file_path_template - - global_attributes - - storage - - measurements -additionalProperties: true - -definitions: - dtype: - enum: ["float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "complex64", "complex128"] - measurement: - type: object - properties: - name: - type: string - pattern: '^\w+$' - dtype: - "$ref": "#/definitions/dtype" - nodata: - oneOf: - - type: number - - enum: [NaN, Inf, -Inf] - resampling_method: - type: string - src_varname: - type: string - zlib: - type: boolean - units: - type: string - aliases: - type: array - items: - type: string - spectral_definition: - type: object - properties: - wavelength: - type: array - items: - type: number - response: - type: array - items: - type: number - flags_definition: - type: object - patternProperties: - ".*": - required: [bits, values] - properties: - bits: - type: [number, array] - values: - type: object - properties: - description: - type: string - attrs: - type: object - properties: - long_name: - type: string - alias: - type: string - required: - - name - - dtype - - nodata - - src_varname - additionalProperties: true - - storage: - type: object - properties: - chunking: - type: object - crs: - type: string - dimension_order: - type: array - resolution: - type: object - tile_size: - type: object - origin: - type: object - driver: - type: string - bucket: - type: string - additionalProperties: true diff --git a/eo3/schema/metadata-type-schema.yaml b/eo3/schema/metadata-type-schema.yaml index 3780638f..0f30d081 100644 --- a/eo3/schema/metadata-type-schema.yaml +++ b/eo3/schema/metadata-type-schema.yaml @@ -1,4 +1,4 @@ -"$schema": "http://json-schema.org/draft-04/schema#" +"$schema": "http://json-schema.org/draft-07/schema#" # TODO CORE Copied from datacube/models/schema description: Schema for metadata types. type: object diff --git a/eo3/schema/product-schema.yaml b/eo3/schema/product-schema.yaml index 147b3af7..8e0c45f7 100644 --- a/eo3/schema/product-schema.yaml +++ b/eo3/schema/product-schema.yaml @@ -1,4 +1,4 @@ -"$schema": "http://json-schema.org/draft-04/schema#" +"$schema": "http://json-schema.org/draft-07/schema#" # TODO CORE Copied from datacube/models/schema description: Schema for dataset types. 
type: object diff --git a/eo3/schema/schema.py b/eo3/schema/schema.py new file mode 100644 index 00000000..ff20d87b --- /dev/null +++ b/eo3/schema/schema.py @@ -0,0 +1,57 @@ +from pathlib import Path + +import jsonschema +import referencing + +from eo3.utils import read_file + + +def _is_json_array(checker, instance) -> bool: + """ + By default, jsonschema only allows a json array to be a Python list. + Let's allow it to be a tuple too. + """ + return isinstance(instance, (list, tuple)) + + +def _load_schema_validator(p: Path) -> jsonschema.Draft7Validator: + """ + Create a schema instance for the file. + + (Assumes they are trustworthy. Only local schemas!) + """ + if not p.is_file(): + raise ValueError(f"Can only load local schemas. Could not find file {str(p)}") + if p.suffix.lower() not in (".yaml", ".yml"): + raise ValueError(f"Unexpected file type {p.suffix}. Expected yaml") + schema = read_file(p) + + # Allow schemas to reference other schemas relatively + def doc_reference(path): + path = p.parent.joinpath(path) + if not path.exists(): + raise ValueError(f"Reference not found: {path}") + referenced_schema = read_file(path) + return referencing.Resource(referenced_schema, referencing.jsonschema.DRAFT7) + + if p.parent: + registry = referencing.Registry(retrieve=doc_reference) + else: + registry = referencing.Registry() + + jsonschema.Draft7Validator.check_schema(schema) + validator = jsonschema.validators.extend( + jsonschema.Draft7Validator, + type_checker=jsonschema.Draft7Validator.TYPE_CHECKER.redefine( + "array", _is_json_array + ), + ) + return validator(schema, registry=registry) + + +SCHEMAS_PATH = Path(__file__).parent +DATASET_SCHEMA = _load_schema_validator(SCHEMAS_PATH / "dataset.schema.yaml") +PRODUCT_SCHEMA = _load_schema_validator(SCHEMAS_PATH / "product-schema.yaml") +METADATA_TYPE_SCHEMA = _load_schema_validator( + SCHEMAS_PATH / "metadata-type-schema.yaml" +) diff --git a/eo3/scripts/tostac.py b/eo3/scripts/tostac.py index e4d641ea..8fcfc084 100644 --- a/eo3/scripts/tostac.py +++ b/eo3/scripts/tostac.py @@ -12,10 +12,16 @@ from click import echo, style import eo3.stac as eo3stac -from eo3 import serialise -from eo3.model import Eo3DatasetDocBase -from eo3.ui import PathPath -from eo3.utils import jsonify_document +from eo3.model import DatasetMetadata +from eo3.utils import jsonify_document, normalise_path + + +class PathPath(click.Path): + """ + A Click argument that returns a normalised (absolute) pathlib Path""" + + def convert(self, value, param, ctx): + return Path(normalise_path(super().convert(value, param, ctx))) @click.command(help=__doc__) @@ -40,7 +46,7 @@ def run( validate: bool, ): for input_metadata in odc_metadata_files: - dataset = serialise.from_path(input_metadata) + dataset = DatasetMetadata.from_path(input_metadata) name = input_metadata.stem.replace(".odc-metadata", "") output_path = input_metadata.with_name(f"{name}.stac-item.json") @@ -66,7 +72,7 @@ def run( def dc_to_stac( - dataset: Eo3DatasetDocBase, + dataset: DatasetMetadata, input_metadata: Path, output_path: Path, stac_base_url: str, diff --git a/eo3/serialise.py b/eo3/serialise.py index b1131dd6..f662a08d 100644 --- a/eo3/serialise.py +++ b/eo3/serialise.py @@ -1,30 +1,14 @@ -import uuid from datetime import datetime -from functools import partial from pathlib import Path, PurePath -from typing import IO, Dict, Iterable, Mapping, Tuple, Union +from typing import Mapping from uuid import UUID -import attr -import cattr -import ciso8601 -import click -import jsonschema import numpy 
-import shapely -import shapely.affinity -import shapely.ops -from affine import Affine from ruamel.yaml import YAML, Representer from ruamel.yaml.comments import CommentedMap, CommentedSeq -from shapely.geometry import shape -from shapely.geometry.base import BaseGeometry -from eo3.model import ODC_DATASET_SCHEMA_URL, Eo3DatasetDocBase, Eo3DictBase +from eo3.model import DatasetMetadata from eo3.properties import FileFormat -from eo3.utils import read_documents - -converter = cattr.Converter() def _format_representer(dumper, data: FileFormat): @@ -117,195 +101,9 @@ def dumps_yaml(stream, *docs: Mapping) -> None: return yml.dump_all(docs, stream=stream) -def load_yaml(p: Path) -> Dict: - with p.open() as f: - return _yaml().load(f) - - -def _yaml(): - return YAML(typ="safe") - - -def loads_yaml(stream: Union[str, IO]) -> Iterable[Dict]: - """Dump yaml through a stream, using the default deserialisation settings.""" - return _yaml().load_all(stream) - - -def from_path(path: Path, skip_validation=False) -> Eo3DatasetDocBase: - """ - Parse an EO3 document from a filesystem path - - :param path: Filesystem path - :param skip_validation: Optionally disable validation (it's faster, but I hope your - doc is structured correctly) - """ - if path.suffix.lower() not in (".yaml", ".yml"): - raise ValueError(f"Unexpected file type {path.suffix}. Expected yaml") - - return from_doc(load_yaml(path), skip_validation=skip_validation) - - -class InvalidDataset(Exception): - def __init__(self, path: Path, error_code: str, reason: str) -> None: - self.path = path - self.error_code = error_code - self.reason = reason - - -def _is_json_array(checker, instance) -> bool: - """ - By default, jsonschema only allows a json array to be a Python list. - Let's allow it to be a tuple too. - """ - return isinstance(instance, (list, tuple)) - - -def _load_schema_validator(p: Path) -> jsonschema.Draft6Validator: - """ - Create a schema instance for the file. - - (Assumes they are trustworthy. Only local schemas!) - """ - with p.open() as f: - schema = _yaml().load(f) - validator = jsonschema.validators.validator_for(schema) - validator.check_schema(schema) - - # Allow schemas to reference other schemas relatively - def doc_reference(path): - path = p.parent.joinpath(path) - if not path.exists(): - raise ValueError(f"Reference not found: {path}") - referenced_schema = next(iter(read_documents(path)))[1] - return referenced_schema - - ref_resolver = jsonschema.RefResolver.from_schema( - schema, handlers={"": doc_reference} - ) - custom_validator = jsonschema.validators.extend( - validator, type_checker=validator.TYPE_CHECKER.redefine("array", _is_json_array) - ) - return custom_validator(schema, resolver=ref_resolver) - - -SCHEMAS_PATH = Path(__file__).parent / "schema" -DATASET_SCHEMA = _load_schema_validator(SCHEMAS_PATH / "dataset.schema.yaml") -PRODUCT_SCHEMA = _load_schema_validator(SCHEMAS_PATH / "product-schema.yaml") -METADATA_TYPE_SCHEMA = _load_schema_validator( - SCHEMAS_PATH / "metadata-type-schema.yaml" -) - - -def from_doc(doc: Dict, skip_validation=False) -> Eo3DatasetDocBase: - """ - Parse a dictionary into an EO3 dataset. - - By default it will validate it against the schema, which will result in far more - useful error messages if fields are missing. 
- - :param doc: A dictionary, such as is returned from yaml.load or json.load - :param skip_validation: Optionally disable validation (it's faster, but I hope your - doc is structured correctly) - """ - doc = doc.copy() - if not skip_validation: - # don't error if properties 'extent' or 'grid_spatial' are present - if doc.get("extent"): - del doc["extent"] - if doc.get("grid_spatial"): - del doc["grid_spatial"] - DATASET_SCHEMA.validate(doc) - - location = doc.pop("location", None) - if location: - doc["locations"] = [location] - - return converter.structure(doc, Eo3DatasetDocBase) - - -def _structure_as_uuid(d, t): - return uuid.UUID(str(d)) - - -def _structure_as_stac_props(d, t, normalise_properties=False): - """ - :param normalise_properties: - We don't normalise properties by default as we usually want it to reflect the original file. - - """ - return Eo3DictBase( - # The passed-in dictionary is stored internally, so we want to make a copy of it - # so that our serialised output is fully separate from the input. - dict(d), - normalise_input=normalise_properties, - ) - - -def _structure_as_affine(d: Tuple, t): - if len(d) not in [6, 9]: - raise ValueError(f"Expected 6 or 9 coefficients in transform. Got {d!r}") - - if len(d) == 9: - if tuple(d[-3:]) != (0.0, 0.0, 1.0): - raise ValueError( - f"Nine-element affine should always end in [0, 0, 1]. Got {d!r}" - ) - d = [*d[:-3]] - - return Affine(*d) - - -def _unstructure_as_stac_props(v: Eo3DictBase): - return v._props - - -def _structure_as_shape(d, t): - return shape(d) - - -converter.register_structure_hook(uuid.UUID, _structure_as_uuid) -converter.register_structure_hook(BaseGeometry, _structure_as_shape) -converter.register_structure_hook( - Eo3DictBase, - partial(_structure_as_stac_props, normalise_properties=False), -) -converter.register_structure_hook(Affine, _structure_as_affine) -converter.register_unstructure_hook(Eo3DictBase, _unstructure_as_stac_props) - - -def to_doc(d: Eo3DatasetDocBase) -> Dict: - """ - Serialise a DatasetDoc to a dict - - If you plan to write this out as a yaml file on disk, you're - better off with one of our formatted writers: :func:`.to_stream`, :func:`.to_path`. - """ - doc = attr.asdict( - d, - recurse=True, - dict_factory=dict, - # Exclude fields that are the default. - filter=lambda attr, value: "doc_exclude" not in attr.metadata - and value != attr.default - # Exclude any fields set to None. The distinction should never matter in our docs. - and value is not None, - retain_collection_types=False, - ) - doc["$schema"] = ODC_DATASET_SCHEMA_URL - if d.geometry is not None: - doc["geometry"] = shapely.geometry.mapping(d.geometry) - doc["id"] = str(d.id) - doc["properties"] = dict(d.properties) - - if len(doc.get("locations", [])) == 1: - doc["location"] = doc.pop("locations")[0] - - return doc - - -def to_formatted_doc(d: Eo3DatasetDocBase) -> CommentedMap: - """Serialise a DatasetDoc to a yaml-serialisation-ready dict""" - doc = prepare_formatting(to_doc(d)) +def to_formatted_doc(d: DatasetMetadata) -> CommentedMap: + """Serialise to a yaml-serialisation-ready dict""" + doc = prepare_formatting(d.doc) # Add user-readable names for measurements as a comment if present. 
if d.measurements: for band_name, band_doc in d.measurements.items(): @@ -315,7 +113,7 @@ def to_formatted_doc(d: Eo3DatasetDocBase) -> CommentedMap: return doc -def to_path(path: Path, *ds: Eo3DatasetDocBase): +def to_path(path: Path, *ds: DatasetMetadata): """ Output dataset(s) as a formatted YAML to a local path @@ -324,7 +122,7 @@ def to_path(path: Path, *ds: Eo3DatasetDocBase): dump_yaml(path, *(to_formatted_doc(d) for d in ds)) -def to_stream(stream, *ds: Eo3DatasetDocBase): +def to_stream(stream, *ds: DatasetMetadata): """ Output dataset(s) as a formatted YAML to an output stream @@ -436,30 +234,3 @@ def _add_space_before(d: CommentedMap, *keys): """Add an empty line to the document before a section (key)""" for key in keys: d.yaml_set_comment_before_after_key(key, before="\n") - - -class ClickDatetime(click.ParamType): - """ - Take a datetime parameter, supporting any ISO8601 date/time/timezone combination. - """ - - name = "date" - - def convert(self, value, param, ctx): - if value is None: - return value - - if isinstance(value, datetime): - return value - - try: - return ciso8601.parse_datetime(value) - except ValueError: - self.fail( - ( - "Invalid date string {!r}. Expected any ISO date/time format " - '(eg. "2017-04-03" or "2014-05-14 12:34")'.format(value) - ), - param, - ctx, - ) diff --git a/eo3/stac.py b/eo3/stac.py index b99f2792..ef248120 100644 --- a/eo3/stac.py +++ b/eo3/stac.py @@ -18,8 +18,9 @@ from pystac.extensions.view import ViewExtension from pystac.utils import datetime_to_str -from eo3.model import Eo3DatasetDocBase, GridDoc -from eo3.uris import uri_resolve +from eo3.eo3_core import EO3Grid +from eo3.model import DatasetMetadata +from eo3.utils import uri_resolve # Mapping between EO3 field names and STAC properties object field names MAPPING_EO3_TO_STAC = { @@ -103,7 +104,7 @@ def _asset_title_fields(asset_name: str) -> Optional[str]: return None -def _proj_fields(grid: Dict[str, GridDoc], grid_name: str = "default") -> Dict: +def _proj_fields(grid: Dict[str, EO3Grid], grid_name: str = "default") -> Dict: """ Get any proj (Stac projection extension) fields if we have them for the grid. """ @@ -126,7 +127,9 @@ def _lineage_fields(lineage: Dict) -> Dict: """ if lineage: lineage_dict = { - key: [str(uuid) for uuid in value] for key, value in lineage.items() + # there will only ever be one lineage id per level + key: [str(value["id"])] + for key, value in lineage.items() } return {"odc:lineage": lineage_dict} @@ -136,7 +139,7 @@ def _lineage_fields(lineage: Dict) -> Dict: def _odc_links( explorer_base_url: str, - dataset: Eo3DatasetDocBase, + dataset: DatasetMetadata, collection_url: Optional[str], ) -> List[Link]: """ @@ -173,24 +176,17 @@ def _odc_links( warnings.warn("No collection provided for Stac Item.") -def _get_projection(dataset: Eo3DatasetDocBase) -> Tuple[Optional[int], Optional[str]]: +def _get_projection(dataset: DatasetMetadata) -> Tuple[Optional[int], Optional[str]]: if dataset.crs is None: return None, None - crs_l = dataset.crs.lower() - epsg = None - wkt = None - if crs_l.startswith("epsg:"): - epsg = int(crs_l.lstrip("epsg:")) - else: - wkt = dataset.crs + epsg = dataset.crs.epsg + wkt = None if epsg is not None else dataset.crs.wkt return epsg, wkt -def eo3_to_stac_properties( - dataset: Eo3DatasetDocBase, crs: Optional[str] = None, title: str = None -) -> Dict: +def eo3_to_stac_properties(dataset: DatasetMetadata, title: str = None) -> Dict: """ Convert EO3 properties dictionary to the Stac equivalent. 
""" @@ -207,7 +203,7 @@ def eo3_to_stac_properties( def to_pystac_item( - dataset: Eo3DatasetDocBase, + dataset: DatasetMetadata, stac_item_destination_url: str, dataset_location: Optional[str] = None, odc_dataset_metadata_url: Optional[str] = None, @@ -231,7 +227,7 @@ def to_pystac_item( """ if dataset.geometry is not None: - geom = Geometry(dataset.geometry, CRS(dataset.crs)) + geom = Geometry(dataset.geometry, dataset.crs) wgs84_geometry = geom.to_crs(CRS("epsg:4326"), math.inf) geometry = wgs84_geometry.json @@ -241,7 +237,7 @@ def to_pystac_item( bbox = None properties = eo3_to_stac_properties(dataset, title=dataset.label) - properties.update(_lineage_fields(dataset.lineage)) + properties.update(_lineage_fields(dataset.sources)) dt = properties["datetime"] del properties["datetime"] @@ -349,7 +345,7 @@ def to_pystac_item( def to_stac_item( - dataset: Eo3DatasetDocBase, + dataset: DatasetMetadata, stac_item_destination_url: str, dataset_location: Optional[str] = None, odc_dataset_metadata_url: Optional[str] = None, diff --git a/eo3/ui.py b/eo3/ui.py deleted file mode 100644 index 786f2bcc..00000000 --- a/eo3/ui.py +++ /dev/null @@ -1,90 +0,0 @@ -import os -import urllib.parse -from pathlib import Path -from typing import Optional, Union -from urllib.parse import parse_qsl, urljoin, urlparse - -import click - -from eo3.uris import normalise_path - - -class PathPath(click.Path): - """ - A Click argument that returns a normalised (absolute) pathlib Path""" - - def convert(self, value, param, ctx): - return Path(normalise_path(super().convert(value, param, ctx))) - - -def uri_resolve(base: Union[str, Path], path: Optional[str]) -> str: - """ - Backport of datacube.utils.uris.uri_resolve() - """ - if path: - p = Path(path) - if p.is_absolute(): - return p.as_uri() - - if isinstance(base, Path): - base = base.absolute().as_uri() - return urljoin(base, path) - - -def bool_style(b, color=True) -> str: - if b: - return click.style("✓", fg=color and "green") - else: - return click.style("✗", fg=color and "yellow") - - -def is_absolute(url): - """ - >>> is_absolute('LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - False - >>> is_absolute('data/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - False - >>> is_absolute('/g/data/somewhere/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - True - >>> is_absolute('file:///g/data/v10/somewhere/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - True - >>> is_absolute('http://example.com/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - True - >>> is_absolute('tar:///g/data/v10/somewhere/dataset.tar#LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - True - """ - location = urlparse(url) - return bool(location.scheme or location.netloc) or os.path.isabs(location.path) - - -def get_part(url): - """ - >>> get_part('path/to/file.tif') - >>> get_part('path/to/file.tif#page=2') - >>> get_part('path/to/file.tif#part=3') - 3 - >>> get_part('path/to/file.tif#part=one') - 'one' - """ - opts = dict(parse_qsl(urlparse(url).fragment)) - part = opts.get("part") - if part is None: - return None - try: - return int(part) - except ValueError: - return part - - -def register_scheme(*schemes): - """ - Register additional uri schemes as supporting relative offsets (etc), so that band/measurement paths can be - calculated relative to the base uri. 
- """ - urllib.parse.uses_netloc.extend(schemes) - urllib.parse.uses_relative.extend(schemes) - urllib.parse.uses_params.extend(schemes) - - -register_scheme("tar") -register_scheme("s3") diff --git a/eo3/utils/__init__.py b/eo3/utils/__init__.py new file mode 100644 index 00000000..878cba55 --- /dev/null +++ b/eo3/utils/__init__.py @@ -0,0 +1,47 @@ +from .uris import ( + as_url, + get_part_from_uri, + is_absolute, + is_url, + is_vsipath, + mk_part_uri, + normalise_path, + uri_resolve, + uri_to_local_path, +) +from .utils import ( + InvalidDocException, + contains, + default_utc, + flatten_dict, + jsonify_document, + netcdf_extract_string, + parse_time, + read_documents, + read_file, + read_strings_from_netcdf, + thread_local_cache, +) + +__all__ = ( + "is_url", + "uri_to_local_path", + "get_part_from_uri", + "mk_part_uri", + "is_vsipath", + "normalise_path", + "uri_resolve", + "as_url", + "is_absolute", + "default_utc", + "jsonify_document", + "InvalidDocException", + "read_documents", + "read_strings_from_netcdf", + "netcdf_extract_string", + "contains", + "thread_local_cache", + "parse_time", + "read_file", + "flatten_dict", +) diff --git a/eo3/aws.py b/eo3/utils/aws.py similarity index 70% rename from eo3/aws.py rename to eo3/utils/aws.py index 71024bdb..90acda4e 100644 --- a/eo3/aws.py +++ b/eo3/utils/aws.py @@ -2,51 +2,18 @@ Helper methods for working with AWS """ import os -import threading -import time -from types import SimpleNamespace from typing import Any, Dict, Optional, Tuple, Union from urllib.parse import urlparse from urllib.request import urlopen import botocore import botocore.session -from botocore.credentials import Credentials, ReadOnlyCredentials +from botocore.credentials import ReadOnlyCredentials from botocore.session import Session -# TODO CORE: Copy of datacube.utils.generic.py -_LCL = threading.local() - - -def thread_local_cache( - name: str, initial_value: Any = None, purge: bool = False -) -> Any: - """Define/get thread local object with a given name. 
- - :param name: name for this cache - :param initial_value: Initial value if not set for this thread - :param purge: If True delete from cache (returning what was there previously) - - Returns - ------- - value previously set in the thread or `initial_value` - """ - absent = object() - cc = getattr(_LCL, name, absent) - absent = cc is absent - - if absent: - cc = initial_value - - if purge: - if not absent: - delattr(_LCL, name) - else: - if absent: - setattr(_LCL, name, cc) - - return cc +from eo3.utils import thread_local_cache +# TODO: ideally this file would eventually be moved to a lower-level utils package # TODO CORE: Copy of datacube.utils.aws.__init__.py ByteRange = Union[slice, Tuple[int, int]] # pylint: disable=invalid-name @@ -61,7 +28,6 @@ def thread_local_cache( "ec2_current_region", "botocore_default_region", "auto_find_region", - "get_creds_with_retry", "mk_boto_session", ) @@ -168,26 +134,6 @@ def auto_find_region( return default -def get_creds_with_retry( - session: Session, max_tries: int = 10, sleep: float = 0.1 -) -> Optional[Credentials]: - """Attempt to obtain credentials upto `max_tries` times with back off - :param session: botocore session, see mk_boto_session - :param max_tries: number of attempt before failing and returing None - :param sleep: number of seconds to sleep after first failure (doubles on every consecutive failure) - """ - for i in range(max_tries): - if i > 0: - time.sleep(sleep) - sleep = min(sleep * 2, 10) - - creds = session.get_credentials() - if creds is not None: - return creds - - return None - - def mk_boto_session( profile: Optional[str] = None, creds: Optional[ReadOnlyCredentials] = None, @@ -375,68 +321,3 @@ def s3_open( bucket, key = s3_url_parse(url) oo = s3.get_object(Bucket=bucket, Key=key, **kwargs) # type: ignore[attr-defined] return oo["Body"] - - -def s3_head_object(url: str, s3: MaybeS3 = None, **kwargs) -> Optional[Dict[str, Any]]: - """ - Head object, return object metadata. - - :param url: s3://bucket/path/to/object - :param s3: pre-configured s3 client, see make_s3_client() - :param kwargs: are passed on to ``s3.head_object(..)`` - """ - from botocore.exceptions import ClientError - - s3 = s3 or s3_client() - bucket, key = s3_url_parse(url) - - try: - oo = s3.head_object(Bucket=bucket, Key=key, **kwargs) # type: ignore[attr-defined] - except ClientError: - return None - - meta = oo.pop("ResponseMetadata", {}) - code = meta.get("HTTPStatusCode", 0) - if 200 <= code < 300: - return oo - - # it actually raises exceptions when http code is in the "fail" range - return None # pragma: no cover - - -def obtain_new_iam_auth_token( - url: str, region_name: str = "auto", profile_name: Optional[str] = None -) -> str: - # Boto3 is not core requirement, but ImportError is probably the right exception to throw anyway. - from boto3.session import Session as Boto3Session - - session = Boto3Session(profile_name=profile_name) - client = session.client("rds", region_name=region_name) - return client.generate_db_auth_token( - DBHostname=url.host, Port=url.port, DBUsername=url.username, Region=region_name - ) - - -# TODO CORE: Copy from datacube.utils.rio.rio -_CFG_LOCK = threading.Lock() -_CFG = SimpleNamespace(aws=None, cloud_defaults=False, kwargs={}, epoch=0) - - -def set_default_rio_config(aws=None, cloud_defaults=False, **kwargs): - """Setup default configuration for rasterio/GDAL. - - Doesn't actually activate one, just stores configuration for future - use from IO threads. 
- - :param aws: Dictionary of options for rasterio.session.AWSSession - OR 'auto' -- session = rasterio.session.AWSSession() - - :param cloud_defaults: When True inject settings for reading COGs - :param **kwargs: Passed on to rasterio.Env(..) constructor - """ - global _CFG # pylint: disable=global-statement - - with _CFG_LOCK: - _CFG = SimpleNamespace( - aws=aws, cloud_defaults=cloud_defaults, kwargs=kwargs, epoch=_CFG.epoch + 1 - ) diff --git a/eo3/uris.py b/eo3/utils/uris.py similarity index 79% rename from eo3/uris.py rename to eo3/utils/uris.py index c9ed689e..a8850bd1 100644 --- a/eo3/uris.py +++ b/eo3/utils/uris.py @@ -4,9 +4,11 @@ import urllib.parse from pathlib import Path from typing import Optional, Union -from urllib.parse import urljoin, urlparse +from urllib.parse import parse_qsl, urljoin, urlparse from urllib.request import url2pathname +# TODO: ideally this file would eventually be moved to a lower-level utils package + # CORE TODO: forked from datacube.utils.uris @@ -171,6 +173,44 @@ def as_url(maybe_uri: str) -> str: return pathlib.Path(maybe_uri).absolute().as_uri() +def is_absolute(url): + """ + >>> is_absolute('LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + False + >>> is_absolute('data/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + False + >>> is_absolute('/g/data/somewhere/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + True + >>> is_absolute('file:///g/data/v10/somewhere/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + True + >>> is_absolute('http://example.com/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + True + >>> is_absolute('tar:///g/data/v10/somewhere/dataset.tar#LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + True + """ + location = urlparse(url) + return bool(location.scheme or location.netloc) or os.path.isabs(location.path) + + +def get_part_from_uri(url): + """ + >>> get_part_from_uri('path/to/file.tif') + >>> get_part_from_uri('path/to/file.tif#page=2') + >>> get_part_from_uri('path/to/file.tif#part=3') + 3 + >>> get_part_from_uri('path/to/file.tif#part=one') + 'one' + """ + opts = dict(parse_qsl(urlparse(url).fragment)) + part = opts.get("part") + if part is None: + return None + try: + return int(part) + except ValueError: + return part + + def register_scheme(*schemes): """ Register additional uri schemes as supporting relative offsets (etc), so that band/measurement paths can be diff --git a/eo3/utils.py b/eo3/utils/utils.py similarity index 75% rename from eo3/utils.py rename to eo3/utils/utils.py index 01870996..3ed2a336 100644 --- a/eo3/utils.py +++ b/eo3/utils/utils.py @@ -5,30 +5,27 @@ import math import os import re +import threading from collections import OrderedDict from contextlib import contextmanager from datetime import date, datetime, timezone from decimal import Decimal from pathlib import Path -from typing import Any, Dict, Iterable, Mapping, Sequence, Tuple, Union +from typing import Any, Iterable, Mapping, Sequence, Tuple, Union from urllib.parse import urlparse from urllib.request import urlopen from uuid import UUID -import yaml - -try: - from yaml import CSafeLoader as SafeLoader # type: ignore -except ImportError: - from yaml import SafeLoader # type: ignore - import ciso8601 import click +import dateutil.parser import numpy +from ruamel.yaml import YAML, YAMLError -from eo3.uris import as_url, mk_part_uri +from .uris import as_url, mk_part_uri, uri_to_local_path -EO3_SCHEMA = "https://schemas.opendatacube.org/dataset" +# TODO: ideally the functions marked as 'general util' (originally copied +# over from core) would 
eventually be moved to a lower-level utils package class ItemProvider(enum.Enum): @@ -161,33 +158,6 @@ def get_collection_number( ) -def is_doc_eo3(doc: Dict[str, Any]) -> bool: - """Is this document eo3? - - :param doc: Parsed ODC Dataset metadata document - - :returns: - False if this document is a legacy dataset - True if this document is eo3 - - :raises ValueError: For an unsupported document - """ - schema = doc.get("$schema") - # All legacy documents had no schema at all. - if schema is None: - return False - - if schema == EO3_SCHEMA: - return True - - # Otherwise it has an unknown schema. - # - # Reject it for now. - # We don't want future documents (like Stac items, or "eo4") to be quietly - # accepted as legacy eo. - raise ValueError(f"Unsupported dataset schema: {schema!r}") - - def flatten_dict( d: Mapping, prefix: str = None, separator: str = "." ) -> Iterable[Tuple[str, Any]]: @@ -212,49 +182,26 @@ def flatten_dict( # CORE TODO: from datacube.utils.documents +# TODO: general util @contextmanager def _open_from_s3(url): o = urlparse(url) if o.scheme != "s3": raise RuntimeError("Abort abort I don't know how to open non s3 urls") - from .aws import s3_open + from eo3.utils.aws import s3_open yield s3_open(url) # CORE TODO: from datacube.utils.documents +# TODO: general util def _open_with_urllib(url): return urlopen(url) # nosec B310 # CORE TODO: from datacube.utils.documents -class NoDatesSafeLoader(SafeLoader): # pylint: disable=too-many-ancestors - @classmethod - def remove_implicit_resolver(cls, tag_to_remove): - """ - Removes implicit resolvers for a particular tag - - Takes care not to modify resolvers in super classes. - - We want to load datetimes as strings, not dates. We go on to - serialise as json which doesn't have the advanced types of - yaml, and leads to slightly different objects down the track. - """ - if "yaml_implicit_resolvers" not in cls.__dict__: - cls.yaml_implicit_resolvers = cls.yaml_implicit_resolvers.copy() - - for first_letter, mappings in cls.yaml_implicit_resolvers.items(): - cls.yaml_implicit_resolvers[first_letter] = [ - (tag, regexp) for tag, regexp in mappings if tag != tag_to_remove - ] - - -# CORE TODO: from datacube.utils.documents -NoDatesSafeLoader.remove_implicit_resolver("tag:yaml.org,2002:timestamp") - - -# CORE TODO: from datacube.utils.documents +# TODO: general util _PROTOCOL_OPENERS = { "s3": _open_from_s3, "ftp": _open_with_urllib, @@ -265,16 +212,57 @@ def remove_implicit_resolver(cls, tag_to_remove): # CORE TODO: from datacube.utils.documents -def load_from_yaml(handle, parse_dates=False): - loader = SafeLoader if parse_dates else NoDatesSafeLoader - yield from yaml.load_all(handle, Loader=loader) # noqa: DUO109 +# TODO: general util +def load_from_yaml(handle): + yield from YAML(typ="safe").load_all(handle) # noqa: DUO109 # CORE TODO: from datacube.utils.documents +# TODO: general util def load_from_json(handle): yield json.load(handle) +# TODO: general util +def load_from_netcdf(path): + for doc in read_strings_from_netcdf(path, variable="dataset"): + yield YAML(typ="safe").load(doc) + + +# TODO: general util +def netcdf_extract_string(chars): + """ + Convert netcdf S|U chars to Unicode string. 
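+
+    Used by read_strings_from_netcdf() below to decode the 'dataset' variable
+    stored in ODC NetCDF files (see load_from_netcdf()).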
+ """ + import netCDF4 # type: ignore[import] + + if isinstance(chars, str): + return chars + + chars = netCDF4.chartostring(chars) + if chars.dtype.kind == "U": + return str(chars) + else: + return str(numpy.char.decode(chars)) + + +# TODO: general util +def read_strings_from_netcdf(path, variable): + """ + Load all of the string encoded data from a variable in a NetCDF file. + + By 'string', the CF conventions mean ascii. + + Useful for loading dataset metadata information. + """ + import netCDF4 + + with netCDF4.Dataset(str(path)) as ds: + for chars in ds[variable]: + yield netcdf_extract_string(chars) + + +# TODO: general util _PARSERS = { ".yaml": load_from_yaml, ".yml": load_from_yaml, @@ -282,6 +270,7 @@ def load_from_json(handle): } +# TODO: general util def transform_object_tree(f, o, key_transform=lambda k: k): """ Apply a function (f) on all the values in the given document tree (o), returning a new document of @@ -312,6 +301,7 @@ def recur(o_): return f(o) +# TODO: general util def jsonify_document(doc): """ Make a document ready for serialisation as JSON. @@ -340,6 +330,7 @@ def fixup_value(v): return transform_object_tree(fixup_value, doc, key_transform=str) +# TODO: general util def load_documents(path): """ Load document/s from the specified path. @@ -348,7 +339,7 @@ def load_documents(path): - JSON and YAML locally and remotely. - Compressed JSON and YAML locally - - Data Cube Dataset Documents inside local NetCDF files. # CORE TODO: stripped out for now??? + - Data Cube Dataset Documents inside local NetCDF files. :param path: path or URI to load documents from :return: generator of dicts @@ -359,28 +350,30 @@ def load_documents(path): scheme = urlparse(url).scheme compressed = url[-3:] == ".gz" - # if scheme == 'file' and path[-3:] == '.nc': - # path = uri_to_local_path(url) - # yield from load_from_netcdf(path) - # lse: - with _PROTOCOL_OPENERS[scheme](url) as fh: - if compressed: - fh = gzip.open(fh) - path = path[:-3] + if scheme == "file" and path[-3:] == ".nc": + path = uri_to_local_path(url) + yield from load_from_netcdf(path) + else: + with _PROTOCOL_OPENERS[scheme](url) as fh: + if compressed: + fh = gzip.open(fh) + path = path[:-3] - suffix = Path(path).suffix + suffix = Path(path).suffix - parser = _PARSERS[suffix] + parser = _PARSERS[suffix] - yield from parser(fh) + yield from parser(fh) # CORE TODO: from datacube.utils.documents +# TODO: general util class InvalidDocException(Exception): # noqa: N818 pass # CORE TODO: from datacube.utils.generic +# TODO: general util def map_with_lookahead(it, if_one=None, if_many=None): """ It's like normal map: creates a new generator by applying a function to every @@ -408,6 +401,7 @@ def map_with_lookahead(it, if_one=None, if_many=None): yield proc(v) +# TODO: general util def read_documents(*paths, uri=False): """ Read and parse documents from the filesystem or remote URLs (yaml or json). @@ -451,13 +445,20 @@ def add_uri_with_part(x): yield from process_file(path) except InvalidDocException as e: raise e - except (yaml.YAMLError, ValueError) as e: + except (YAMLError, ValueError) as e: raise InvalidDocException(f"Failed to load {path}: {e}") except Exception as e: raise InvalidDocException(f"Failed to load {path}: {e}") +# TODO: general util +def read_file(p: Path): + """Shorthand for when you just need to get the dict representation of 1 file""" + return next(iter(read_documents(p)))[1] + + # CORE TODO: from datacube.utils.changes +# TODO: general util # Type that can be checked for changes. 
# (MyPy approximation without recursive references) Changable = Union[str, int, None, Sequence[Any], Mapping[str, Any]] @@ -496,3 +497,61 @@ def _is_nan(v): if isinstance(v, str): return v == "NaN" return isinstance(v, float) and math.isnan(v) + + +# CORE TODO: from datacube.utils.dates +# TODO: general util +def parse_time(time: Union[str, datetime]) -> datetime: + """Convert string to datetime object + + This function deals with ISO8601 dates fast, and fallbacks to python for + other formats. + + Calling this on datetime object is a no-op. + """ + if isinstance(time, str): + try: + from ciso8601 import ( # pylint: disable=wrong-import-position # noqa: F401 + parse_datetime, + ) + + return parse_datetime(time) + except (ImportError, ValueError): # pragma: no cover + return dateutil.parser.parse(time) + + return time + + +# CORE TODO: from datacube.utils.generic.py +# TODO: general util +_LCL = threading.local() + + +def thread_local_cache( + name: str, initial_value: Any = None, purge: bool = False +) -> Any: + """Define/get thread local object with a given name. + + :param name: name for this cache + :param initial_value: Initial value if not set for this thread + :param purge: If True delete from cache (returning what was there previously) + + Returns + ------- + value previously set in the thread or `initial_value` + """ + absent = object() + cc = getattr(_LCL, name, absent) + absent = cc is absent + + if absent: + cc = initial_value + + if purge: + if not absent: + delattr(_LCL, name) + else: + if absent: + setattr(_LCL, name, cc) + + return cc diff --git a/eo3/validate.py b/eo3/validate.py index ca7748e4..126b94d0 100644 --- a/eo3/validate.py +++ b/eo3/validate.py @@ -1,350 +1,28 @@ """ Validate ODC dataset documents """ -import enum -from datetime import datetime -from pathlib import Path +import warnings from textwrap import indent -from typing import ( - Dict, - Generator, - Iterable, - List, - Mapping, - MutableMapping, - Optional, - Sequence, - Set, - Tuple, - Union, -) -from urllib.parse import urlparse -from uuid import UUID - -import attr -import cattr -import ciso8601 -import rasterio -import toolz -from attr import Factory, define, field, frozen -from cattrs import ClassValidationError -from click import echo -from rasterio import DatasetReader -from rasterio.crs import CRS -from rasterio.errors import CRSError -from shapely.validation import explain_validity - -from eo3 import model, serialise, utils -from eo3.eo3_core import prep_eo3 -from eo3.metadata.validate import validate_metadata_type -from eo3.model import AccessoryDoc, Eo3DatasetDocBase -from eo3.product.validate import validate_product -from eo3.ui import get_part, is_absolute, uri_resolve -from eo3.uris import is_url -from eo3.utils import ( - EO3_SCHEMA, - InvalidDocException, - _is_nan, - contains, - default_utc, - load_documents, - read_documents, -) -from eo3.validation_msg import ( - ContextualMessager, - Level, - ValidationMessage, - ValidationMessages, -) - -DEFAULT_NULLABLE_FIELDS = ("label",) -DEFAULT_OPTIONAL_FIELDS = ( - # Older product do not have this field at all, and when not specified it is considered stable. - "dataset_maturity", -) - - -class DocKind(enum.Enum): - # EO3 datacube dataset. 
- dataset = 1 - # Datacube product - product = 2 - # Datacube Metadata Type - metadata_type = 3 - # Stac Item - stac_item = 4 - # Legacy datacube ("eo1") dataset - legacy_dataset = 5 - # Legacy product config for ingester - ingestion_config = 6 - - @property - def is_legacy(self): - return self in (self.legacy_dataset, self.ingestion_config) - - -# What kind of document each suffix represents. -# (full suffix will also have a doc type: .yaml, .json, .yaml.gz etc) -# Example: "my-test-dataset.odc-metadata.yaml" -SUFFIX_KINDS = { - ".odc-metadata": DocKind.dataset, - ".odc-product": DocKind.product, - ".odc-type": DocKind.metadata_type, -} -# Inverse of above -DOC_TYPE_SUFFIXES = {v: k for k, v in SUFFIX_KINDS.items()} - - -def filename_doc_kind(path: Union[str, Path]) -> Optional["DocKind"]: - """ - Get the expected file type for the given filename. - - Returns None if it does not follow any naming conventions. - - >>> filename_doc_kind('LC8_2014.odc-metadata.yaml').name - 'dataset' - >>> filename_doc_kind('/tmp/something/water_bodies.odc-metadata.yaml.gz').name - 'dataset' - >>> filename_doc_kind(Path('/tmp/something/ls8_fc.odc-product.yaml')).name - 'product' - >>> filename_doc_kind(Path('/tmp/something/ls8_wo.odc-product.json.gz')).name - 'product' - >>> filename_doc_kind(Path('/tmp/something/eo3_gqa.odc-type.yaml')).name - 'metadata_type' - >>> filename_doc_kind(Path('/tmp/something/some_other_file.yaml')) - """ - - for suffix in reversed(Path(path).suffixes): - suffix = suffix.lower() - if suffix in SUFFIX_KINDS: - return SUFFIX_KINDS[suffix] - - return None +from typing import Dict, Iterable, List, Mapping, Set, Tuple +import toolz -def guess_kind_from_contents(doc: Dict): - """ - What sort of document do the contents look like? - """ - if "$schema" in doc and doc["$schema"] == EO3_SCHEMA: - return DocKind.dataset - if "metadata_type" in doc: - if "source_type" in doc: - return DocKind.ingestion_config - return DocKind.product - if ("dataset" in doc) and ("search_fields" in doc["dataset"]): - return DocKind.metadata_type - if "id" in doc: - if ("lineage" in doc) and ("platform" in doc): - return DocKind.legacy_dataset - - if ("properties" in doc) and ("datetime" in doc["properties"]): - return DocKind.stac_item - - return None - - -@frozen(init=True) -class ValidationExpectations: - """ - What expectations do we have when validating this dataset? - """ - - #: Allow these extra measurement names to be included in the dataset. - #: (ODC allows unlisted measurement names, but it's usually a mistake) - allow_extra_measurements: Sequence[str] = () - - #: Do we expect full geometry information in every dataset? - #: (It's optional in ODC, but often a mistake to miss it) - require_geometry: bool = True - - #: Are any of the configured fields nullable? - allow_nullable_fields: Sequence[str] = field( - default=Factory(lambda: DEFAULT_NULLABLE_FIELDS) - ) - #: Can any of the fields be completely omitted from the document? - allow_missing_fields: Sequence[str] = field( - default=Factory(lambda: DEFAULT_OPTIONAL_FIELDS) - ) - - def with_document_overrides(self, doc: Dict): - """ - Return an instance with any overrides from the given document. - - (TODO: Overrides are passed in in "default_allowances" section of product or metadata - document but are not part of the schema, so using them renders the document - invalid. Bad API design, IMO.) 
- """ - if "default_allowances" not in doc: - return self - - overridden_values = {**attr.asdict(self), **doc["default_allowances"]} - # Merge, don't replace, these lists. - overridden_values["allow_nullable_fields"] = list( - {*overridden_values["allow_nullable_fields"], *self.allow_nullable_fields} - ) - overridden_values["allow_missing_fields"] = list( - {*overridden_values["allow_missing_fields"], *self.allow_missing_fields} - ) - overridden_values["allow_extra_measurements"] = list( - { - *overridden_values["allow_extra_measurements"], - *self.allow_extra_measurements, - } - ) - return cattr.structure(overridden_values, self.__class__) +from eo3 import schema, utils +from eo3.fields import all_field_offsets +from eo3.utils import contains, get_part_from_uri, is_absolute +from eo3.validation_msg import ContextualMessager, Level, ValidationMessages -def validate_dataset( - doc: Dict, - product_definition: Optional[Dict] = None, - product_definitions: Optional[Dict] = None, - metadata_type_definition: Optional[Mapping[str, Dict]] = None, - thorough: bool = False, - readable_location: Union[str, Path] = None, - expect: ValidationExpectations = None, +def validate_ds_to_schema( + doc: Dict, msg: ContextualMessager = None ) -> ValidationMessages: - """ - Validate a dataset document, optionally against the given product. - - By default this will only look at the metadata, run with thorough=True to - open the data files too. - - :param product_definition: Optionally check that the dataset matches this product definition. - :param thorough: Open the imagery too, to check that data types etc match. - :param readable_location: Dataset location to use, if not the metadata path. - :param expect: Where can we be lenient in validation? - """ - # Prepare validation context and contextual message builder - expect = expect or ValidationExpectations() - validation_context = {} - if metadata_type_definition is not None: - expect = expect.with_document_overrides(metadata_type_definition) - validation_context["type"] = metadata_type_definition["name"] - if product_definition is not None: - expect = expect.with_document_overrides(product_definition) - validation_context["product"] = product_definition["name"] - elif product_definitions is not None: - product_name = doc.get("product", {}).get("name") - if product_name and product_name in product_definitions: - product_definition = product_definitions[product_name] - expect = expect.with_document_overrides(product_definition) - validation_context["product"] = product_name - - msg = ContextualMessager(validation_context) - - if expect.allow_extra_measurements: - yield msg.warning("extra_measurements", "Extra measurements are deprecated") - - if thorough and not product_definition: - yield msg.error( - "no_product", "Must supply product definition for thorough validation" - ) - - # Validate against schema and deserialise to a (base eo3) dataset doc - yield from _validate_ds_to_schema(doc, msg) - if msg.errors: - return - - # Validate Lineage before serialisation for clearer error reporting. (Get incomprehensible error messages - # for invalid UUIDs) - yield from _validate_lineage(doc.get("lineage", {}), msg) - if msg.errors: - return - - # TODO: How to make this step more extensible? 
- try: - dataset = serialise.from_doc(doc, skip_validation=True) - except ClassValidationError as e: - - def expand(err: ClassValidationError) -> str: - expanded = err.message - try: - for sub_err in err.exceptions: - expanded += expand(sub_err) - except AttributeError: - pass - return expanded - - yield msg.error("serialisation_failure", f"Serialisation failed: {expand(e)}") - return - - # non-schema basic validation - if not dataset.product.href: - yield msg.info("product_href", "A url (href) is recommended for products") - - if doc.get("location"): - yield msg.warning( - "dataset_location", - "Location is deprecated and will be removed in a future release. Use 'locations' instead.", - ) - - # Validate geometry - yield from _validate_geo(dataset, msg, expect_geometry=expect.require_geometry) - if msg.errors: - return - - # Previously a dataset could have no measurements (eg. telemetry data). - if expect.require_geometry: - if dataset.measurements: - yield from _validate_measurements(dataset, msg) - if msg.errors: - return - - # Base properties - # Validation is implemented in Eo3DictBase so it can be extended - yield from dataset.properties.validate_eo3_properties(msg) - - # Accessories - for acc_name, accessory in dataset.accessories.items(): - yield from _validate_accessory(acc_name, accessory, msg) - - required_measurements: Dict[str, ExpectedMeasurement] = {} - - # Validate dataset against product and metadata type definitions - if product_definition is not None: - yield from _validate_ds_to_product( - dataset, - required_measurements, - product_definition, - allow_extra_measurements=expect.allow_extra_measurements, - msg=msg, - ) - if msg.errors: - return - - if metadata_type_definition: - yield from _validate_ds_to_metadata_type( - doc, metadata_type_definition, expect, msg - ) - - if thorough: - # Validate contents of actual data against measurement metadata - yield from _validate_ds_against_data( - dataset, readable_location, required_measurements, msg - ) - - -def _validate_ds_to_schema(doc: Dict, msg: ContextualMessager) -> ValidationMessages: """ Validate against eo3 schema """ - schema = doc.get("$schema") - if schema is None: - yield msg.error( - "no_schema", - f"No $schema field. " - f"You probably want an ODC dataset schema {model.ODC_DATASET_SCHEMA_URL!r}", - ) - return - if schema != model.ODC_DATASET_SCHEMA_URL: - yield msg.error( - "unknown_doc_type", - f"Unknown doc schema {schema!r}. 
Only ODC datasets are supported ({model.ODC_DATASET_SCHEMA_URL!r})", - ) - return + if msg is None: + msg = ContextualMessager() - for error in serialise.DATASET_SCHEMA.iter_errors(doc): + for error in schema.DATASET_SCHEMA.iter_errors(doc): displayable_path = ".".join(error.absolute_path) hint = None @@ -354,331 +32,141 @@ def _validate_ds_to_schema(doc: Dict, msg: ContextualMessager) -> ValidationMess context = f"({displayable_path}) " if displayable_path else "" yield msg.error("structure", f"{context}{error.message} ", hint=hint) - -def _validate_measurements(dataset: Eo3DatasetDocBase, msg: ContextualMessager): - for name, measurement in dataset.measurements.items(): - grid_name = measurement.grid - if grid_name != "default" or dataset.grids: - if grid_name not in dataset.grids: - yield msg.error( - "invalid_grid_ref", - f"Measurement {name!r} refers to unknown grid {grid_name!r}", - ) - - if is_absolute(measurement.path): - yield msg.warning( - "absolute_path", - f"measurement {name!r} has an absolute path: {measurement.path!r}", - ) - - part = get_part(measurement.path) - if part is not None: + # properties detailed in the schema that are optional but recommended + recommended = [["product", "href"], ["properties", "dea:dataset_maturity"]] + for r in recommended: + if toolz.get_in(r, doc) is None: yield msg.warning( - "uri_part", - f"measurement {name!r} has a part in the path. (Use band and/or layer instead)", - ) - if isinstance(part, int): - if part < 0: - yield msg.error( - "uri_invalid_part", - f"measurement {name!r} has an invalid part (less than zero) in the path ({part})", - ) - elif isinstance(part, str): - yield msg.error( - "uri_invalid_part", - f"measurement {name!r} has an invalid part (non-integer) in the path ({part})", + "recommended_field", f"Field {'->'.join(r)} is optional but recommended" ) -def _validate_accessory(name: str, accessory: AccessoryDoc, msg: ContextualMessager): - accessory.name = name - if is_absolute(accessory.path): - yield msg.warning( - "absolute_path", - f"Accessory {accessory.name!r} has an absolute path: {accessory.path!r}", - ) - - -def _validate_lineage(lineage, msg): - for label, parent_ids in lineage.items(): - if len(parent_ids) > 1: - yield msg.info( - "nonflat_lineage", - f"Lineage label {label} has multiple sources and may get flattened on indexing " - "depending on the index driver", - ) - for parent_id in parent_ids: - try: - UUID(parent_id) - except ValueError: - yield msg.error( - "invalid_source_id", - f"Lineage id in {label} is not a valid UUID {parent_id}", - ) - - -def _validate_ds_to_product( - dataset: Eo3DatasetDocBase, - required_measurements: MutableMapping[str, "ExpectedMeasurement"], +def validate_ds_to_product( + doc: Dict, product_definition: Mapping, - allow_extra_measurements: Sequence[str], - msg: ContextualMessager, + msg: ContextualMessager = None, ): - required_measurements.update( - { - m.name: m - for m in map( - ExpectedMeasurement.from_definition, - product_definition.get("measurements") or (), - ) - } - ) - product_name = product_definition.get("name") - if product_name and product_name != dataset.product.name: + """Validate dataset is consistent with product definition""" + if msg is None: + msg = ContextualMessager({"product": product_definition.get("name")}) + + product_name = msg.context.get("product") + ds_product_name = doc.get("product").get("name") + if product_name and product_name != ds_product_name: yield msg.error( "product_mismatch", - f"Dataset product name {dataset.product.name!r} " - f"does 
not match the given product ({product_name!r}", + f"Dataset product name {ds_product_name!r} " + f"does not match the given product {product_name!r}", ) - ds_props = dict(dataset.properties) + ds_props = doc.get("properties") prod_props = product_definition["metadata"].get("properties", {}) if not contains(ds_props, prod_props): diffs = tuple(_get_printable_differences(ds_props, prod_props)) difference_hint = _differences_as_hint(diffs) yield msg.error( "metadata_mismatch", - "Dataset template does not match product document template.", + f"Dataset template does not match product document template for product {product_name!r}.", hint=difference_hint, ) - for name in required_measurements: - if name not in dataset.measurements.keys(): + product_measurement_names = [ + m["name"] for m in product_definition.get("measurements") + ] + doc_measurements = doc.get("measurements").keys() + for name in product_measurement_names: + if name not in doc_measurements: yield msg.error( "missing_measurement", f"Product {product_name} expects a measurement {name!r})", ) - measurements_not_in_product = set(dataset.measurements.keys()).difference( + measurements_not_in_product = set(doc_measurements).difference( {m["name"] for m in product_definition.get("measurements") or ()} ) - # Remove the measurements that are allowed to be extra. - measurements_not_in_product.difference_update(allow_extra_measurements or set()) if measurements_not_in_product: things = ", ".join(sorted(measurements_not_in_product)) yield msg.warning( "extra_measurements", f"Dataset has measurements not present in product definition for {product_name!r}: {things}", - hint="This may be valid, as it's allowed by ODC. Set `expect_extra_measurements` to mute this.", ) -def _validate_ds_to_metadata_type( +def validate_ds_to_metadata_type( doc: Dict, metadata_type_definition: Dict, - expect: ValidationExpectations, - msg: ContextualMessager, + msg: ContextualMessager = None, ): - # Datacube does certain transforms on an eo3 doc before storage. - # We need to do the same, as the fields will be read from the storage. - prepared_doc = prep_eo3(doc) + """ + Validate against the metadata type definition. A dataset doesn't have to include + all metadata type fields, but users should be warned that there are missing fields. + """ + if msg is None: + msg = ContextualMessager() - all_nullable_fields = tuple(expect.allow_nullable_fields) + tuple( - expect.allow_missing_fields - ) - for field_name, offsets in _get_field_offsets( - metadata_type=metadata_type_definition - ): - if ( - # If a field is required... - (field_name not in expect.allow_missing_fields) - and - # ... and none of its offsets are in the document - not any(_has_offset(prepared_doc, offset) for offset in offsets) + for field_name, offsets in _get_field_offsets(metadata_type_definition): + # If none of a field's offsets are in the document - ignore for lineage + if field_name != "sources" and not any( + _has_offset(doc, offset) for offset in offsets ): # ... warn them. 
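+            # (A missing field is not fatal: the field will simply be
+            # unpopulated when the dataset is indexed, hence a warning
+            # rather than an error.)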
readable_offsets = " or ".join("->".join(offset) for offset in offsets) yield msg.warning( "missing_field", f"Dataset is missing field {field_name!r} " - f"for type {metadata_type_definition['name']!r}", - hint=f"Expected at {readable_offsets}", + f"expected by metadata type {metadata_type_definition['name']!r}", + hint=f"Expected at offset {readable_offsets}", ) continue - if field_name not in all_nullable_fields: - value = None - for offset in offsets: - value = toolz.get_in(offset, prepared_doc) - if value is None: - yield msg.info( - "null_field", - f"Value is null for configured field {field_name!r}", - ) - - -def _validate_ds_against_data( - dataset: Eo3DatasetDocBase, - readable_location: str, - required_measurements: Dict[str, "ExpectedMeasurement"], - msg: ContextualMessager, -): - # For each measurement, try to load it. - # If loadable, validate measurements exist and match expectations. - dataset_location = dataset.locations[0] if dataset.locations else readable_location - for name, measurement in dataset.measurements.items(): - full_path = uri_resolve(dataset_location, measurement.path) - expected_measurement = required_measurements.get(name) - - band = measurement.band or 1 - with rasterio.open(full_path) as ds: - ds: DatasetReader - - if band not in ds.indexes: - yield msg.error( - "incorrect_band", - f"Measurement {name!r} file contains no rio index {band!r}.", - hint=f"contains indexes {ds.indexes!r}", - ) - continue - - if not expected_measurement: - # The measurement is not in the product definition - # - # This is only informational because a product doesn't have to define all - # measurements that the datasets contain. - # - # This is historically because dataset documents reflect the measurements that - # are stored on disk, which can differ. But products define the set of measurments - # that are mandatory in every dataset. - # - # (datasets differ when, for example, sensors go offline, or when there's on-disk - # measurements like panchromatic that GA doesn't want in their product definitions) - if required_measurements: - yield msg.info( - "unspecified_measurement", - f"Measurement {name} is not in the product", - ) - else: - expected_dtype = expected_measurement.dtype - band_dtype = ds.dtypes[band - 1] - if expected_dtype != band_dtype: - yield ValidationMessage.error( - "different_dtype", - f"{name} dtype: " - f"product {expected_dtype!r} != dataset {band_dtype!r}", - ) - - ds_nodata = ds.nodatavals[band - 1] - - # If the dataset is missing 'nodata', we can allow anything in product 'nodata'. - # (In ODC, nodata might be a fill value for loading data.) - if ds_nodata is None: - continue - - # Otherwise check that nodata matches. - expected_nodata = expected_measurement.nodata - if expected_nodata != ds_nodata and not ( - _is_nan(expected_nodata) and _is_nan(ds_nodata) - ): - yield msg.error( - "different_nodata", - f"{name} nodata: " - f"product {expected_nodata !r} != dataset {ds_nodata !r}", - ) + +def validate_measurement_path( + name, path, msg: ContextualMessager = None +) -> ValidationMessages: + if msg is None: + msg = ContextualMessager() + + if is_absolute(path): + yield msg.warning( + "absolute_path", + f"measurement {name!r} has an absolute path: {path!r}", + ) + + part = get_part_from_uri(path) + if part is not None: + yield msg.warning( + "uri_part", + f"measurement {name!r} has a part in the path. 
(Use band and/or layer instead)", + ) + if isinstance(part, int): + if part < 0: + yield msg.error( + "uri_invalid_part", + f"measurement {name!r} has an invalid part (less than zero) in the path ({part})", + ) + elif isinstance(part, str): + yield msg.error( + "uri_invalid_part", + f"measurement {name!r} has an invalid part (non-integer) in the path ({part})", + ) def _has_offset(doc: Dict, offset: List[str]) -> bool: """ Is the given offset present in the document? """ - for key in offset: - if key not in doc: - return False - doc = doc[key] - return True - - -@define -class ExpectedMeasurement: - name: str - dtype: str - nodata: int - - @classmethod - def from_definition(cls, doc: Dict): - return ExpectedMeasurement(doc["name"], doc.get("dtype"), doc.get("nodata")) + try: + toolz.get_in(offset, doc, no_default=True) + return True + except (KeyError, IndexError): + return False # Name of a field and its possible offsets in the document. -FieldNameOffsetS = Tuple[str, Set[List[str]]] - - -def validate_paths( - paths: List[str], - thorough: bool = False, - product_definitions: Dict[str, Dict] = None, - metadata_type_definitions: Dict[str, Dict] = None, - expect: ValidationExpectations = None, -) -> Generator[Tuple[str, List[ValidationMessage]], None, None]: - """Validate the list of paths. Product documents can be specified before their datasets.""" - - products = dict(product_definitions or {}) - metadata_types = dict(metadata_type_definitions or {}) - - for url, doc, was_specified_by_user in read_paths(paths): - messages = [] - kind = filename_doc_kind(url) - if kind is None: - kind = guess_kind_from_contents(doc) - if kind and (kind in DOC_TYPE_SUFFIXES): - # It looks like an ODC doc but doesn't have the standard suffix. - messages.append( - ValidationMessage.warning( - "missing_suffix", - f"Document looks like a {kind.name} but does not have " - f'filename extension "{DOC_TYPE_SUFFIXES[kind]}{_readable_doc_extension(url)}"', - ) - ) - - if kind == DocKind.product: - messages.extend(validate_product(doc)) - if "name" in doc: - products[doc["name"]] = doc - elif kind == DocKind.dataset: - messages.extend( - validate_eo3_doc( - doc, - url, - products, - metadata_types, - thorough, - expect=expect, - ) - ) - elif kind == DocKind.metadata_type: - messages.extend(validate_metadata_type(doc)) - if "name" in doc: - metadata_types[doc["name"]] = doc - - # Otherwise it's a file we don't support. - # If the user gave us the path explicitly, it seems to be an error. - # (if they didn't -- it was found via scanning directories -- we don't care.) - elif was_specified_by_user: - if kind is None: - raise ValueError(f"Unknown document type for {url}") - else: - raise NotImplementedError( - f"Cannot currently validate {kind.name} files" - ) - else: - # Not a doc type we recognise, and the user didn't specify it. Skip it. - continue - - yield url, messages +FieldNameOffsets = Tuple[str, Set[List[str]]] -def _get_field_offsets(metadata_type: Dict) -> Iterable[FieldNameOffsetS]: +def _get_field_offsets(metadata_type: Dict) -> Iterable[FieldNameOffsets]: """ Yield all fields and their possible document-offsets that are expected for this metadata type. @@ -688,141 +176,7 @@ def _get_field_offsets(metadata_type: Dict) -> Iterable[FieldNameOffsetS]: (Properties can have multiple offsets, where ODC will choose the first non-null one, hence the return of multiple offsets for each field.) 
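+
+    For example (illustrative only), a metadata type whose search_fields
+    define a 'platform' field at offset [properties, eo:platform] would
+    yield ('platform', [['properties', 'eo:platform']]) here.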
""" - dataset_section = metadata_type["dataset"] - search_fields = dataset_section["search_fields"] - - # The fixed fields of ODC. 'id', 'label', etc. - for field_ in dataset_section: - if field_ == "search_fields": - continue - - offset = dataset_section[field_] - if offset is not None: - yield field_, [offset] - - # The configurable search fields. - for field_, spec in search_fields.items(): - offsets = [] - if "offset" in spec: - offsets.append(spec["offset"]) - offsets.extend(spec.get("min_offset", [])) - offsets.extend(spec.get("max_offset", [])) - - yield field_, offsets - - -def _readable_doc_extension(uri: str): - """ - >>> _readable_doc_extension('something.json.gz') - '.json.gz' - >>> _readable_doc_extension('something.yaml') - '.yaml' - >>> _readable_doc_extension('apple.odc-metadata.yaml.gz') - '.yaml.gz' - >>> _readable_doc_extension('products/tmad/tmad_product.yaml#part=1') - '.yaml' - >>> _readable_doc_extension('/tmp/human.06.tall.yml') - '.yml' - >>> # Not a doc, even though it's compressed. - >>> _readable_doc_extension('db_dump.gz') - >>> _readable_doc_extension('/tmp/nothing') - """ - path = urlparse(uri).path - compression_formats = (".gz",) - doc_formats = ( - ".yaml", - ".yml", - ".json", - ) - suffix = "".join( - s.lower() - for s in Path(path).suffixes - if s.lower() in doc_formats + compression_formats - ) - # If it's only compression, no doc format, it's not valid. - if suffix in compression_formats: - return None - return suffix or None - - -def read_paths( - input_paths: Iterable[str], -) -> Generator[Tuple[str, Union[Dict, str], bool], None, None]: - """ - Read the given input paths, returning a URL, document, and whether - it was explicitly given by the user. - - When a local directory is specified, inner readable docs are returned, but will - be marked as not explicitly specified. - """ - for input_ in input_paths: - for uri, was_specified in expand_paths_as_uris([input_]): - try: - for full_uri, doc in read_documents(uri, uri=True): - yield full_uri, doc, was_specified - except InvalidDocException as e: - if was_specified: - raise - else: - echo(e, err=True) - - -def expand_paths_as_uris( - input_paths: Iterable[str], -) -> Generator[Tuple[Path, bool], None, None]: - """ - For any paths that are directories, find inner documents that are known. - - Returns Tuples: path as a URL, and whether it was specified explicitly by user. 
- """ - for input_ in input_paths: - if is_url(input_): - yield input_, True - else: - path = Path(input_).resolve() - if path.is_dir(): - for found_path in path.rglob("*"): - if _readable_doc_extension(found_path.as_uri()) is not None: - yield found_path.as_uri(), False - else: - yield path.as_uri(), True - - -def validate_eo3_doc( - doc: Dict, - location: Union[str, Path], - products: Dict[str, Dict], - metadata_types: Dict[str, Dict], - thorough: bool = False, - expect: ValidationExpectations = None, -) -> List[ValidationMessage]: - messages = [] - - matched_product = None - - metadata_type = None - if metadata_types and matched_product: - metadata_type = matched_product["metadata_type"] - if metadata_type not in metadata_types: - messages.append( - ValidationMessage( - Level.error if thorough else Level.info, - "no_metadata_type", - f"Metadata type not provided {metadata_type}: not validating fields", - ) - ) - - messages.extend( - validate_dataset( - doc, - product_definitions=products, - readable_location=location, - thorough=thorough, - metadata_type_definition=metadata_types.get(metadata_type), - expect=expect, - ) - ) - return messages + yield from all_field_offsets(metadata_type).items() def _get_printable_differences(dict1: Dict, dict2: Dict): @@ -838,156 +192,31 @@ def _get_printable_differences(dict1: Dict, dict2: Dict): yield f"{path}: {v1!r} != {v2!r}" -def _get_product_mismatch_reasons(dataset_doc: Dict, product_definition: Dict): - """ - Which fields don't match the given dataset doc to a product definition? - - Gives human-readable lines of text. - """ - yield from _get_printable_differences(dataset_doc, product_definition["metadata"]) - - def _differences_as_hint(product_diffs): return indent("\n".join(product_diffs), prefix="\t") -def _validate_eo3_properties(dataset: Eo3DatasetDocBase, msg: ContextualMessager): - for name, value in dataset.properties.items(): - if name in dataset.properties.KNOWN_PROPERTIES: - normaliser = dataset.properties.KNOWN_PROPERTIES.get(name) - if normaliser and value is not None: - try: - normalised_value = normaliser(value) - # A normaliser can return two values, the latter adding extra extracted fields. - if isinstance(normalised_value, tuple): - normalised_value = normalised_value[0] - - # It's okay for datetimes to be strings - # .. since ODC's own loader does that. - if isinstance(normalised_value, datetime) and isinstance( - value, str - ): - value = ciso8601.parse_datetime(value) - - # Special case for dates, as "no timezone" and "utc timezone" are treated identical. - if isinstance(value, datetime): - value = default_utc(value) - - if not isinstance(value, type(normalised_value)): - yield msg.warning( - "property_type", - f"Value {value} expected to be " - f"{type(normalised_value).__name__!r} (got {type(value).__name__!r})", - ) - elif normalised_value != value: - if _is_nan(normalised_value) and _is_nan(value): - # Both are NaNs, ignore. - pass - else: - yield ValidationMessage.warning( - "property_formatting", - f"Property {value!r} expected to be {normalised_value!r}", - ) - except ValueError as e: - yield msg.error("invalid_property", f"{name!r}: {e.args[0]}") - # else: warning for unknown property? - if "odc:producer" in dataset.properties: - producer = dataset.properties["odc:producer"] - # We use domain name to avoid arguing about naming conventions ('ga' vs 'geoscience-australia' vs ...) - if "." not in producer: - yield msg.warning( - "producer_domain", - "Property 'odc:producer' should be the organisation's domain name. Eg. 
'ga.gov.au'", - ) - - # This field is a little odd, but is expected by the current version of ODC. - # (from discussion with Kirill) - if not dataset.properties.get("odc:file_format"): - yield msg.warning( - "global_file_format", - "Property 'odc:file_format' is empty", - hint="Usually 'GeoTIFF'", - ) - - -def _validate_geo( - dataset: Eo3DatasetDocBase, msg: ContextualMessager, expect_geometry: bool = True -): - # If we're not expecting geometry, and there's no geometry, then there's nothing to see here. - if not expect_geometry and ( - dataset.geometry is None and not dataset.grids and not dataset.crs - ): - yield msg.info("non_geo", "No geo information in dataset") - return - - # Geometry is recommended but not required - if dataset.geometry is None: - if expect_geometry: - yield msg.info( - "incomplete_geo", "Dataset has some geo fields but no geometry" - ) - elif not dataset.geometry.is_valid: - yield msg.error( - "invalid_geometry", - f"Geometry is not a valid shape: {explain_validity(dataset.geometry)!r}", - ) - return - - # CRS required - if not dataset.crs: - yield msg.error("incomplete_crs", "Dataset has some geo fields but no crs") - else: - # We only officially support epsg code (recommended) or wkt. - # TODO Anything supported by odc-geo - yield from _validate_crs(dataset.crs, msg) - - # Grids is validated by schema - but is required - if not dataset.grids: - yield msg.error("incomplete_grids", "Dataset has some geo fields but no grids") - else: - yield from _validate_grids(dataset.grids, dataset.crs, msg) - - return - - -def _validate_crs(crs, msg): - if crs.lower().startswith("epsg:"): - try: - CRS.from_string(crs) - except CRSError as e: - yield msg.error("invalid_crs_epsg", e.args[0]) - - if crs.lower() != crs: - yield msg.warning("mixed_crs_case", "Recommend lowercase 'epsg:' prefix") - else: - wkt_crs = None - try: - wkt_crs = CRS.from_wkt(crs) - except CRSError as e: - yield msg.error( - "invalid_crs", - f"Expect either an epsg code or a WKT string: {e.args[0]}", - ) - - if wkt_crs and wkt_crs.is_epsg_code: - yield msg.warning( - "non_epsg", - f"Prefer an EPSG code to a WKT when possible. (Can change CRS to 'epsg:{wkt_crs.to_epsg()}')", - ) - - -def _validate_grids(grids, default_crs, msg): - for grid_name, grid_def in grids.items(): - sub_msg = msg.sub_msg(grid=grid_name) - if not grid_def.crs: - grid_def.crs = default_crs - else: - yield from _validate_crs(grid_def.crs, sub_msg) +class InvalidDatasetError(Exception): + """ + Raised when a dataset is missing essential things (such as mandatory metadata) + or contains invalid values and so cannot be written. 
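+
+    Typically raised by handle_validation_messages() below when any of the
+    supplied validation messages has level 'error'; for example (illustrative,
+    assuming 'doc' is a dataset document dict):
+
+        handle_validation_messages(validate_ds_to_schema(doc))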
+ """ -def _has_some_geo(dataset: Eo3DatasetDocBase) -> bool: - return dataset.geometry is not None or dataset.grids or dataset.crs +class InvalidDatasetWarning(UserWarning): + """A non-critical warning for invalid or incomplete metadata""" -def _load_doc(url): - return list(load_documents(url)) +def handle_validation_messages(messages: ValidationMessages): + """Capture multiple errors or warning messages and raise them as one""" + warns = [] + errors = [] + for msg in messages: + if msg.level == Level.warning: + warns.append(str(msg)) + if msg.level == Level.error: + errors.append(str(msg)) + if warns: + warnings.warn(InvalidDatasetWarning("\n".join(warns))) + if errors: + raise InvalidDatasetError("\n".join(errors)) diff --git a/eo3/validation_msg.py b/eo3/validation_msg.py index 9c8278c0..3af32d88 100644 --- a/eo3/validation_msg.py +++ b/eo3/validation_msg.py @@ -22,12 +22,8 @@ class ValidationMessage: def __str__(self) -> str: hint = "" if self.hint: - hint = f" (Hint: {self.hint})" - if self.context: - context_str = ",".join(f"{k}: {v}" for k, v in self.context.items()) - return f"{self.code} in [{context_str}]: {self.reason}{hint}" - else: - return f"{self.code}: {self.reason}{hint}" + hint = f"(Hint: {self.hint})" + return f"{self.code}: {self.reason} {hint}" @classmethod def info( @@ -54,7 +50,7 @@ def error( class ContextualMessager: - def __init__(self, context: dict): + def __init__(self, context: dict = {}): self.context = context self.errors = 0 diff --git a/eo3/verify.py b/eo3/verify.py deleted file mode 100644 index ef0aee2c..00000000 --- a/eo3/verify.py +++ /dev/null @@ -1,238 +0,0 @@ -import binascii -import hashlib -import logging -import os -import typing -from distutils import spawn -from pathlib import Path -from urllib.parse import urlparse - -import boto3 - -_LOG = logging.getLogger(__name__) - - -def is_s3_uri(uri): - parsed_uri = urlparse(uri) - return parsed_uri.scheme == "s3" - - -def get_bucket_key(s3_key): - """ - Return bucket name and key from a s3 key - """ - o = urlparse(s3_key) - bucket = o.netloc - key = o.path - # Remove the leading slash from the prefix/key - return bucket, key[1:] - - -def find_exe(name: str): - """ - Find the location of the given executable. - - :return: the absolute path to the executable. - :rtype: str - """ - executable = spawn.find_executable(name) - if not executable: - raise Exception(f"No {name!r} command found.") - - return executable - - -def calculate_file_sha1(filename): - """ - :type filename: str or Path - :rtype: str - """ - return calculate_file_hash(filename, hash_fn=hashlib.sha1) - - -def calculate_file_hash(filename, hash_fn=hashlib.sha1, block_size=4096): - """ - Calculate the hash of the contents of a given file path. - :type filename: str or Path - :param block_size: Number of bytes to read at a time. (for performance: doesn't affect result) - :param hash_fn: hashlib function to use. (typically sha1 or md5) - :return: String of hex characters. 
- :rtype: str - """ - if is_s3_uri(str(filename)): - bucket, key = get_bucket_key(filename) - try: - region_name = os.environ["AWS_DEFAULT_REGION"] - except Exception as exp: - raise ValueError( - "Failed to find AWS_DEFAULT_REGION in the environment variables" - ) from exp - s3client = boto3.client("s3", region_name=region_name) - fileobj = s3client.get_object(Bucket=bucket, Key=key) - f = fileobj["Body"].read() - return calculate_hash(f, hash_fn, block_size) - else: - with Path(filename).open("rb") as f: - return calculate_hash(f, hash_fn, block_size) - - -def calculate_hash(f, hash_fn=hashlib.sha1, block_size=4096): - m = hash_fn() - - while True: - d = f.read(block_size) - if not d: - break - m.update(d) - - return binascii.hexlify(m.digest()).decode("ascii") - - -# 16K seems to be the sweet spot in performance on my machine. -def calculate_file_crc32(filename, block_size=1024 * 16): - """ - Calculate the crc32 of the contents of a given file path. - :type filename: str or Path - :param block_size: Number of bytes to read at a time. (for performance: doesn't affect result) - :return: String of hex characters. - :rtype: str - """ - m = 0 - with Path(filename).open("rb") as f: - while True: - d = f.read(block_size) - if not d: - break - m = binascii.crc32(d, m) - - return f"{m & 0xFFFFFFFF:08x}" - - -class PackageChecksum: - """ - Incrementally build a checksum file for a package. - - (By building incrementally we can better take advantage of filesystem caching) - """ - - def __init__(self): - self._file_hashes = {} - - def add_file(self, file_path): - """ - Add files to the checksum list (recursing into directories.) - :type file_path: Path - :rtype: None - """ - - if is_s3_uri(str(file_path)): - try: - region_name = os.environ["AWS_DEFAULT_REGION"] - except Exception as exp: - raise ValueError( - "Failed to find AWS_DEFAULT_REGION in the environment variables" - ) from exp - - s3client = boto3.client("s3", region_name) - bucket, key = get_bucket_key(file_path) - response_obj = s3client.list_objects_v2(Bucket=bucket, Prefix=key) - objs = [obj["Key"] for obj in response_obj["Contents"]] - if len(objs) > 1: - for file_path in objs: - hash_ = self._checksum("s3://{bucket}/{file_path}") - self._append_hash(file_path, hash_) - else: - hash_ = self._checksum(file_path) - self._append_hash(file_path, hash_) - return - - if file_path.is_dir(): - self.add_files(file_path.iterdir()) - else: - hash_ = self._checksum(file_path) - self._append_hash(file_path, hash_) - - def add(self, fd: typing.IO, name=None): - """ - Add a checksum, reading the data from an open file descriptor. - """ - name = name or fd.name - if not name: - raise ValueError("No usable name for checksummed file descriptor") - - _LOG.info("Checksumming %r", name) - hash_ = calculate_hash(fd) - _LOG.debug("%r -> %r", name, hash_) - self._append_hash(name, hash_) - - def _checksum(self, file_path): - _LOG.info("Checksumming %r", file_path) - hash_ = calculate_file_hash(file_path) - _LOG.debug("%r -> %r", file_path, hash_) - return hash_ - - def _append_hash(self, file_path, hash_): - self._file_hashes[Path(file_path).absolute()] = hash_ - - def add_files(self, file_paths): - for path in file_paths: - self.add_file(path) - - def write(self, output_file: typing.Union[Path, str]): - """ - Write checksums to the given file. 
- :type output_file: Path or str - """ - output_file = Path(output_file) - with output_file.open("wb") as f: - f.writelines( - ( - "{}\t{}\n".format( - str(hash_), str(filename.relative_to(output_file.parent)) - ).encode("utf-8") - for filename, hash_ in sorted(self._file_hashes.items()) - ) - ) - - def read(self, checksum_path): - """ - Read checksum values from the given checksum file - :type checksum_path: Path or str - """ - checksum_path = Path(checksum_path) - with checksum_path.open("r") as f: - for line in f.readlines(): - hash_, path = str(line).strip().split("\t") - self._append_hash( - checksum_path.parent.joinpath(*path.split("/")), hash_ - ) - - def items(self): - return self._file_hashes.items() - - def __len__(self): - return len(self._file_hashes) - - def iteratively_verify(self): - """ - Lazily yield each file and whether it matches the known checksum. - - :rtype: [(Path, bool)] - """ - for path, hash_ in self.items(): - calculated_hash = self._checksum(path) - yield path, calculated_hash == hash_ - - def __bool__(self): - return bool(self._file_hashes) - - def __eq__(self, other): - if isinstance(other, self.__class__): - # pylint 1.6.4 isn't smart enough to know that this is protected access of the same class - # pylint: disable=protected-access - return self._file_hashes == other._file_hashes - - return False - - def __hash__(self) -> int: - return hash(self._file_hashes) diff --git a/requirements/deployment.txt b/requirements/deployment.txt index 921a147d..cdbc1a18 100644 --- a/requirements/deployment.txt +++ b/requirements/deployment.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --extra=deployment --extra=docker --output-file=requirements/deployment.txt @@ -9,12 +9,13 @@ affine==2.3.1 # eo3 (setup.py) # odc-geo # rasterio -attrs==22.1.0 +attrs==23.1.0 # via # cattrs # eo3 (setup.py) # jsonschema # rasterio + # referencing boltons==21.0.0 # via eo3 (setup.py) boto3==1.24.94 @@ -46,9 +47,7 @@ cligj==0.7.2 # via rasterio defusedxml==0.7.1 # via eo3 (setup.py) -exceptiongroup==1.0.0rc9 - # via cattrs -gdal==3.3.2 +gdal==3.6.3 # via eo3 (setup.py) h5py==3.7.0 # via eo3 (setup.py) @@ -56,8 +55,10 @@ jmespath==1.0.1 # via # boto3 # botocore -jsonschema==4.16.0 +jsonschema==4.18.0 # via eo3 (setup.py) +jsonschema-specifications==2023.7.1 + # via jsonschema numpy==1.23.4 # via # eo3 (setup.py) @@ -82,8 +83,6 @@ pyproj==3.4.0 # via # eo3 (setup.py) # odc-geo -pyrsistent==0.18.1 - # via jsonschema pystac==1.7.3 # via eo3 (setup.py) python-dateutil==2.8.2 @@ -97,10 +96,17 @@ pytz==2022.5 # via pandas rasterio==1.3.3 # via eo3 (setup.py) +referencing==0.30.2 + # via + # eo3 (setup.py) + # jsonschema + # jsonschema-specifications +rpds-py==0.10.2 + # via + # jsonschema + # referencing ruamel-yaml==0.17.21 # via eo3 (setup.py) -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml s3transfer==0.6.0 # via boto3 scipy==1.9.3 diff --git a/requirements/setup.txt b/requirements/setup.txt index 3e02c3fc..99d417d8 100644 --- a/requirements/setup.txt +++ b/requirements/setup.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --output-file=requirements/setup.txt --strip-extras @@ -9,12 +9,13 @@ affine==2.4.0 # eo3 (setup.py) # odc-geo # rasterio -attrs==22.1.0 +attrs==23.1.0 # via # cattrs # eo3 (setup.py) # jsonschema # 
rasterio + # referencing boltons==23.0.0 # via eo3 (setup.py) boto3==1.26.129 @@ -46,16 +47,16 @@ cligj==0.7.2 # via rasterio defusedxml==0.7.1 # via eo3 (setup.py) -exceptiongroup==1.1.1 - # via cattrs h5py==3.8.0 # via eo3 (setup.py) jmespath==1.0.1 # via # boto3 # botocore -jsonschema==4.17.3 +jsonschema==4.18.0 # via eo3 (setup.py) +jsonschema-specifications==2023.7.1 + # via jsonschema numpy==1.24.3 # via # eo3 (setup.py) @@ -81,8 +82,6 @@ pyproj==3.4.0 # via # eo3 (setup.py) # odc-geo -pyrsistent==0.19.3 - # via jsonschema pystac==1.7.3 # via eo3 (setup.py) python-dateutil==2.8.2 @@ -96,6 +95,15 @@ pytz==2023.3 # via pandas rasterio==1.3.6 # via eo3 (setup.py) +referencing==0.30.2 + # via + # eo3 (setup.py) + # jsonschema + # jsonschema-specifications +rpds-py==0.10.2 + # via + # jsonschema + # referencing ruamel-yaml==0.17.24 # via eo3 (setup.py) ruamel-yaml-clib==0.2.7 diff --git a/requirements/test.txt b/requirements/test.txt index 0f1dec58..fe9fa0f8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --extra=docker --extra=test --output-file=requirements/test.txt @@ -11,7 +11,7 @@ affine==2.3.1 # rasterio alabaster==0.7.12 # via sphinx -attrs==22.1.0 +attrs==23.1.0 # via # cattrs # eo3 (setup.py) @@ -19,16 +19,20 @@ attrs==22.1.0 # morecantile # pytest # rasterio + # referencing babel==2.10.3 # via sphinx boltons==21.0.0 # via eo3 (setup.py) boto3==1.24.94 - # via eo3 (setup.py) + # via + # eo3 (setup.py) + # moto botocore==1.27.94 # via # boto3 # eo3 (setup.py) + # moto # s3transfer cachetools==5.2.0 # via odc-geo @@ -39,6 +43,8 @@ certifi==2022.12.7 # pyproj # rasterio # requests +cffi==1.15.1 + # via cryptography cftime==1.6.2 # via netcdf4 charset-normalizer==2.1.1 @@ -58,6 +64,10 @@ click-plugins==1.1.1 # via rasterio cligj==0.7.2 # via rasterio +coverage[toml]==7.3.1 + # via pytest-cov +cryptography==41.0.3 + # via moto deepdiff==6.2.1 # via eo3 (setup.py) defusedxml==0.7.1 @@ -66,8 +76,6 @@ docutils==0.17.1 # via # sphinx # sphinx-rtd-theme -exceptiongroup==1.0.0rc9 - # via cattrs flake8==5.0.4 # via pep8-naming gdal==3.6.3 @@ -83,21 +91,29 @@ imagesize==1.4.1 iniconfig==1.1.1 # via pytest jinja2==3.1.2 - # via sphinx + # via + # moto + # sphinx jmespath==1.0.1 # via # boto3 # botocore -jsonschema==4.16.0 +jsonschema==4.18.0 # via eo3 (setup.py) +jsonschema-specifications==2023.7.1 + # via jsonschema markupsafe==2.1.1 - # via jinja2 + # via + # jinja2 + # werkzeug mccabe==0.7.0 # via flake8 mock==4.0.3 # via eo3 (setup.py) morecantile==3.1.2 # via rio-cogeo +moto==4.2.2 + # via eo3 (setup.py) netcdf4==1.6.1 # via eo3 (setup.py) networkx==2.8.7 @@ -143,6 +159,8 @@ py==1.11.0 # via pytest pycodestyle==2.9.1 # via flake8 +pycparser==2.21 + # via cffi pydantic==1.10.2 # via # morecantile @@ -160,15 +178,20 @@ pyproj==3.4.0 # eo3 (setup.py) # morecantile # odc-geo -pyrsistent==0.18.1 - # via jsonschema pystac==1.7.3 # via eo3 (setup.py) pytest==7.1.3 + # via + # eo3 (setup.py) + # pytest-cov +pytest-cov==4.1.0 + # via eo3 (setup.py) +pytest-httpserver==1.0.8 # via eo3 (setup.py) python-dateutil==2.8.2 # via # botocore + # moto # pandas # pystac python-rapidjson==1.9 @@ -179,18 +202,32 @@ pytz==2022.5 # pandas pywavelets==1.4.1 # via scikit-image +pyyaml==6.0.1 + # via responses rasterio==1.3.3 # via # eo3 (setup.py) # rio-cogeo -requests==2.28.1 - # via sphinx +referencing==0.30.2 + # 
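Alongside the lock-file churn, the test requirements now pull in moto (and pytest-httpserver), so S3-touching code paths can be exercised without real AWS credentials. A hedged sketch of the usual moto 4.x pattern; the test name, bucket and key are placeholders rather than fixtures from this repo:

    import boto3
    from moto import mock_s3

    @mock_s3
    def test_reads_object_from_fake_s3():
        # moto intercepts boto3's S3 calls in-process; no credentials or network access needed.
        s3 = boto3.client("s3", region_name="us-east-1")
        s3.create_bucket(Bucket="example-bucket")
        s3.put_object(Bucket="example-bucket", Key="scene/odc-metadata.yaml", Body=b"id: example")
        body = s3.get_object(Bucket="example-bucket", Key="scene/odc-metadata.yaml")["Body"].read()
        assert body == b"id: example"
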
via + # eo3 (setup.py) + # jsonschema + # jsonschema-specifications +requests==2.31.0 + # via + # moto + # responses + # sphinx +responses==0.23.3 + # via moto rio-cogeo==3.4.1 # via eo3 (setup.py) +rpds-py==0.10.2 + # via + # jsonschema + # referencing ruamel-yaml==0.17.21 # via eo3 (setup.py) -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml s3transfer==0.6.0 # via boto3 scikit-image==0.19.3 @@ -237,14 +274,23 @@ tomli==2.0.1 # via pytest toolz==0.12.0 # via eo3 (setup.py) +types-pyyaml==6.0.12.11 + # via responses typing-extensions==4.4.0 # via pydantic urllib3==1.26.12 # via # botocore # requests + # responses +werkzeug==2.3.7 + # via + # moto + # pytest-httpserver xarray==2022.10.0 # via eo3 (setup.py) +xmltodict==0.13.0 + # via moto # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/setup.py b/setup.py index f8e9ee37..6375a879 100755 --- a/setup.py +++ b/setup.py @@ -19,9 +19,11 @@ "pep8-naming", "pytest", "pytest-cov", + "pytest-httpserver", "rio_cogeo", "sphinx-autodoc-typehints", "sphinx_rtd_theme", + "moto", ] EXTRAS_REQUIRE = { @@ -78,7 +80,8 @@ "click", "defusedxml", "h5py", - "jsonschema>=3", # We want a Draft6Validator + "jsonschema==4.18.0", # We want a Draft7Validator, but 4.18.0 is the only version that works + "referencing", "numpy>=1.15.4", "odc-geo", "pyproj", diff --git a/test_file.txt b/test_file.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/common.py b/tests/common.py index 1897666a..52088e70 100644 --- a/tests/common.py +++ b/tests/common.py @@ -1,127 +1,14 @@ import operator -from pathlib import Path -from textwrap import indent -from typing import Dict, Iterable, Mapping, Sequence, Union +from typing import Dict, Iterable, Mapping, Sequence -import pytest import rapidjson from click.testing import CliRunner, Result from deepdiff import DeepDiff from deepdiff.model import DiffLevel -from ruamel import yaml -from shapely.geometry import shape -from shapely.geometry.base import BaseGeometry -from eo3 import Eo3DatasetDocBase, serialise from eo3.validation_msg import Level, ValidationMessage, ValidationMessages -def check_prepare_outputs( - invoke_script, - run_args, - expected_doc: Dict, - expected_metadata_path: Path, - ignore_fields=(), -): - """Call a prepare script and check for an expected output document.""" - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - res = run_prepare_cli(invoke_script, *run_args) - - try: - assert_expected_eo3_path(expected_doc, expected_metadata_path, ignore_fields) - except AssertionError: - print(f'Output:\n{indent(res.output, " ")}') - raise - - -def assert_expected_eo3_path( - expected_doc: Dict, - expected_path: Path, - ignore_fields=(), -): - """ - Check an output path of an EO3 dataset matches an expected document. - - This is slightly smarter about doing geometry equality etc within the document. - """ - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - assert ( - expected_path.exists() - ), f"Expected output EO3 path doesn't exist: {expected_path}" - assert_same_as_file( - expected_doc, - expected_path, - # We check the geometry below - ignore_fields=("geometry",) + tuple(ignore_fields), - ) - - if "geometry" not in ignore_fields: - # Compare geometry after parsing, rather than comparing the raw dict values. 
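The setup.py change above pins jsonschema==4.18.0 and adds referencing: 4.18 is the release in which jsonschema replaced its pyrsistent-based internals and RefResolver with the referencing/rpds-py stack, which is why pyrsistent drops out of every lock file in this diff. A minimal Draft-7 validation sketch against that version; the schema and document below are toy placeholders, not the real eo3 dataset schema:

    from jsonschema import Draft7Validator

    schema = {
        "type": "object",
        "required": ["$schema", "id"],
        "properties": {"id": {"type": "string"}},
    }
    Draft7Validator.check_schema(schema)  # raises SchemaError if the schema itself is malformed

    doc = {"$schema": "https://schemas.opendatacube.org/dataset", "id": "example-id"}
    for error in Draft7Validator(schema).iter_errors(doc):
        print(f"{list(error.path)}: {error.message}")
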
- produced_dataset = serialise.from_path(expected_path) - expected_dataset = serialise.from_doc(expected_doc, skip_validation=True) - if expected_dataset.geometry is None: - assert produced_dataset.geometry is None, ( - f"Expected a null geometry, " - f"but output included one: {produced_dataset.geometry.__geo_interface__!r}" - ) - else: - assert_shapes_mostly_equal( - produced_dataset.geometry, - expected_dataset.geometry, - # Typically meters -- this is easily good enough accuracy. - 0.0001, - ) - - -def assert_expected_eo3( - expected_doc: Eo3DatasetDocBase, - given_doc: Eo3DatasetDocBase, - *, - ignore_fields=(), -): - """ - Do the two DatasetDocs match? - - (Unlike equality, gives reasonable error message of differences, and - compares geometry more intelligently.) - """ - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - if expected_doc.geometry is None: - assert given_doc.geometry is None, "Expected no geometry" - else: - assert_shapes_mostly_equal( - given_doc.geometry, expected_doc.geometry, 0.00000001 - ) - e = serialise.to_doc(expected_doc) - g = serialise.to_doc(given_doc) - for f in ("geometry",) + ignore_fields: - e.pop(f) - g.pop(f) - assert_same(g, e) - - -def assert_shapes_mostly_equal( - shape1: Union[BaseGeometry, dict], - shape2: Union[BaseGeometry, dict], - threshold: float, -): - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - - if isinstance(shape1, dict): - shape1 = shape(shape1) - if isinstance(shape2, dict): - shape2 = shape(shape2) - - # Check area first, as it's a nicer error message when they're wildly different. - assert shape1.area == pytest.approx( - shape2.area, abs=threshold - ), f"Shapes have different areas: {shape1.area} != {shape2.area}" - - s1 = shape1.simplify(tolerance=threshold) - s2 = shape2.simplify(tolerance=threshold) - assert (s1 - s2).area < threshold, f"{s1} is not mostly equal to {s2}" - - def assert_same(expected_doc: Dict, generated_doc: Dict): """ Assert two documents are the same, ignoring trivial float differences @@ -131,27 +18,6 @@ def assert_same(expected_doc: Dict, generated_doc: Dict): assert doc_diffs == {}, "\n".join(format_doc_diffs(expected_doc, generated_doc)) -def assert_same_as_file(expected_doc: Dict, generated_file: Path, ignore_fields=()): - """Assert a file contains the given document content (after normalisation etc)""" - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - - assert generated_file.exists(), f"Expected file to exist {generated_file.name}" - - with generated_file.open("r") as f: - generated_doc = yaml.YAML(typ="safe").load(f) - - expected_doc = dict(expected_doc) - for field in ignore_fields: - if field in generated_doc: - del generated_doc[field] - if field in expected_doc: - del expected_doc[field] - - expected_doc = dump_roundtrip(expected_doc) - generated_doc = dump_roundtrip(generated_doc) - assert_same(generated_doc, expected_doc) - - def run_prepare_cli(invoke_script, *args, expect_success=True) -> Result: """Run the prepare script as a command-line command""" __tracebackhide__ = True diff --git a/tests/data/multi_doc.nc b/tests/data/multi_doc.nc new file mode 100644 index 00000000..ea38eb32 Binary files /dev/null and b/tests/data/multi_doc.nc differ diff --git a/tests/data/multi_doc.yml b/tests/data/multi_doc.yml new file mode 100644 index 00000000..53cd5bb9 --- /dev/null +++ b/tests/data/multi_doc.yml @@ -0,0 +1,18 @@ +description: Document 1 of 3 +data: + number: 1 + list: [1,2,3] + +--- + +description: Document 2 
of 3 +data: + number: 2 + list: [1,2,3] + +--- + +description: Document 3 of 3 +data: + number: 3 + list: [1,2,3] diff --git a/tests/data/multi_doc.yml.gz b/tests/data/multi_doc.yml.gz new file mode 100644 index 00000000..f28be8be Binary files /dev/null and b/tests/data/multi_doc.yml.gz differ diff --git a/tests/data/sample.json b/tests/data/sample.json new file mode 100644 index 00000000..afdfa7ff --- /dev/null +++ b/tests/data/sample.json @@ -0,0 +1,11 @@ +{ + "description": "File containing json document", + "data": { + "number": 1, + "list": [ + 1, + 2, + 3 + ] + } +} diff --git a/tests/data/single_doc.yaml b/tests/data/single_doc.yaml new file mode 100644 index 00000000..129776d4 --- /dev/null +++ b/tests/data/single_doc.yaml @@ -0,0 +1,4 @@ +description: File containing single yaml document +data: + number: 1 + list: [1,2,3] diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 73b0c3b3..7ce956a9 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -2,12 +2,12 @@ import shutil from datetime import datetime from pathlib import Path -from typing import Callable, Dict +from typing import Dict import pytest from eo3 import serialise -from eo3.model import Eo3DatasetDocBase +from eo3.model import DatasetMetadata # from eo3.prepare.landsat_l1_prepare import normalise_nci_symlinks @@ -50,18 +50,6 @@ WOFS_PATH: Path = Path(__file__).parent / "data" / "wofs" -# def path_offset(base: Path, offset: str): -# return str(normalise_nci_symlinks(base.absolute().joinpath(offset))) - - -# def tar_offset(tar: Path, offset: str): -# return "tar:" + str(normalise_nci_symlinks(tar.absolute())) + "!" + offset - - -def relative_offset(base, offset): - return offset - - @pytest.fixture def sentinel1_eo3() -> Path: with open(S1_EO3_PATH) as f: @@ -79,22 +67,12 @@ def l1_c2_ls8_folder(tmp_path: Path) -> Path: @pytest.fixture -def l1_ls8_metadata_path( - l1_ls8_folder: Path, l1_ls8_dataset: Eo3DatasetDocBase -) -> Path: +def l1_ls8_metadata_path(l1_ls8_folder: Path, l1_ls8_dataset: DatasetMetadata) -> Path: path = l1_ls8_folder / f"{l1_ls8_dataset.label}.odc-metadata.yaml" serialise.to_path(path, l1_ls8_dataset) return path -@pytest.fixture -def l1_ls8_dataset_path(l1_ls8_folder: Path, l1_ls8_metadata_path: Path) -> Path: - """ - A prepared L1 dataset with an EO3 metadata file. 
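The new multi_doc.yml fixture above (together with its .gz and .nc siblings) exercises reading several YAML documents from a single file. A minimal sketch of doing that with ruamel.yaml, which is already a dependency; load_all_documents is an illustrative helper and not necessarily how eo3's own reader is implemented:

    from ruamel.yaml import YAML

    def load_all_documents(path):
        # Yield every document in a (possibly multi-document) YAML file.
        yaml = YAML(typ="safe")
        with open(path) as f:
            yield from yaml.load_all(f)

    docs = list(load_all_documents("tests/data/multi_doc.yml"))
    assert [d["data"]["number"] for d in docs] == [1, 2, 3]
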
- """ - return l1_ls8_folder - - @pytest.fixture def l1_ls7_tarball(tmp_path: Path) -> Path: return _make_copy(L71GT_TARBALL_PATH, tmp_path) @@ -120,30 +98,13 @@ def _make_copy(input_path, tmp_path): @pytest.fixture -def l1_ls8_dataset(l1_ls8_folder_md_expected: Dict) -> Eo3DatasetDocBase: - return serialise.from_doc(l1_ls8_folder_md_expected) +def l1_ls8_dataset(l1_ls8_folder_md_expected: Dict) -> DatasetMetadata: + return DatasetMetadata(l1_ls8_folder_md_expected) @pytest.fixture def l1_ls8_folder_md_expected(l1_ls8_folder) -> Dict: - return expected_l1_ls8_folder(l1_ls8_folder, relative_offset) - - -@pytest.fixture -def l1_ls8_ga_expected(l1_ls8_folder) -> Dict: - return expected_l1_ls8_folder( - l1_ls8_folder, - relative_offset, - organisation="ga.gov.au", - collection="3", - # the id in the ls8_telemetry_path fixture - lineage={"satellite_telemetry_data": ["30841328-89c2-4693-8802-a3560a6cf67a"]}, - ) - - -# @pytest.fixture -# def l1_ls8_folder_md_expected_absolute(l1_ls8_folder) -> Dict: -# return expected_l1_ls8_folder(l1_ls8_folder, path_offset) + return expected_l1_ls8_folder(l1_ls8_folder) @pytest.fixture @@ -174,7 +135,6 @@ def example_metadata( def expected_l1_ls8_folder( l1_ls8_folder: Path, - offset: Callable[[Path, str], str] = relative_offset, organisation="usgs.gov", collection="1", l1_collection="1", @@ -187,26 +147,15 @@ def expected_l1_ls8_folder( """ org_code = organisation.split(".")[0] product_name = f"{org_code}_ls8c_level1_{collection}" - if collection == "2": - processing_datetime = datetime(2020, 9, 7, 19, 30, 5) - cloud_cover = 93.28 - points_model = 125 - points_version = 5 - rmse_model_x = 4.525 - rmse_model_y = 5.917 - software_version = "LPGS_15.3.1c" - uuid = "d9221c40-24c3-5356-ab22-4dcac2bf2d70" - quality_tag = "QA_PIXEL" - else: - processing_datetime = datetime(2017, 4, 5, 11, 17, 36) - cloud_cover = 93.22 - points_model = 66 - points_version = 4 - rmse_model_x = 4.593 - rmse_model_y = 5.817 - software_version = "LPGS_2.7.0" - uuid = "a780754e-a884-58a7-9ac0-df518a67f59d" - quality_tag = "BQA" + processing_datetime = datetime(2017, 4, 5, 11, 17, 36) + cloud_cover = 93.22 + points_model = 66 + points_version = 4 + rmse_model_x = 4.593 + rmse_model_y = 5.817 + software_version = "LPGS_2.7.0" + uuid = "a780754e-a884-58a7-9ac0-df518a67f59d" + quality_tag = "BQA" processing_date = processing_datetime.strftime("%Y%m%d") return { "$schema": "https://schemas.opendatacube.org/dataset", @@ -218,6 +167,7 @@ def expected_l1_ls8_folder( }, "properties": { "datetime": datetime(2016, 1, 21, 23, 50, 23, 54435), + "dea:dataset_maturity": "final", # The minor version comes from the processing date (as used in filenames to distinguish reprocesses). 
"odc:dataset_version": f"{collection}.0.{processing_date}", "odc:file_format": "GeoTIFF", @@ -300,77 +250,41 @@ def expected_l1_ls8_folder( }, "measurements": { "coastal_aerosol": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B1.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B1.TIF" }, "blue": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B2.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B2.TIF" }, "green": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B3.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B3.TIF" }, "red": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B4.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B4.TIF" }, "nir": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B5.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B5.TIF" }, "swir_1": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B6.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B6.TIF" }, "swir_2": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B7.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B7.TIF" }, "panchromatic": { "grid": "panchromatic", - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B8.TIF", - ), + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B8.TIF", }, "cirrus": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B9.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B9.TIF" }, "lwir_1": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B10.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B10.TIF" }, "lwir_2": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B11.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B11.TIF" }, "quality": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_{quality_tag}.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_{quality_tag}.TIF" }, }, "accessories": { @@ -384,7 +298,7 @@ def expected_l1_ls8_folder( @pytest.fixture def l1_ls7_tarball_md_expected( - l1_ls7_tarball, offset: Callable[[Path, str], str] = relative_offset + l1_ls7_tarball, ) -> Dict: return { "$schema": "https://schemas.opendatacube.org/dataset", @@ -490,47 +404,17 @@ def l1_ls7_tarball_md_expected( }, }, "measurements": { - "blue": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B1.TIF" - ) - }, - "green": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B2.TIF" - ) - }, - "nir": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B4.TIF" - ) - }, - "quality": { - "path": offset( - l1_ls7_tarball, 
"LE07_L1TP_104078_20130429_20161124_01_T1_BQA.TIF" - ) - }, - "red": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B3.TIF" - ) - }, - "swir_1": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B5.TIF" - ) - }, - "swir_2": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B7.TIF" - ) - }, + "blue": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B1.TIF"}, + "green": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B2.TIF"}, + "nir": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B4.TIF"}, + "quality": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_BQA.TIF"}, + "red": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B3.TIF"}, + "swir_1": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B5.TIF"}, + "swir_2": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B7.TIF"}, "tir_1": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B6_VCID_1.TIF"}, "tir_2": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B6_VCID_2.TIF"}, "panchromatic": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B8.TIF" - ), + "path": "LE07_L1TP_104078_20130429_20161124_01_T1_B8.TIF", "grid": "panchromatic", }, }, @@ -545,7 +429,7 @@ def l1_ls7_tarball_md_expected( @pytest.fixture def l1_ls5_tarball_md_expected( - l1_ls5_tarball, offset: Callable[[Path, str], str] = relative_offset + l1_ls5_tarball, # offset: Callable[[Path, str], str] = relative_offset ) -> Dict: return { "$schema": "https://schemas.opendatacube.org/dataset", @@ -623,46 +507,14 @@ def l1_ls5_tarball_md_expected( } }, "measurements": { - "blue": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B1.TIF" - ) - }, - "green": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B2.TIF" - ) - }, - "red": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B3.TIF" - ) - }, - "nir": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B4.TIF" - ) - }, - "swir_1": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B5.TIF" - ) - }, - "swir_2": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B7.TIF" - ) - }, - "tir": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B6.TIF" - ) - }, - "quality": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_BQA.TIF" - ) - }, + "blue": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B1.TIF"}, + "green": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B2.TIF"}, + "red": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B3.TIF"}, + "nir": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B4.TIF"}, + "swir_1": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B5.TIF"}, + "swir_2": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B7.TIF"}, + "tir": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B6.TIF"}, + "quality": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_BQA.TIF"}, }, "accessories": { "metadata:landsat_mtl": { @@ -708,7 +560,27 @@ def metadata_type(): ["properties", "dtr:end_datetime"], ["properties", "datetime"], ], - } + }, + "lat": { + "description": "Latitude range", + "type": "double-range", + "min_offset": [ + ["extent", "lat", "begin"], + ], + "max_offset": [ + ["extent", "lat", "end"], + ], + }, + "lon": { + "description": "Longitude range", + "type": "double-range", + "min_offset": [ + ["extent", "lon", "begin"], + ], + 
"max_offset": [ + ["extent", "lon", "end"], + ], + }, }, }, } @@ -752,7 +624,6 @@ def eo3_product(): "metadata_type": "eo3_landsat_l1", "license": "CC-BY-4.0", "metadata": { - # "product": {"name": "usgs_ls8c_level1_1"}, DEPRECATED "properties": { "eo:platform": "landsat-8", "eo:instrument": "OLI_TIRS", diff --git a/tests/integration/test_image.py b/tests/integration/test_image.py deleted file mode 100644 index 9a15d329..00000000 --- a/tests/integration/test_image.py +++ /dev/null @@ -1,160 +0,0 @@ -import numpy as np - -from eo3 import images - - -def test_rescale_intensity(): - # Example was generated via: - # scipy.ndimage.rotate(np.arange(1000, 8000, 100).reshape((7,10)), 45, cval=-99) - - # (Using a variable so the array is more spaced-out & readable) - nada = -999 - original_image = np.array( - [ - [nada, nada, nada, nada, nada, nada, nada, nada, nada, nada, nada, nada], - [nada, nada, nada, nada, nada, nada, 1852, 2730, nada, nada, nada, nada], - [nada, nada, nada, nada, nada, 1711, 2570, 3428, 4169, nada, nada, nada], - [nada, nada, nada, nada, 1568, 2432, 3287, 4009, 4805, 5610, nada, nada], - [nada, nada, nada, 1427, 2291, 3144, 3871, 4663, 5451, 6181, 7049, nada], - [nada, nada, 1284, 2149, 3003, 3729, 4521, 5312, 6040, 6889, 7757, nada], - [nada, 1143, 2011, 2860, 3588, 4379, 5171, 5897, 6751, 7616, nada, nada], - [nada, 1851, 2719, 3449, 4237, 5029, 5756, 6609, 7473, nada, nada, nada], - [nada, nada, 3290, 4095, 4891, 5613, 6468, 7332, nada, nada, nada, nada], - [nada, nada, nada, 4731, 5472, 6330, 7189, nada, nada, nada, nada, nada], - [nada, nada, nada, nada, 6170, 7048, nada, nada, nada, nada, nada, nada], - [nada, nada, nada, nada, nada, nada, nada, nada, nada, nada, nada, nada], - ] - ) - unmodified = original_image.copy() - - assert np.array_equal( - original_image, unmodified - ), "rescale_intensity modified the input image" - - staticly_rescaled = images.rescale_intensity( - original_image, in_range=(4000, 6000), out_range=(100, 255), image_nodata=-999 - ) - print("Statically rescaled result: ") - print(repr(staticly_rescaled)) - - # - Note that the nodata values are not scaled (a previous bug!) - # they're translated to the output nodata value (0). - # - Note how many will be clipped to the min (100) without falling into nodata. - non = 0 - expected_static_rescale = np.array( - [ - [non, non, non, non, non, non, non, non, non, non, non, non], - [non, non, non, non, non, non, 100, 100, non, non, non, non], - [non, non, non, non, non, 100, 100, 100, 113, non, non, non], - [non, non, non, non, 100, 100, 100, 100, 162, 224, non, non], - [non, non, non, 100, 100, 100, 100, 151, 212, 255, 255, non], - [non, non, 100, 100, 100, 100, 140, 201, 255, 255, 255, non], - [non, 100, 100, 100, 100, 129, 190, 247, 255, 255, non, non], - [non, 100, 100, 100, 118, 179, 236, 255, 255, non, non, non], - [non, non, 100, 107, 169, 225, 255, 255, non, non, non, non], - [non, non, non, 156, 214, 255, 255, non, non, non, non, non], - [non, non, non, non, 255, 255, non, non, non, non, non, non], - [non, non, non, non, non, non, non, non, non, non, non, non], - ], - dtype=np.uint8, - ) - assert np.array_equal(staticly_rescaled, expected_static_rescale) - - -def test_calc_range(): - # Test that the correct value range and valid data arrays are calculated. 
- - # Test arrays generated via: - # >>> scipy.ndimage.rotate(np.arange(10, 70, 1).reshape((6, 10)), 55, cval=-11) - # >>> scipy.ndimage.rotate(np.arange(20, 80, 1).reshape((6, 10)), 50, cval=-11) - # >>> scipy.ndimage.rotate(np.arange(30, 90, 1).reshape((6, 10)), 55, cval=-11) - - # They have: - # - slightly different values to test the highest/lowest value range calculation - # (it should be across all bands) - # - And slightly different rotation to test the combined valid_data mask. - - no = -11 - r_array = np.array( - [ - [no, no, no, no, no, no, no, no, no, no, no], - [no, no, no, no, no, no, 25, no, no, no, no], - [no, no, no, no, no, 21, 31, 40, no, no, no], - [no, no, no, no, 17, 27, 36, 45, 53, 64, no], - [no, no, no, 15, 23, 32, 41, 49, 59, 68, no], - [no, no, no, 18, 29, 37, 46, 54, 65, no, no], - [no, no, 14, 25, 33, 42, 50, 61, no, no, no], - [no, 11, 20, 30, 38, 47, 56, 64, no, no, no], - [no, 15, 26, 34, 43, 52, 62, no, no, no, no], - [no, no, no, 39, 48, 58, no, no, no, no, no], - [no, no, no, no, 54, no, no, no, no, no, no], - [no, no, no, no, no, no, no, no, no, no, no], - ] - ) - g_array = np.array( - [ - [no, no, no, no, no, no, no, no, no, no, no], - [no, no, no, no, no, no, 31, no, no, no, no], - [no, no, no, no, no, 28, 38, 47, no, no, no], - [no, no, no, no, 26, 35, 44, 52, 60, 68, no], - [no, no, no, 24, 32, 41, 49, 58, 66, 76, no], - [no, no, no, 29, 39, 47, 55, 63, 73, no, no], - [no, no, 26, 36, 44, 52, 60, 70, no, no, no], - [no, 23, 33, 41, 50, 58, 67, 75, no, no, no], - [no, 31, 39, 47, 55, 64, 73, no, no, no, no], - [no, no, no, 52, 61, 71, no, no, no, no, no], - [no, no, no, no, 68, no, no, no, no, no, no], - [no, no, no, no, no, no, no, no, no, no, no], - ] - ) - b_array = np.array( - [ - [no, no, no, no, no, no, no, no, no, no, no], - [no, no, no, no, no, no, 45, no, no, no, no], - [no, no, no, no, no, 41, 51, 60, no, no, no], - [no, no, no, no, 37, 47, 56, 65, 73, 84, no], - [no, no, no, 35, 43, 52, 61, 69, 79, 88, no], - [no, no, no, 38, 49, 57, 66, 74, 85, no, no], - [no, no, 34, 45, 53, 62, 70, 81, no, no, no], - [no, 31, 40, 50, 58, 67, 76, 84, no, no, no], - [no, 35, 46, 54, 63, 72, 82, no, no, no, no], - [no, no, no, 59, 68, 78, no, no, no, no, no], - [no, no, no, no, 74, no, no, no, no, no, no], - [no, no, no, no, no, no, no, no, no, no, no], - ] - ) - - mask = np.ones(r_array.shape, dtype=np.bool_) - calculated_range = images.read_valid_mask_and_value_range( - mask, - ((r_array, no), (g_array, no), (b_array, no)), - calculate_percentiles=(2, 98), - ) - - expected_combined_mask = np.array( - [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0], - [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0], - [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0], - [0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0], - [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0], - [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], - [0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - ], - dtype=bool, - ) - - assert np.array_equal(expected_combined_mask, mask), ( - f"Combined mask isn't as expected. 
" - f"Diff: {repr(np.logical_xor(expected_combined_mask, mask))}" - ) - - assert calculated_range == ( - 34, - 65, - ), f"Unexpected 2/98 percentile values: {calculated_range}" diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py new file mode 100644 index 00000000..795febc8 --- /dev/null +++ b/tests/integration/test_model.py @@ -0,0 +1,137 @@ +from datetime import datetime +from textwrap import dedent +from typing import Dict + +import pytest +import toolz + +from eo3.fields import Range +from eo3.model import DatasetMetadata +from eo3.utils import default_utc +from eo3.validate import InvalidDatasetError + + +def test_get_and_set(l1_ls8_folder_md_expected: Dict, metadata_type): + """Test that we are able to access and set fields correctly""" + ds = DatasetMetadata( + raw_dict=l1_ls8_folder_md_expected, mdt_definition=metadata_type + ) + # get + with pytest.raises(AttributeError, match="Unknown field 'foobar'"): + ds.foobar + assert ds.id == "a780754e-a884-58a7-9ac0-df518a67f59d" + assert ds.format == "GeoTIFF" + # set + with pytest.raises(AttributeError, match="Unknown field offset"): + ds.foo = "bar" + ds.format = "GeoTIFFF" + assert ds.format == "GeoTIFFF" + # set range + with pytest.raises(TypeError, match="expects a Range value"): + ds.lat = 0.0 + # time can be a range or a single value + dt = datetime(2020, 1, 1, 23, 59, 59) + ds.time = dt + assert ds.time == Range(default_utc(dt), default_utc(dt)) + dt_end = datetime(2020, 1, 2, 23, 59, 59) + ds.time = Range(dt, dt_end) + assert ds.time == Range(default_utc(dt), default_utc(dt_end)) + + +def test_update_metadata_type(l1_ls8_folder_md_expected: Dict, metadata_type): + """ + Test that updating the metadata type definition gives us access to custom fields + included in the new definition + """ + ds = DatasetMetadata( + raw_dict=l1_ls8_folder_md_expected, mdt_definition=metadata_type + ) + with pytest.raises(AttributeError): + ds.instrument + new_metadata_type = toolz.assoc_in( + metadata_type, + ["dataset", "search_fields", "instrument"], + { + "offset": ["properties", "eo:instrument"], + "description": "Instrument name", + }, + ) + ds.metadata_type = new_metadata_type + assert ds.instrument == "OLI_TIRS" + + +def test_additional_metadata_access(l1_ls8_folder_md_expected: Dict, metadata_type): + """Check that we are able to access metadata not defined in the metadata type""" + ds = DatasetMetadata( + raw_dict=l1_ls8_folder_md_expected, mdt_definition=metadata_type + ) + assert ds.crs.epsg == 32655 + assert ds.product.name == "usgs_ls8c_level1_1" + assert "coastal_aerosol" in ds.measurements + assert "metadata:landsat_mtl" in ds.accessories + assert ds.locations is None + + +def test_bad_crs(example_metadata: Dict): + """CRS should be valid, and is preferred in epsg form if possible""" + # Invalid crs + example_metadata["crs"] = "123456" + with pytest.raises(InvalidDatasetError, match="invalid_crs"): + DatasetMetadata(example_metadata) + # Missing crs + del example_metadata["crs"] + with pytest.raises(InvalidDatasetError, match="incomplete_geometry"): + DatasetMetadata(example_metadata) + + # A CRS should be in epsg form if an EPSG exists, not WKT + example_metadata["crs"] = dedent( + """PROJCS["WGS 84 / UTM zone 55N", + GEOGCS["WGS 84", + DATUM["WGS_1984", + SPHEROID["WGS 84",6378137,298.257223563, + AUTHORITY["EPSG","7030"]], + AUTHORITY["EPSG","6326"]], + PRIMEM["Greenwich",0, + AUTHORITY["EPSG","8901"]], + UNIT["degree",0.01745329251994328, + AUTHORITY["EPSG","9122"]], + AUTHORITY["EPSG","4326"]], + 
UNIT["metre",1, + AUTHORITY["EPSG","9001"]], + PROJECTION["Transverse_Mercator"], + PARAMETER["latitude_of_origin",0], + PARAMETER["central_meridian",147], + PARAMETER["scale_factor",0.9996], + PARAMETER["false_easting",500000], + PARAMETER["false_northing",0], + AUTHORITY["EPSG","32655"], + AXIS["Easting",EAST], + AXIS["Northing",NORTH]] + """ + ) + with pytest.warns(UserWarning, match="change CRS to 'epsg:32655'"): + DatasetMetadata(example_metadata) + + +def test_extent(l1_ls8_folder_md_expected: Dict): + # Core TODO: copied from tests.test_eo3 + """Check that extent is properly calculated""" + ds = DatasetMetadata(l1_ls8_folder_md_expected) + assert ds.extent is not None + assert ds.extent.crs.epsg == 32655 + + del l1_ls8_folder_md_expected["geometry"] + doc = dict(**l1_ls8_folder_md_expected, geometry=ds.extent.buffer(-1).json) + + ds2 = DatasetMetadata(doc) + assert ds.extent.contains(ds2.extent) + + +def test_warn_location_deprecated( + l1_ls8_folder_md_expected: Dict, +): + """Warn if dataset includes deprecated 'location' field""" + l1_ls8_folder_md_expected["location"] = "file:///path/to" + ds = DatasetMetadata(l1_ls8_folder_md_expected) + with pytest.warns(UserWarning, match="`location` is deprecated"): + assert ds.locations == ["file:///path/to"] diff --git a/tests/integration/test_product_validate.py b/tests/integration/test_product_validate.py index 51e1fd70..57e594b9 100644 --- a/tests/integration/test_product_validate.py +++ b/tests/integration/test_product_validate.py @@ -1,4 +1,3 @@ -from pathlib import Path from typing import Dict from eo3.product.validate import validate_product @@ -49,7 +48,7 @@ def test_managed_deprecation(product: Dict, metadata_type: Dict): assert "ingested_product" in msgs.warning_text() -def test_warn_bad_product_license(l1_ls8_metadata_path: Path, product: Dict): +def test_warn_bad_product_license(product: Dict): # Missing license is a warning. del product["license"] msgs = MessageCatcher(validate_product(product)) diff --git a/tests/integration/test_serialise.py b/tests/integration/test_serialise.py deleted file mode 100644 index 9c2480f9..00000000 --- a/tests/integration/test_serialise.py +++ /dev/null @@ -1,63 +0,0 @@ -from pathlib import Path -from typing import Dict - -import ciso8601 - -from eo3 import serialise -from eo3.utils import default_utc - -from tests.common import dump_roundtrip - - -def test_stac_to_eo3_serialise(sentinel1_eo3): - assert_unchanged_after_roundstrip(sentinel1_eo3) - - -def test_valid_document_works(example_metadata: Dict): - assert_unchanged_after_roundstrip(example_metadata) - - -def assert_unchanged_after_roundstrip(doc: Dict): - generated_doc = dump_roundtrip(doc) - - # Do a serialisation roundtrip and check that it's still identical. - reserialised_doc = dump_roundtrip( - serialise.to_doc(serialise.from_doc(generated_doc)) - ) - - # One allowed difference: input dates can be many string formats, - # but we normalise them with timezone (UTC default) - _normalise_datetime_props(generated_doc) - - assert serialise.from_doc(generated_doc) == serialise.from_doc(reserialised_doc) - - -def _normalise_datetime_props(generated_doc): - properties = generated_doc.get("properties", {}) - for key in properties: - if "datetime" in key: - # If string value, make it explicitly iso format with timezone. 
- val = properties[key] - if isinstance(val, str): - properties[key] = default_utc(ciso8601.parse_datetime(val)).isoformat() - - -def test_location_serialisation(l1_ls8_folder_md_expected: Dict): - l1_ls8_folder_md_expected["location"] = "s3://test/url/metadata.txt" - assert_unchanged_after_roundstrip(l1_ls8_folder_md_expected) - - -def test_location_single_serialisation(tmp_path: Path, l1_ls8_folder_md_expected: Dict): - # Always serialises a single location as 'location' - location = "https://some/test/path" - - # Given multiple - l1_ls8_folder_md_expected["locations"] = [location] - - reserialised_doc = dump_roundtrip( - serialise.to_doc(serialise.from_doc(l1_ls8_folder_md_expected)) - ) - - # We get singular - assert reserialised_doc["location"] == location - assert "locations" not in reserialised_doc diff --git a/tests/integration/test_thumbnail.py b/tests/integration/test_thumbnail.py deleted file mode 100644 index 3aceb37d..00000000 --- a/tests/integration/test_thumbnail.py +++ /dev/null @@ -1,92 +0,0 @@ -import tempfile -from pathlib import Path - -import rasterio - -from eo3.images import FileWrite, GridSpec - -from . import assert_image - - -def test_thumbnail_bitflag(input_uint8_tif: Path): - writer = FileWrite() - - outfile = Path(tempfile.gettempdir()) / "test-bitflag.jpg" - - water = 128 - - writer.create_thumbnail_singleband(input_uint8_tif, Path(outfile), bit=water) - - assert_image(outfile, bands=3) - - -def test_thumbnail_lookuptable(input_uint8_tif_2: Path): - writer = FileWrite() - - outfile = Path(tempfile.gettempdir()) / "test-lookuptable.jpg" - - wofs_lookup = { - 0: [150, 150, 110], # dry - 1: [255, 255, 255], # nodata, - 16: [119, 104, 87], # terrain - 32: [89, 88, 86], # cloud_shadow - 64: [216, 215, 214], # cloud - 80: [242, 220, 180], # cloudy terrain - 128: [79, 129, 189], # water - 160: [51, 82, 119], # shady water - 192: [186, 211, 242], # cloudy water - } - - writer.create_thumbnail_singleband( - input_uint8_tif_2, Path(outfile), lookup_table=wofs_lookup - ) - - assert_image(outfile, bands=3) - - -def test_thumbnail_from_numpy_bitflag(input_uint8_tif: Path): - writer = FileWrite() - outfile = Path(tempfile.gettempdir()) / "test-bitflag.jpg" - water = 128 - - with rasterio.open(input_uint8_tif) as ds: - input_geobox = GridSpec.from_rio(ds) - data = ds.read(1) - - image_bytes = writer.create_thumbnail_singleband_from_numpy( - input_data=data, input_geobox=input_geobox, bit=water - ) - - with open(outfile, "wb") as jpeg_file: - jpeg_file.write(image_bytes) - - assert_image(outfile, bands=3) - - -def test_thumbnail_from_numpy_lookuptable(input_uint8_tif_2: Path): - writer = FileWrite() - outfile = Path(tempfile.gettempdir()) / "test-lookuptable.jpg" - wofs_lookup = { - 0: [150, 150, 110], # dry - 1: [255, 255, 255], # nodata, - 16: [119, 104, 87], # terrain - 32: [89, 88, 86], # cloud_shadow - 64: [216, 215, 214], # cloud - 80: [242, 220, 180], # cloudy terrain - 128: [79, 129, 189], # water - 160: [51, 82, 119], # shady water - 192: [186, 211, 242], # cloudy water - } - - with rasterio.open(input_uint8_tif_2) as ds: - input_geobox = GridSpec.from_rio(ds) - data = ds.read(1) - - image_bytes = writer.create_thumbnail_singleband_from_numpy( - input_data=data, input_geobox=input_geobox, lookup_table=wofs_lookup - ) - - with open(outfile, "wb") as jpeg_file: - jpeg_file.write(image_bytes) - - assert_image(outfile, bands=3) diff --git a/tests/integration/test_tostac.py b/tests/integration/test_tostac.py index 8ee54bf2..3d8dc0a8 100644 --- 
a/tests/integration/test_tostac.py +++ b/tests/integration/test_tostac.py @@ -7,6 +7,8 @@ from eo3 import serialise from eo3.scripts import tostac +from eo3.utils import read_file +from eo3.validate import InvalidDatasetError from tests.common import assert_same, run_prepare_cli @@ -46,33 +48,6 @@ def test_tostac(odc_dataset_path: Path, expected_stac_doc: Dict): assert_same(expected_stac_doc, output_doc) -def test_tostac_no_grids(odc_dataset_path: Path, expected_stac_doc: Dict): - """ - Converted EO1 datasets don't have grid information. Make sure it still outputs - without falling over. - """ - - # Remove grids from the input.... - dataset = serialise.from_path(odc_dataset_path) - dataset.grids = None - serialise.to_path(odc_dataset_path, dataset) - - run_tostac(odc_dataset_path) - expected_output_path = odc_dataset_path.with_name( - odc_dataset_path.name.replace(".odc-metadata.yaml", ".stac-item.json") - ) - - # No longer expect proj fields (they come from grids). - remove_stac_properties( - expected_stac_doc, ("proj:shape", "proj:transform", "proj:epsg") - ) - # But we do still expect a global CRS. - expected_stac_doc["properties"]["proj:epsg"] = 32656 - - output_doc = json.load(expected_output_path.open()) - assert_same(expected_stac_doc, output_doc) - - def remove_stac_properties(doc: Dict, remove_properties=()): """ Remove the given fields from properties and assets. @@ -92,7 +67,7 @@ def test_add_property(input_doc_folder: Path): input_metadata_path = input_doc_folder.joinpath(ODC_METADATA_FILE) assert input_metadata_path.exists() - input_doc = serialise.load_yaml(input_metadata_path) + input_doc = read_file(input_metadata_path) input_doc["properties"]["test"] = "testvalue" serialise.dump_yaml(input_metadata_path, input_doc) @@ -112,30 +87,28 @@ def test_no_crs(input_doc_folder: Path): input_metadata_path = input_doc_folder.joinpath(ODC_METADATA_FILE) assert input_metadata_path.exists() - input_doc = serialise.load_yaml(input_metadata_path) + input_doc = read_file(input_metadata_path) del input_doc["crs"] serialise.dump_yaml(input_metadata_path, input_doc) assert input_metadata_path.exists() - with pytest.raises(RuntimeError) as exp: + with pytest.raises(InvalidDatasetError, match="incomplete_geometry"): run_tostac(input_metadata_path) - assert "Unexpected input encountered" in str(exp.value) def test_invalid_crs(input_doc_folder: Path): input_metadata_path = input_doc_folder.joinpath(ODC_METADATA_FILE) assert input_metadata_path.exists() - input_doc = serialise.load_yaml(input_metadata_path) + input_doc = read_file(input_metadata_path) input_doc["crs"] = "I-CANT-BELIEVE-ITS-NOT-A-VALID-CRS:4236" serialise.dump_yaml(input_metadata_path, input_doc) assert input_metadata_path.exists() - with pytest.raises(RuntimeError) as exp: + with pytest.raises(InvalidDatasetError, match="invalid_crs"): run_tostac(input_metadata_path) - assert "Invalid projection" in str(exp.value) def run_tostac(input_metadata_path: Path): diff --git a/tests/integration/test_validate.py b/tests/integration/test_validate.py index 2d7e8ada..0b5fd6d4 100644 --- a/tests/integration/test_validate.py +++ b/tests/integration/test_validate.py @@ -1,26 +1,20 @@ -from pathlib import Path -from textwrap import dedent -from typing import Dict, Union -from uuid import uuid4 +from typing import Dict -import numpy as np -import rasterio -from rasterio.io import DatasetWriter +import pytest +import toolz from eo3 import validate +from eo3.model import DatasetMetadata from eo3.validate import ( - DocKind, - ValidationExpectations, 
- filename_doc_kind, - guess_kind_from_contents, - validate_dataset, + InvalidDatasetError, + validate_ds_to_metadata_type, + validate_ds_to_product, + validate_ds_to_schema, ) from eo3.validation_msg import ValidationMessage from tests.common import MessageCatcher -Doc = Union[Dict, Path] - def test_val_msg_str(): msg = ValidationMessage.info( @@ -32,360 +26,166 @@ def test_val_msg_str(): assert "I don't like spam!" in msg_str -def test_dockind_legacy(): - assert not DocKind.dataset.is_legacy - assert DocKind.legacy_dataset.is_legacy - assert DocKind.ingestion_config.is_legacy - - -def test_valid_document_works(example_metadata: Dict): +def test_valid_document_works( + l1_ls8_folder_md_expected: Dict, eo3_product, metadata_type +): """All of our example metadata files should validate""" - msgs = MessageCatcher(validate_dataset(example_metadata)) + dataset = l1_ls8_folder_md_expected + msgs = MessageCatcher(validate_ds_to_schema(dataset)) assert not msgs.errors() + msgs = MessageCatcher(validate_ds_to_metadata_type(dataset, metadata_type)) + assert not msgs.errors() -def test_bad_crs(example_metadata: Dict): - example_metadata["crs"] = 4326 - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "epsg codes should be prefixed" in msgs.error_text() + msgs = MessageCatcher(validate_ds_to_product(dataset, eo3_product)) + assert not msgs.errors() def test_missing_field(example_metadata: Dict): """when a required field (id) is missing, validation should fail""" del example_metadata["id"] - msgs = MessageCatcher(validate_dataset(example_metadata)) + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) assert "'id' is a required property" in msgs.error_text() + with pytest.raises(InvalidDatasetError, match="structure"): + DatasetMetadata(example_metadata) + def test_invalid_eo3_schema(example_metadata: Dict): """When there's no eo3 $schema defined""" del example_metadata["$schema"] - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "no_schema:" in msgs.error_text() - example_metadata["$schema"] = "https://schemas.onepdapatube.org/dataset" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "unknown_doc_type" in msgs.error_text() - + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) + assert "$schema" in msgs.error_text() -def test_allow_optional_geo(example_metadata: Dict): - """A doc can omit all geo fields and be valid if not requiring geometry.""" - del example_metadata["crs"] - del example_metadata["geometry"] + example_metadata["$schema"] = "https://schemas.onepdapatube.org/dataset" + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) + assert "($schema)" in msgs.error_text() - for m in example_metadata["measurements"].values(): - if "grid" in m: - del m["grid"] - example_metadata["grids"] = {} - msgs = MessageCatcher(validate_dataset(example_metadata)) +def test_dataset_maturity(example_metadata: Dict): + """Dataset maturity is an optional but recommended field; schema validation + should warn if it is absent and error if it is incorrect""" + example_metadata["properties"]["dea:dataset_maturity"] = "blah" + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) assert msgs.errors() - expect = ValidationExpectations(require_geometry=False) - msgs = MessageCatcher(validate_dataset(example_metadata, expect=expect)) - assert "No geo information in dataset" in msgs.all_text() - assert not msgs.errors() + assert "dataset_maturity" in msgs.error_text() + example_metadata["properties"]["dea:dataset_maturity"] 
= "INTERIM" + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) + assert msgs.errors() + assert "dataset_maturity" in msgs.error_text() -def test_missing_geo_fields(example_metadata: Dict): - """If you have one gis field, you should have all of them""" - del example_metadata["crs"] - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "incomplete_crs" in msgs.error_text() - expect = ValidationExpectations(require_geometry=False) - msgs = MessageCatcher(validate_dataset(example_metadata, expect=expect)) - assert "incomplete_crs" in msgs.error_text() + del example_metadata["properties"]["dea:dataset_maturity"] + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) + assert not msgs.errors() + assert "recommended_field" in msgs.warning_text() def test_grid_custom_crs(example_metadata: Dict): - """A Measurement refers to a grid that doesn't exist""" + """Allow a grid to have its own crs, and error if crs is invalid""" example_metadata["grids"]["other_crs"] = { "crs": "epsg:32756", "shape": [2267, 1567], "transform": [50.0, 0.0, 257975.0, 0.0, -50.0, 6290325.0], } - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert not msgs.error_text() - assert not msgs.warning_text() + ds = DatasetMetadata(example_metadata) + grid = ds.grids.get("other_crs") + assert grid.crs == "epsg:32756" + assert ds.crs.epsg != 32756 - -def test_grid_custom_bad_crs(example_metadata: Dict): - """A Measurement refers to a grid that doesn't exist""" - example_metadata["grids"]["other_crs"] = { + example_metadata["grids"]["default"] = { "crs": "splunge:32756", "shape": [2267, 1567], "transform": [50.0, 0.0, 257975.0, 0.0, -50.0, 6290325.0], } - msgs = MessageCatcher(validate_dataset(example_metadata)) - errs = msgs.error_text() - assert "invalid_crs" in errs - assert "other_crs" in errs + with pytest.raises(InvalidDatasetError, match="invalid_crs"): + DatasetMetadata(example_metadata) def test_missing_grid_def(example_metadata: Dict): """A Measurement refers to a grid that doesn't exist""" a_measurement, *_ = list(example_metadata["measurements"]) example_metadata["measurements"][a_measurement]["grid"] = "unknown_grid" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "invalid_grid_ref" in msgs.error_text() + with pytest.raises(InvalidDatasetError, match="invalid_grid_ref"): + DatasetMetadata(example_metadata) def test_absolute_path_in_measurement(example_metadata: Dict): - """A Measurement refers to a grid that doesn't exist""" + """Warn if a measurement path is absolute""" a_measurement, *_ = list(example_metadata["measurements"]) example_metadata["measurements"][a_measurement][ "path" ] = "file:///this/is/an/utter/absolute/path.nc" - msgs = MessageCatcher(validate_dataset(example_metadata)) - warns = msgs.warning_text() - assert "absolute_path" in warns - assert a_measurement in warns + with pytest.warns(UserWarning, match="absolute_path"): + DatasetMetadata(example_metadata) def test_path_with_part_in_measurement(example_metadata: Dict): - """A Measurement refers to a grid that doesn't exist""" + """ + Measurement paths should not include parts; warn if they are present and error if they are invalid + """ a_measurement, *_ = list(example_metadata["measurements"]) example_metadata["measurements"][a_measurement]["path"] += "#part=0" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "uri_part" in msgs.warning_text() + with pytest.warns(UserWarning, match="uri_part"): + DatasetMetadata(example_metadata) 
example_metadata["measurements"][a_measurement]["path"] += "#part=nir" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "uri_part" in msgs.warning_text() - errs = msgs.error_text() - assert "uri_invalid_part" in errs - assert "nir" in errs + with pytest.raises(InvalidDatasetError, match="uri_invalid_part"): + DatasetMetadata(example_metadata) example_metadata["measurements"][a_measurement]["path"] += "#part=-22" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "uri_part" in msgs.warning_text() - errs = msgs.error_text() - assert "uri_invalid_part" in errs - assert "-22" in errs + with pytest.raises(InvalidDatasetError, match="uri_invalid_part"): + DatasetMetadata(example_metadata) -def test_absolute_path_in_accessory(example_metadata: Dict): - an_accessory, *_ = list(example_metadata["accessories"]) - example_metadata["accessories"][an_accessory][ - "path" - ] = "file:///this/is/an/utter/absolute/path.nc" - msgs = MessageCatcher(validate_dataset(example_metadata)) - warns = msgs.warning_text() - assert "absolute_path" in warns - assert an_accessory in warns - - -def test_invalid_shape(example_metadata: Dict): - """the geometry must be a valid shape""" - - # Points are in an invalid order. - example_metadata["geometry"] = { - "coordinates": ( - ( - (770_115.0, -2_768_985.0), - (525_285.0, -2_981_715.0), - (770_115.0, -2_981_715.0), - (525_285.0, -2_768_985.0), - (770_115.0, -2_768_985.0), - ), - ), - "type": "Polygon", - } - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "invalid_geometry" in msgs.error_text() - - -def test_crs_as_wkt(example_metadata: Dict): - """A CRS should be in epsg form if an EPSG exists, not WKT""" - example_metadata["crs"] = dedent( - """PROJCS["WGS 84 / UTM zone 55N", - GEOGCS["WGS 84", - DATUM["WGS_1984", - SPHEROID["WGS 84",6378137,298.257223563, - AUTHORITY["EPSG","7030"]], - AUTHORITY["EPSG","6326"]], - PRIMEM["Greenwich",0, - AUTHORITY["EPSG","8901"]], - UNIT["degree",0.01745329251994328, - AUTHORITY["EPSG","9122"]], - AUTHORITY["EPSG","4326"]], - UNIT["metre",1, - AUTHORITY["EPSG","9001"]], - PROJECTION["Transverse_Mercator"], - PARAMETER["latitude_of_origin",0], - PARAMETER["central_meridian",147], - PARAMETER["scale_factor",0.9996], - PARAMETER["false_easting",500000], - PARAMETER["false_northing",0], - AUTHORITY["EPSG","32655"], - AXIS["Easting",EAST], - AXIS["Northing",NORTH]] - """ +def test_product_name_mismatch(l1_ls8_folder_md_expected: Dict, eo3_product): + """Dataset product name doesn't match product name of given product""" + eo3_product["name"] = "wrong_product_name" + msgs = MessageCatcher( + validate_ds_to_product(l1_ls8_folder_md_expected, eo3_product) ) - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert not msgs.errors() - assert "non_epsg" in msgs.warning_text() - assert "change CRS to 'epsg:32655'" in msgs.warning_text() - - -def test_flat_lineage(example_metadata: Dict): - example_metadata["lineage"] = { - "spam": [str(uuid4())], - "bacon": [str(uuid4())], - "eggs": [str(uuid4())], - } - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert not msgs.error_text() - assert not msgs.warning_text() - assert "nonflat_lineage" not in msgs.info_text() - - -def test_nonflat_lineage(example_metadata: Dict): - example_metadata["lineage"] = { - "spam": [str(uuid4()), str(uuid4()), str(uuid4())], - } - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert not msgs.error_text() - assert not msgs.warning_text() - assert "nonflat_lineage" in 
msgs.info_text() + assert "product_mismatch" in msgs.error_text() -def test_non_uuids_in_lineage(example_metadata: Dict): - example_metadata["lineage"] = { - "spam": [str(uuid4())], - "eggs": [str(uuid4()), "scrambled"], - "beans": [str(uuid4()), str(uuid4()), str(uuid4())], - } - msgs = MessageCatcher(validate_dataset(example_metadata)) - errs = msgs.error_text() - assert "invalid_source_id" in errs - assert "scrambled" in errs - assert "eggs" in errs - - -def test_valid_with_product_doc(l1_ls8_folder_md_expected: Dict, product: Dict) -> Path: - """When a product is specified, it will validate that the measurements match the product""" - product["name"] = l1_ls8_folder_md_expected["product"]["name"] - # Document is valid on its own. - msgs = MessageCatcher(validate_dataset(l1_ls8_folder_md_expected)) - assert not msgs.errors() - # It contains all measurements in the product, so will be valid when not thorough. - msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=product) +def test_measurements_match_product(l1_ls8_folder_md_expected: Dict, eo3_product): + """Validate that the dataset measurements match the product""" + measurements = l1_ls8_folder_md_expected["measurements"] + # add extra measurement not defined in product + measurements = toolz.assoc( + measurements, "new_measurement", {"path": "measurement_path"} ) - assert not msgs.errors() + # remove measurement expected by product + measurements = toolz.dissoc(measurements, "blue") + l1_ls8_folder_md_expected["measurements"] = measurements - # Remove some expected measurements from product - should get warnings now - product["default_allowances"]["allow_extra_measurements"] = [ - "cirrus", - "coastal_aerosol", - "red", - "green", - "blue", - "nir", - "swir_1", - "swir_2", - "panchromatic", - ] msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=product) + validate_ds_to_product(l1_ls8_folder_md_expected, eo3_product) ) + assert "missing_measurement" in msgs.error_text() assert "extra_measurements" in msgs.warning_text() - assert "quality" in msgs.warning_text() - assert "lwir_1" in msgs.warning_text() - assert not msgs.errors() - - expect = ValidationExpectations( - allow_extra_measurements=[ - "lwir_1", - "lwir_2", - "quality", - ] - ) - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, product_definition=product, expect=expect - ) - ) - assert not msgs.errors() + assert "new_measurement" in msgs.warning_text() -# @pytest.mark.skip("This check is outside the current callpath.") -def test_complains_about_product_not_matching( +def test_product_metadata_mismatch( l1_ls8_folder_md_expected: Dict, eo3_product, ): """ - Complains when we're given products but they don't match the dataset + Complains when a dataset doesn't contain all metadata properties given by the product """ - # A metadata field that's not in the dataset. 
eo3_product["metadata"]["properties"]["favourite_sandwich"] = "spam" msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=eo3_product) - ) - assert "metadata_mismatch" in msgs.error_text() - - -def test_complains_when_no_product( - l1_ls8_folder_md_expected: Dict, -): - """When a product is specified, it will validate that the measurements match the product""" - # Thorough checking should fail when there's no product provided - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, thorough=True, product_definition=None + validate_ds_to_product( + l1_ls8_folder_md_expected, product_definition=eo3_product ) ) - assert "no_product" in msgs.error_text() - - -def test_is_product(): - """Product documents should be correctly identified as products""" - product = dict( - name="minimal_product", metadata_type="eo3", measurements=[dict(name="blue")] - ) - assert guess_kind_from_contents(product) == DocKind.product - - -def test_is_ingestion(): - """Product documents should be correctly identified as products""" - product = dict( - name="minimal_product", metadata_type="eo3", measurements=[dict(name="blue")] - ) - assert guess_kind_from_contents(product) == DocKind.product - - -def test_is_metadata_type(): - """Product documents should be correctly identified as products""" - mdt = dict(name="minimal_mdt", dataset=dict(search_fields=dict())) - assert guess_kind_from_contents(mdt) == DocKind.metadata_type - - -def test_is_legacy_dataset(): - """Product documents should be correctly identified as products""" - ds = dict(id="spam", lineage=["sources"], platform="boots") - assert guess_kind_from_contents(ds) == DocKind.legacy_dataset - - -def test_is_legacy_ingestion_cfg(): - """Product documents should be correctly identified as products""" - ds = dict(metadata_type="foo", source_type="bar") - assert guess_kind_from_contents(ds) == DocKind.ingestion_config - - -def test_is_stac(): - """Product documents should be correctly identified as products""" - ds = dict(id="spam", properties=dict(datetime="today, right now")) - assert guess_kind_from_contents(ds) == DocKind.stac_item - - -def test_not_a_dockind(): - """Product documents should be correctly identified as products""" - product = dict(spam="spam", bacon="eggs", interruptions="vikings") - assert guess_kind_from_contents(product) is None + assert "metadata_mismatch" in msgs.error_text() def test_has_offset(): + """_has_offset helper function for checking missing offsets""" doc = dict(spam="spam", bacon="eggs", atmosphere=dict(interruptions="vikings")) from eo3.validate import _has_offset @@ -394,19 +194,9 @@ def test_has_offset(): assert not _has_offset(doc, ["eggs"]) -def test_dataset_is_not_a_product(example_metadata: Dict): - """ - Datasets should not be identified as products - - (checks all example metadata files) - """ - assert guess_kind_from_contents(example_metadata) == DocKind.dataset - assert filename_doc_kind(Path("asdf.odc-metadata.yaml")) == DocKind.dataset - - def test_get_field_offsets(metadata_type: Dict): """ - Test the get_field_offsets function. 
+ Test the get_field_offsets function, should return all field offsets defined by the metadata type """ assert list(validate._get_field_offsets(metadata_type)) == [ ("id", [["id"]]), @@ -425,220 +215,46 @@ def test_get_field_offsets(metadata_type: Dict): ["properties", "datetime"], ], ), + ( + "lat", + [ + ["extent", "lat", "begin"], + ["extent", "lat", "end"], + ], + ), + ( + "lon", + [ + ["extent", "lon", "begin"], + ["extent", "lon", "end"], + ], + ), ] -def test_validate_ds_with_metadata_doc( - l1_ls8_metadata_path: str, - metadata_type, - l1_ls8_folder_md_expected: Dict, -): - # When thorough, the dtype and nodata are wrong - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - metadata_type_definition=metadata_type, - readable_location=l1_ls8_metadata_path, - ) - ) - assert not msgs.error_text() - assert not msgs.warning_text() - - -def test_validate_ds_with_metadata_doc_warnings( - l1_ls8_metadata_path: str, +def test_validate_ds_to_metadata_type( metadata_type, l1_ls8_folder_md_expected: Dict, ): + """ + Validator should allow a document that doesn't include all the metadata type fields, + but should warn about these missing fields + """ metadata_type["dataset"]["search_fields"]["foobar"] = { "description": "A required property that is missing", "type": "string", "offset": ["properties", "eo3:foobar"], } msgs = MessageCatcher( - validate_dataset( + validate_ds_to_metadata_type( l1_ls8_folder_md_expected, metadata_type_definition=metadata_type, - readable_location=l1_ls8_metadata_path, ) ) assert not msgs.error_text() warns = msgs.warning_text() assert "missing_field" in warns assert "foobar" in warns - l1_ls8_folder_md_expected["properties"]["eo3:foobar"] = None - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - metadata_type_definition=metadata_type, - readable_location=l1_ls8_metadata_path, - ) - ) - assert not msgs.error_text() - assert not msgs.warning_text() - infos = msgs.info_text() - assert "null_field" in infos - assert "foobar" in infos - - -def test_validate_location_deprec( - l1_ls8_folder_md_expected: Dict, -): - l1_ls8_folder_md_expected["location"] = "file:///path/to" - # When thorough, the dtype and nodata are wrong - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - ) - ) - assert "dataset_location" in msgs.warning_text() - - -def test_dtype_compare_with_product_doc( - l1_ls8_metadata_path: str, - eo3_product, - l1_ls8_folder_md_expected: Dict, -): - """'thorough' validation should check the dtype of measurements against the product""" - - eo3_product["measurements"] = [ - dict(name="blue", dtype="uint8", units="1", nodata=255) - ] - - # When thorough, the dtype and nodata are wrong - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - err_text = msgs.error_text() - assert "different_dtype" in err_text - assert "blue" in err_text - assert "uint8" in err_text - - -def test_nodata_compare_with_product_doc( - l1_ls8_metadata_path: str, - eo3_product, - l1_ls8_folder_md_expected: Dict, -): - """'thorough' validation should check the nodata of measurements against the product""" - - # Remake the tiff with a 'nodata' set. 
- blue_tif = ( - l1_ls8_metadata_path.parent - / l1_ls8_folder_md_expected["measurements"]["blue"]["path"] - ) - _create_dummy_tif( - blue_tif, - dtype="uint16", - nodata=65535, - ) - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - assert not msgs.errors() - assert not msgs.warnings() - assert not msgs.infos() - - # Override blue definition with invalid nodata value. - _measurement(eo3_product, "blue")["nodata"] = 255 - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - assert "different_nodata" in msgs.error_text() - - -def test_measurements_compare_with_nans( - l1_ls8_metadata_path: str, - eo3_product, - l1_ls8_folder_md_expected: Dict, -): - """When dataset and product have NaN nodata values, it should handle them correctly""" - product = eo3_product - blue_tif = ( - l1_ls8_metadata_path.parent - / l1_ls8_folder_md_expected["measurements"]["blue"]["path"] - ) - - # When both are NaN, it should be valid - blue = _measurement(product, "blue") - blue["nodata"] = float("NaN") - blue["dtype"] = "float32" - _create_dummy_tif(blue_tif, nodata=float("NaN")) - - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - assert not msgs.errors() - assert not msgs.warnings() - assert not msgs.infos() - - # ODC can also represent NaNs as strings due to json's lack of NaN - blue["nodata"] = "NaN" - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - assert not msgs.errors() - assert not msgs.warnings() - assert not msgs.infos() - - # When product is set, dataset is NaN, they no longer match. - blue["nodata"] = 0 - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - errtxt = msgs.error_text() - assert "different_nodata" in errtxt - assert "blue" in errtxt - assert "dataset nan" in errtxt - assert "product 0" in errtxt - - -def test_missing_measurement_from_product( - l1_ls8_folder_md_expected: Dict, - eo3_product, -): - """Validator should notice a missing measurement from the product def""" - product = eo3_product - product["name"] = "test_with_extra_measurement" - product["measurements"] = [ - dict(name="razzmatazz", dtype="int32", units="1", nodata=-999) - ] - msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=eo3_product) - ) - errtxt = msgs.error_text() - assert "missing_measurement" in errtxt - assert "razzmatazz" in errtxt def test_supports_measurementless_products( @@ -654,7 +270,7 @@ def test_supports_measurementless_products( """ eo3_product["measurements"] = [] msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=eo3_product) + validate_ds_to_product(l1_ls8_folder_md_expected, eo3_product) ) assert not msgs.errors() @@ -670,30 +286,5 @@ def test_product_no_href( Level 1 products. 
""" del l1_ls8_folder_md_expected["product"]["href"] - msgs = MessageCatcher(validate_dataset(l1_ls8_folder_md_expected)) - assert not msgs.errors() - assert "product_href" in msgs.info_text() - - -def _measurement(product: Dict, name: str): - """Get a measurement by name""" - for m in product["measurements"]: - if m["name"] == name: - return m - raise ValueError(f"Measurement {name} not found?") - - -def _create_dummy_tif(blue_tif, nodata=None, dtype="float32", **opts): - with rasterio.open( - blue_tif, - "w", - width=10, - height=10, - count=1, - dtype=dtype, - driver="GTiff", - nodata=nodata, - **opts, - ) as ds: - ds: DatasetWriter - ds.write(np.ones((10, 10), dtype=dtype), 1) + with pytest.warns(UserWarning, match="product->href"): + DatasetMetadata(l1_ls8_folder_md_expected) diff --git a/tests/test_documents.py b/tests/test_documents.py deleted file mode 100644 index de38f684..00000000 --- a/tests/test_documents.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Module -""" - -from eo3.documents import _find_any_metadata_suffix, find_metadata_path - -from tests import write_files - - -def test_find_metadata_path(): - files = write_files( - { - "directory_dataset": { - "file1.txt": "", - "file2.txt": "", - "ga-metadata.yaml.gz": "", - }, - "file_dataset.tif": "", - "file_dataset.agdc-md.yaml": "", - "dataset_metadata.yaml": "", - "no_metadata.tif": "", - # Newer eo3-style names.` - # Sibling - "newer-dataset.tar": "", - "newer-dataset.odc-metadata.yaml": "", - # Directory - "newer_directory_dataset": { - "newer-dataset.txt": "", - "newer-dataset-b2.txt": "", - "newer-dataset.odc-metadata.yaml.gz": "", - }, - } - ) - - # A metadata file can be specified directly. - path = find_metadata_path(files.joinpath("dataset_metadata.yaml")) - assert path.absolute() == files.joinpath("dataset_metadata.yaml").absolute() - - # A older dataset directory will have an internal 'agdc-metadata' file. - path = find_metadata_path(files.joinpath("directory_dataset")) - assert ( - path.absolute() - == files.joinpath("directory_dataset", "ga-metadata.yaml.gz").absolute() - ) - - # Other older files can have a sibling file ending in 'agdc-md.yaml' - path = find_metadata_path(files.joinpath("file_dataset.tif")) - assert path.absolute() == files.joinpath("file_dataset.agdc-md.yaml").absolute() - - # No metadata to find. - assert find_metadata_path(files.joinpath("no_metadata.tif")) is None - - # Dataset itself doesn't exist. 
-    assert find_metadata_path(files.joinpath("missing-dataset.tif")) is None
-
-    # EO3-style dataset metadata
-    path = find_metadata_path(files.joinpath("newer-dataset.tar"))
-    assert (
-        path.absolute() == files.joinpath("newer-dataset.odc-metadata.yaml").absolute()
-    )
-
-    # EO3-style dataset in a directory
-    path = find_metadata_path(files.joinpath("newer_directory_dataset"))
-    assert (
-        path.absolute()
-        == files.joinpath(
-            "newer_directory_dataset", "newer-dataset.odc-metadata.yaml.gz"
-        ).absolute()
-    )
-
-
-def test_find_any_metatadata_suffix():
-    files = write_files(
-        {
-            "directory_dataset": {
-                "file1.txt": "",
-                "file2.txt": "",
-                "agdc-metadata.json.gz": "",
-            },
-            "file_dataset.tif.ga-md.yaml": "",
-            "dataset_metadata.YAML": "",
-            "no_metadata.tif": "",
-        }
-    )
-
-    path = _find_any_metadata_suffix(files.joinpath("dataset_metadata"))
-    assert path.absolute() == files.joinpath("dataset_metadata.YAML").absolute()
-
-    path = _find_any_metadata_suffix(
-        files.joinpath("directory_dataset", "agdc-metadata")
-    )
-    assert (
-        path.absolute()
-        == files.joinpath("directory_dataset", "agdc-metadata.json.gz").absolute()
-    )
-
-    path = _find_any_metadata_suffix(files.joinpath("file_dataset.tif.ga-md"))
-    assert path.absolute() == files.joinpath("file_dataset.tif.ga-md.yaml").absolute()
-
-    # Returns none if none exist
-    path = _find_any_metadata_suffix(files.joinpath("no_metadata"))
-    assert path is None
diff --git a/tests/test_eo3_core.py b/tests/test_eo3_core.py
new file mode 100644
index 00000000..a5b7351a
--- /dev/null
+++ b/tests/test_eo3_core.py
@@ -0,0 +1,290 @@
+"""
+Tests for eo3.eo3_core: EO3Grid handling and EO3 document preparation.
+"""
+import pytest
+from affine import Affine
+from odc.geo.geom import CRS, polygon
+from ruamel.yaml import YAML
+
+from eo3.eo3_core import (
+    EO3Grid,
+    add_eo3_parts,
+    eo3_grid_spatial,
+    is_doc_eo3,
+    is_doc_geo,
+    prep_eo3,
+)
+
+SAMPLE_DOC = """---
+$schema: https://schemas.opendatacube.org/dataset
+id: 7d41a4d0-2ab3-4da1-a010-ef48662ae8ef
+crs: "EPSG:3857"
+product:
+  name: sample_product
+properties:
+  datetime: 2020-05-25 23:35:47.745731Z
+  odc:processing_datetime: 2020-05-25 23:35:47.745731Z
+grids:
+  default:
+    shape: [100, 200]
+    transform: [10, 0, 100000, 0, -10, 200000, 0, 0, 1]
+lineage:
+  src_a: ['7cf53cb3-5da7-483f-9f12-6056e3290b4e']
+  src_b:
+    - 'f5b9f582-d5ff-43c0-a49b-ef175abe429c'
+    - '7f8c6e8e-6f6b-4513-a11c-efe466405509'
+  src_empty: []
+...
+"""
+
+# Crosses lon=180 line in Pacific, taken from one of the Landsat scenes
+# https://landsat-pds.s3.amazonaws.com/c1/L8/074/071/LC08_L1TP_074071_20190622_20190704_01_T1/index.html
+#
+SAMPLE_DOC_180 = """---
+$schema: https://schemas.opendatacube.org/dataset
+id: f884df9b-4458-47fd-a9d2-1a52a2db8a1a
+crs: "EPSG:32660"
+product:
+  name: sample_product
+properties:
+  datetime: 2020-05-25 23:35:47.745731Z
+  odc:processing_datetime: 2020-05-25 23:35:47.745731Z
+grids:
+  default:
+    shape: [7811, 7691]
+    transform: [30, 0, 618285, 0, -30, -1642485, 0, 0, 1]
+  pan:
+    shape: [15621, 15381]
+    transform: [15, 0, 618292.5, 0, -15, -1642492.5, 0, 0, 1]
+lineage: {}
+...
+""" + + +@pytest.fixture +def basic_grid(): + return EO3Grid(dict(shape=(100, 100), transform=Affine(0, 100, 50, 100, 0, 50))) + + +@pytest.fixture +def sample_doc(): + return YAML(typ="safe").load(SAMPLE_DOC) + + +@pytest.fixture +def sample_doc_180(): + return YAML(typ="safe").load(SAMPLE_DOC_180) + + +def test_grid_ref_points(basic_grid): + ref_pts = basic_grid.ref_points() + assert ref_pts["ul"] == {"x": 50, "y": 50} + assert ref_pts["lr"] == {"x": 10050, "y": 10050} + assert ref_pts["ur"] == {"x": 50, "y": 10050} + assert ref_pts["ll"] == {"x": 10050, "y": 50} + + +def test_polygon(basic_grid): + poly = basic_grid.polygon() + assert poly == polygon( + [ + (50, 50), + (50, 10050), + (10050, 10050), + (10050, 50), + (50, 50), + ], + crs=None, + ) + + +def test_grid_crs(basic_grid): + crs = CRS("EPSG:4326") + poly = basic_grid.polygon(crs) + assert poly == polygon( + [ + (50, 50), + (50, 10050), + (10050, 10050), + (10050, 50), + (50, 50), + ], + crs=crs, + ) + basic_grid.crs = crs + poly = basic_grid.polygon() + assert poly == polygon( + [ + (50, 50), + (50, 10050), + (10050, 10050), + (10050, 50), + (50, 50), + ], + crs=crs, + ) + + +def test_grid_points(): + identity = list(Affine.translation(0, 0)) + grid = EO3Grid({"shape": (11, 22), "transform": identity}) + + pts = grid.points() + assert len(pts) == 4 + assert pts == [(0, 0), (22, 0), (22, 11), (0, 11)] + pts_ = grid.points(ring=True) + assert len(pts_) == 5 + assert pts == pts_[:4] + assert pts_[0] == pts_[-1] + + grid = EO3Grid({"shape": (11, 22), "transform": tuple(Affine.translation(100, 0))}) + pts = grid.points() + assert pts == [(100, 0), (122, 0), (122, 11), (100, 11)] + + for bad in [{}, dict(shape=(1, 1)), dict(transform=identity)]: + with pytest.raises(ValueError): + grid = EO3Grid(bad) + + +def test_bad_grids(): + identity = list(Affine.translation(0, 0)) + bad_grids = [ + # No Shape + { + "transform": identity, + }, + # Non 2-d Shape (NB: geospatial dimensions only. Other dimensions are handled elsewhere.) + { + "shape": (1024,), + "transform": identity, + }, + { + "shape": (1024, 564, 256), + "transform": identity, + }, + # No Transform + { + "shape": (1024, 256), + }, + # Formally invalid affine transform (must be 6 or 9 elements) + { + "shape": (1024, 256), + "transform": [343.3], + }, + { + "shape": (1024, 256), + "transform": [343, 23345, 234, 9, -65.3], + }, + { + "shape": (1024, 256), + "transform": [343, 23345, 234, 9, -65.3, 1, 0], + }, + { + "shape": (1024, 256), + "transform": [ + 343, + 23345, + 234, + 9, + -65.3, + 1, + 0, + 7435.24563, + 0.0001234, + 888.888, + 3, + 3, + 2, + ], + }, + # Formally invalid affine transform (all elements must be numbers) + {"shape": (1024, 256), "transform": [343, 23345, 234, 9, -65.3, "six"]}, + # Formally invalid affine transform (in 9 element form, last 3 numbers must be 0,0,1) + { + "shape": (1024, 256), + "transform": [343, 23345, 234, 9, -65.3, 1, 3, 3, 2], + }, + ] + for bad_grid in bad_grids: + with pytest.raises(ValueError): + EO3Grid(bad_grid) + + +def test_eo3_grid_spatial_nogrids(): + with pytest.raises(ValueError, match="grids.foo"): + eo3_grid_spatial( + { + "crs": "EPSG:4326", + "grids": { + "default": { + "shape": (1024, 256), + "transform": [343, 23345, 234, 9, -65.3, 1], + } + }, + }, + grid_name="foo", + ) + + +def test_is_eo3(sample_doc, sample_doc_180): + assert is_doc_eo3(sample_doc) is True + assert is_doc_eo3(sample_doc_180) is True + + # If there's no schema field at all, it's treated as legacy eo. 
+ assert is_doc_eo3({}) is False + assert is_doc_eo3({"crs": "EPSG:4326"}) is False + assert is_doc_eo3({"crs": "EPSG:4326", "grids": {}}) is False + + with pytest.raises(ValueError, match="Unsupported dataset schema.*"): + is_doc_eo3({"$schema": "https://schemas.opendatacube.org/eo4"}) + + +def test_is_geo(sample_doc, sample_doc_180): + assert is_doc_geo(sample_doc) is True + assert is_doc_geo(sample_doc_180) is True + + assert is_doc_geo({}) is False + assert is_doc_geo({"crs": "EPSG:4326"}) is False + assert is_doc_geo({"crs": "EPSG:4326", "extent": "dummy_extent"}) is True + + +def test_add_gs_info(sample_doc, sample_doc_180): + doc = dict(**sample_doc) + doc.pop("crs") + with pytest.raises(ValueError): + add_eo3_parts(doc) + + doc = dict(**sample_doc) + doc.pop("grids") + with pytest.raises(ValueError): + add_eo3_parts(doc) + + doc = add_eo3_parts(sample_doc) + assert doc is not sample_doc + assert doc.get("crs") == "EPSG:3857" + assert doc.get("extent") is not None + assert doc.get("grid_spatial") is not None + assert doc["extent"]["lat"]["begin"] < doc["extent"]["lat"]["end"] + assert doc["extent"]["lon"]["begin"] < doc["extent"]["lon"]["end"] + + assert doc == add_eo3_parts(doc) + + doc = add_eo3_parts(sample_doc_180) + assert doc is not sample_doc_180 + assert doc["extent"]["lon"]["begin"] < 180 < doc["extent"]["lon"]["end"] + + +def test_prep_eo3(sample_doc, sample_doc_180): + doc = prep_eo3(sample_doc) + + assert "src_a" in doc["lineage"]["source_datasets"] + assert "src_b1" in doc["lineage"]["source_datasets"] + assert "src_b2" in doc["lineage"]["source_datasets"] + assert "src_empty" not in doc["lineage"]["source_datasets"] + + doc = prep_eo3(sample_doc_180) + assert doc["lineage"]["source_datasets"] == {} + + assert prep_eo3(None) is None + with pytest.raises(ValueError): + prep_eo3({}) diff --git a/tests/test_model.py b/tests/test_model.py deleted file mode 100644 index 0e8b0cc7..00000000 --- a/tests/test_model.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Module -""" - - -import pytest -from affine import Affine -from odc.geo.geom import CRS, polygon - -from eo3.model import GridDoc - - -@pytest.fixture -def basic_grid(): - return GridDoc(shape=(100, 100), transform=Affine(0, 100, 50, 100, 0, 50)) - - -def test_grid_ref_points(basic_grid): - ref_pts = basic_grid.ref_points() - assert ref_pts["ul"] == {"x": 50, "y": 50} - assert ref_pts["lr"] == {"x": 10050, "y": 10050} - assert ref_pts["ur"] == {"x": 50, "y": 10050} - assert ref_pts["ll"] == {"x": 10050, "y": 50} - - -def test_grid_points(basic_grid): - pts = basic_grid.points(ring=True) - assert pts == [ - (50, 50), - (50, 10050), - (10050, 10050), - (10050, 50), - (50, 50), - ] - - -def test_polygon(basic_grid): - poly = basic_grid.polygon() - assert poly == polygon( - [ - (50, 50), - (50, 10050), - (10050, 10050), - (10050, 50), - (50, 50), - ], - crs=None, - ) - - -def test_grid_crs(basic_grid): - crs = CRS("EPSG:4326") - poly = basic_grid.polygon(crs) - assert poly == polygon( - [ - (50, 50), - (50, 10050), - (10050, 10050), - (10050, 50), - (50, 50), - ], - crs=crs, - ) - basic_grid.crs = crs - poly = basic_grid.polygon() - assert poly == polygon( - [ - (50, 50), - (50, 10050), - (10050, 10050), - (10050, 50), - (50, 50), - ], - crs=crs, - ) diff --git a/tests/test_properties.py b/tests/test_properties.py deleted file mode 100644 index 5308cee5..00000000 --- a/tests/test_properties.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Module -""" - -from enum import Enum - -import pytest - -from eo3.properties import FileFormat, 
degrees_type, of_enum_type, percent_type - - -class LowerEnum(Enum): - spam = 1 - bacon = 2 - eggs = 3 - beans = 4 - - -class UpperEnum(Enum): - SPAM = 1 - BACON = 2 - EGGS = 3 - BEANS = 4 - - -def test_of_enum_type(): - ff = of_enum_type(FileFormat) - assert ff("GeoTIFF") == "GeoTIFF" - assert ff(FileFormat.GeoTIFF) == "GeoTIFF" - with pytest.raises(ValueError): - assert ff("GeoTUFF") == "GeoTIFF" - ff = of_enum_type(FileFormat, strict=False) - assert ff("GeoTUFF") == "GeoTUFF" - - low = of_enum_type(LowerEnum, lower=True) - assert low("spam") == "spam" - assert low("BACON") == "bacon" - - upp = of_enum_type(UpperEnum, upper=True) - assert upp("spam") == "SPAM" - assert upp("BACON") == "BACON" - - -def test_percent_type(): - assert percent_type("2.22") == pytest.approx(2.22) - with pytest.raises(ValueError): - percent_type("-2.2") - with pytest.raises(ValueError): - percent_type("104.6666") - - -def test_degrees_type(): - assert degrees_type("355.3") == pytest.approx(355.3) - with pytest.raises(ValueError): - percent_type("-2.2") - with pytest.raises(ValueError): - percent_type("404.6666") diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..00859744 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,200 @@ +""" +Test utility functions +(tests copied from datacube-core/tests/test_utils_docs.py and test_utils_generic.py) +""" +from collections import OrderedDict +from pathlib import Path +from typing import Iterable, Tuple + +import numpy as np +import pytest + +from eo3.utils import ( + as_url, + jsonify_document, + netcdf_extract_string, + read_documents, + thread_local_cache, +) +from eo3.utils.utils import _open_from_s3, map_with_lookahead, transform_object_tree + + +@pytest.fixture +def sample_document_files(): + files = [ + ("multi_doc.yml", 3), + ("multi_doc.yml.gz", 3), + ("multi_doc.nc", 3), + ("single_doc.yaml", 1), + ("sample.json", 1), + ] + + files = [ + (str(Path(__file__).parent / "data" / f), num_docs) for f, num_docs in files + ] + + return files + + +def test_read_docs_from_local_path(sample_document_files): + _test_read_docs_impl(sample_document_files) + + +def test_read_docs_from_file_uris(sample_document_files): + uris = [("file://" + doc, ndocs) for doc, ndocs in sample_document_files] + _test_read_docs_impl(uris) + + +def test_read_docs_from_s3(sample_document_files, monkeypatch): + """ + Use a mocked S3 bucket to test reading documents from S3 + """ + boto3 = pytest.importorskip("boto3") + moto = pytest.importorskip("moto") + + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "fake") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "fake") + + with moto.mock_s3(): + s3 = boto3.resource("s3", region_name="us-east-1") + bucket = s3.create_bucket(Bucket="mybucket") + + mocked_s3_objs = [] + for abs_fname, ndocs in sample_document_files: + if abs_fname.endswith("gz") or abs_fname.endswith("nc"): + continue + + fname = Path(abs_fname).name + bucket.upload_file(abs_fname, fname) + + mocked_s3_objs.append(("s3://mybucket/" + fname, ndocs)) + + _test_read_docs_impl(mocked_s3_objs) + + with pytest.raises(RuntimeError): + with _open_from_s3("https://not-s3.ga/file.txt"): + pass + + +def test_read_docs_from_http(sample_document_files, httpserver): + http_docs = [] + for abs_fname, ndocs in sample_document_files: + if abs_fname.endswith("gz") or abs_fname.endswith("nc"): + continue + path = "/" + Path(abs_fname).name + + httpserver.expect_request(path).respond_with_data(open(abs_fname).read()) + http_docs.append((httpserver.url_for(path), ndocs)) + + 
_test_read_docs_impl(http_docs) + + +def _test_read_docs_impl(sample_documents: Iterable[Tuple[str, int]]): + # Test case for returning URIs pointing to documents + for doc_url, num_docs in sample_documents: + all_docs = list(read_documents(doc_url, uri=True)) + assert len(all_docs) == num_docs + + for uri, doc in all_docs: + assert isinstance(doc, dict) + assert isinstance(uri, str) + + url = as_url(doc_url) + if num_docs > 1: + expect_uris = [as_url(url) + f"#part={i}" for i in range(num_docs)] + else: + expect_uris = [as_url(url)] + + assert [f for f, _ in all_docs] == expect_uris + + +def test_netcdf_strings(): + assert netcdf_extract_string(np.asarray([b"a", b"b"])) == "ab" + txt = "some string" + assert netcdf_extract_string(txt) is txt + + +def test_jsonify(): + from datetime import datetime + from decimal import Decimal + from uuid import UUID + + assert sorted( + jsonify_document( + { + "a": (1.0, 2.0, 3.0), + "b": float("inf"), + "c": datetime(2016, 3, 11), + "d": np.dtype("int16"), + } + ).items() + ) == [ + ("a", (1.0, 2.0, 3.0)), + ("b", "Infinity"), + ("c", "2016-03-11T00:00:00"), + ("d", "int16"), + ] + + # Converts keys to strings: + assert sorted(jsonify_document({1: "a", "2": Decimal("2")}).items()) == [ + ("1", "a"), + ("2", "2"), + ] + + assert jsonify_document({"k": UUID("1f231570-e777-11e6-820f-185e0f80a5c0")}) == { + "k": "1f231570-e777-11e6-820f-185e0f80a5c0" + } + + +def test_transform_object_tree(): + def add_one(a): + return a + 1 + + assert transform_object_tree(add_one, [1, 2, 3]) == [2, 3, 4] + assert transform_object_tree(add_one, {"a": 1, "b": 2, "c": 3}) == { + "a": 2, + "b": 3, + "c": 4, + } + assert transform_object_tree(add_one, {"a": 1, "b": (2, 3), "c": [4, 5]}) == { + "a": 2, + "b": (3, 4), + "c": [5, 6], + } + assert transform_object_tree( + add_one, {1: 1, "2": 2, 3.0: 3}, key_transform=float + ) == {1.0: 2, 2.0: 3, 3.0: 4} + # Order must be maintained + assert transform_object_tree( + add_one, OrderedDict([("z", 1), ("w", 2), ("y", 3), ("s", 7)]) + ) == OrderedDict([("z", 2), ("w", 3), ("y", 4), ("s", 8)]) + + +def test_map_with_lookahead(): + def if_one(x): + return "one" + str(x) + + def if_many(x): + return "many" + str(x) + + assert list(map_with_lookahead(iter([]), if_one, if_many)) == [] + assert list(map_with_lookahead(iter([1]), if_one, if_many)) == [if_one(1)] + assert list(map_with_lookahead(range(5), if_one, if_many)) == list( + map(if_many, range(5)) + ) + assert list(map_with_lookahead(range(10), if_one=if_one)) == list(range(10)) + assert list(map_with_lookahead(iter([1]), if_many=if_many)) == [1] + + +def test_thread_local_cache(): + name = "test_0123394" + v = {} + + assert thread_local_cache(name, v) is v + assert thread_local_cache(name) is v + assert thread_local_cache(name, purge=True) is v + assert thread_local_cache(name, 33) == 33 + assert thread_local_cache(name, purge=True) == 33 + + assert thread_local_cache("no_such_key", purge=True) is None + assert thread_local_cache("no_such_key", 111, purge=True) == 111 diff --git a/tests/test_utils_aws.py b/tests/test_utils_aws.py new file mode 100644 index 00000000..c73461cc --- /dev/null +++ b/tests/test_utils_aws.py @@ -0,0 +1,157 @@ +# This file is part of the Open Data Cube, see https://opendatacube.org for more information +# +# Copyright (c) 2015-2023 ODC Contributors +# SPDX-License-Identifier: Apache-2.0 +import json +from unittest import mock + +import botocore +import pytest +from botocore.credentials import ReadOnlyCredentials + +from eo3.utils.aws import ( + _fetch_text, 
+    _s3_cache_key,
+    auto_find_region,
+    ec2_current_region,
+    s3_client,
+    s3_fmt_range,
+    s3_url_parse,
+)
+
+AWS_ENV_VARS = (
+    "AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN "
+    "AWS_DEFAULT_REGION AWS_DEFAULT_OUTPUT AWS_PROFILE "
+    "AWS_ROLE_SESSION_NAME AWS_CA_BUNDLE "
+    "AWS_SHARED_CREDENTIALS_FILE AWS_CONFIG_FILE"
+).split(" ")
+
+
+@pytest.fixture
+def without_aws_env(monkeypatch):
+    for e in AWS_ENV_VARS:
+        monkeypatch.delenv(e, raising=False)
+
+
+def _json(**kw):
+    return json.dumps(kw)
+
+
+def mock_urlopen(text, code=200):
+    m = mock.MagicMock()
+    m.getcode.return_value = code
+    m.read.return_value = text.encode("utf8")
+    m.__enter__.return_value = m
+    return m
+
+
+def test_ec2_current_region():
+    tests = [
+        (None, None),
+        (_json(region="TT"), "TT"),
+        (_json(x=3), None),
+        ("not valid json", None),
+    ]
+
+    for rv, expect in tests:
+        with mock.patch("eo3.utils.aws._fetch_text", return_value=rv):
+            assert ec2_current_region() == expect
+
+
+@mock.patch("eo3.utils.aws.botocore_default_region", return_value=None)
+def test_auto_find_region(*mocks):
+    with mock.patch("eo3.utils.aws._fetch_text", return_value=None):
+        with pytest.raises(ValueError):
+            auto_find_region()
+
+    with mock.patch("eo3.utils.aws._fetch_text", return_value=_json(region="TT")):
+        assert auto_find_region() == "TT"
+
+
+@mock.patch("eo3.utils.aws.botocore_default_region", return_value="tt-from-botocore")
+def test_auto_find_region_2(*mocks):
+    assert auto_find_region() == "tt-from-botocore"
+
+
+def test_fetch_text():
+    with mock.patch("eo3.utils.aws.urlopen", return_value=mock_urlopen("", 505)):
+        assert _fetch_text("http://localhost:8817") is None
+
+    with mock.patch("eo3.utils.aws.urlopen", return_value=mock_urlopen("text", 200)):
+        assert _fetch_text("http://localhost:8817") == "text"
+
+    def fake_urlopen(*args, **kw):
+        raise OSError("Always broken")
+
+    with mock.patch("eo3.utils.aws.urlopen", fake_urlopen):
+        assert _fetch_text("http://localhost:8817") is None
+
+
+def test_s3_basics(without_aws_env):
+    from botocore.credentials import ReadOnlyCredentials
+    from numpy import s_
+
+    assert s3_url_parse("s3://bucket/key") == ("bucket", "key")
+    assert s3_url_parse("s3://bucket/key/") == ("bucket", "key/")
+    assert s3_url_parse("s3://bucket/k/k/key") == ("bucket", "k/k/key")
+
+    with pytest.raises(ValueError):
+        s3_url_parse("file://some/path")
+
+    assert s3_fmt_range((0, 3)) == "bytes=0-2"
+    assert s3_fmt_range(s_[4:10]) == "bytes=4-9"
+    assert s3_fmt_range(s_[:10]) == "bytes=0-9"
+    assert s3_fmt_range(None) is None
+
+    for bad in (s_[10:], s_[-2:3], s_[:-3], (-1, 3), (3, -1), s_[1:100:3]):
+        with pytest.raises(ValueError):
+            s3_fmt_range(bad)
+
+    creds = ReadOnlyCredentials("fake-key", "fake-secret", None)
+
+    assert (
+        str(s3_client(region_name="kk")._endpoint) == "s3(https://s3.kk.amazonaws.com)"
+    )
+    assert (
+        str(s3_client(region_name="kk", use_ssl=False)._endpoint)
+        == "s3(http://s3.kk.amazonaws.com)"
+    )
+
+    s3 = s3_client(region_name="us-west-2", creds=creds)
+    assert s3 is not None
+
+
+def test_s3_unsigned(monkeypatch, without_aws_env):
+    s3 = s3_client(aws_unsigned=True)
+    assert s3._request_signer.signature_version == botocore.UNSIGNED
+
+    monkeypatch.setenv("AWS_UNSIGNED", "yes")
+    s3 = s3_client()
+    assert s3._request_signer.signature_version == botocore.UNSIGNED
+
+
+@mock.patch("eo3.utils.aws.ec2_current_region", return_value="us-west-2")
+def test_s3_client_cache(monkeypatch, without_aws_env):
+    monkeypatch.setenv("AWS_ACCESS_KEY_ID", "fake-key-id")
+
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "fake-secret") + + s3 = s3_client(cache=True) + assert s3 is s3_client(cache=True) + assert s3 is s3_client(cache="purge") + assert s3_client(cache="purge") is None + assert s3 is not s3_client(cache=True) + + opts = ( + dict(), + dict(region_name="foo"), + dict(region_name="bar"), + dict(profile="foo"), + dict(profile="foo", region_name="xxx"), + dict(profile="bar"), + dict(creds=ReadOnlyCredentials("fake1", "...", None)), + dict(creds=ReadOnlyCredentials("fake1", "...", None), region_name="custom"), + dict(creds=ReadOnlyCredentials("fake2", "...", None)), + ) + + keys = {_s3_cache_key(**o) for o in opts} + assert len(keys) == len(opts) diff --git a/tests/test_utils_uris.py b/tests/test_utils_uris.py new file mode 100644 index 00000000..56b0e7ca --- /dev/null +++ b/tests/test_utils_uris.py @@ -0,0 +1,165 @@ +""" +Test utility uri functions +(tests copied from datacube-core/tests/test_utils_other.py) +""" +import os +from pathlib import Path + +import pytest + +from eo3.utils import ( + as_url, + get_part_from_uri, + is_url, + is_vsipath, + mk_part_uri, + normalise_path, + uri_resolve, + uri_to_local_path, +) +from eo3.utils.uris import default_base_dir + + +def test_uri_to_local_path(): + if os.name == "nt": + assert "C:\\tmp\\test.tmp" == str(uri_to_local_path("file:///C:/tmp/test.tmp")) + assert "\\\\remote\\path\\file.txt" == str( + uri_to_local_path("file://remote/path/file.txt") + ) + + else: + assert "/tmp/something.txt" == str( + uri_to_local_path("file:///tmp/something.txt") + ) + + with pytest.raises(ValueError): + uri_to_local_path("file://remote/path/file.txt") + + assert uri_to_local_path(None) is None + + with pytest.raises(ValueError): + uri_to_local_path("ftp://example.com/tmp/something.txt") + + +def test_part_uri(): + base = "file:///foo.txt" + + for i in range(10): + assert get_part_from_uri(mk_part_uri(base, i)) == i + + assert get_part_from_uri("file:///f.txt") is None + assert get_part_from_uri("file:///f.txt#something_else") is None + assert get_part_from_uri("file:///f.txt#part=aa") == "aa" + assert get_part_from_uri("file:///f.txt#part=111") == 111 + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ("/foo/bar/file.txt", False), + ("file:///foo/bar/file.txt", True), + ("test.bar", False), + ("s3://mybucket/objname.tiff", True), + ("gs://mybucket/objname.tiff", True), + ("wasb://mybucket/objname.tiff", True), + ("wasbs://mybucket/objname.tiff", True), + ("ftp://host.name/filename.txt", True), + ("https://host.name.com/path/file.txt", True), + ("http://host.name.com/path/file.txt", True), + ("sftp://user:pass@host.name.com/path/file.txt", True), + ("file+gzip://host.name.com/path/file.txt", True), + ("bongo:host.name.com/path/file.txt", False), + ], +) +def test_is_url(test_input, expected): + assert is_url(test_input) == expected + if expected: + assert as_url(test_input) is test_input + + +@pytest.mark.parametrize( + "base", + [ + "s3://foo", + "gs://foo", + "wasb://foo", + "wasbs://foo", + "/vsizip//vsicurl/https://host.tld/some/path", + ], +) +def test_uri_resolve(base): + abs_path = "/abs/path/to/something" + some_uri = "http://example.com/file.txt" + + assert uri_resolve(base, abs_path) == "file://" + abs_path + assert uri_resolve(base, some_uri) is some_uri + assert uri_resolve(base, None) is base + assert uri_resolve(base, "") is base + assert uri_resolve(base, "relative/path") == base + "/relative/path" + assert uri_resolve(base + "/", "relative/path") == base + "/relative/path" + assert ( + 
uri_resolve(base + "/some/dir/", "relative/path") + == base + "/some/dir/relative/path" + ) + + if not is_vsipath(base): + assert ( + uri_resolve(base + "/some/dir/file.txt", "relative/path") + == base + "/some/dir/relative/path" + ) + + +def test_normalise_path(): + cwd = Path(".").resolve() + assert normalise_path(".").resolve() == cwd + + p = Path("/a/b/c/d.txt") + assert normalise_path(p) == Path(p) + assert normalise_path(str(p)) == Path(p) + + base = Path("/a/b/") + p = Path("c/d.txt") + assert normalise_path(p, base) == (base / p) + assert normalise_path(str(p), str(base)) == (base / p) + assert normalise_path(p) == (cwd / p) + + with pytest.raises(ValueError): + normalise_path(p, "not/absolute/path") + + +def test_default_base_dir(monkeypatch): + def set_pwd(p): + if p is None: + monkeypatch.delenv("PWD") + else: + monkeypatch.setenv("PWD", str(p)) + + cwd = Path(".").resolve() + + # Default base dir (once resolved) will never be different from cwd + assert default_base_dir().resolve() == cwd + + # should work when PWD is not set + set_pwd(None) + assert "PWD" not in os.environ + assert default_base_dir() == cwd + + # should work when PWD is not absolute path + set_pwd("this/is/not/a/valid/path") + assert default_base_dir() == cwd + + # should be cwd when PWD points to some other dir + set_pwd(cwd / "deeper") + assert default_base_dir() == cwd + + set_pwd(cwd.parent) + assert default_base_dir() == cwd + + # PWD == cwd + set_pwd(cwd) + assert default_base_dir() == cwd + + # TODO: + # - create symlink to current directory in temp + # - set PWD to that link + # - make sure that returned path is the same as symlink and different from cwd diff --git a/tests/test_verify.py b/tests/test_verify.py deleted file mode 100644 index 59b05ff4..00000000 --- a/tests/test_verify.py +++ /dev/null @@ -1,88 +0,0 @@ -import hashlib -import unittest -from textwrap import dedent - -from eo3 import verify - -from tests import write_files - - -class VerifyTests(unittest.TestCase): - def test_checksum(self): # noqa: T003 - d = write_files({"test1.txt": "test"}) - - test_file = d.joinpath("test1.txt") - - sha1_hash = verify.calculate_file_hash(test_file) - assert sha1_hash == "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3" - - md5_hash = verify.calculate_file_hash(test_file, hash_fn=hashlib.md5) - assert md5_hash == "098f6bcd4621d373cade4e832627b4f6" - - crc32_checksum = verify.calculate_file_crc32(test_file) - assert crc32_checksum == "d87f7e0c" - - def test_package_checksum(self): - d = write_files( - { - "test1.txt": "test", - "package": {"test2.txt": "test2", "test3.txt": "test3"}, - } - ) - - c = verify.PackageChecksum() - - c.add_file(d.joinpath("test1.txt")) - c.add_file(d.joinpath("package", "test3.txt")) - c.add_file(d.joinpath("package", "test2.txt").absolute()) - - checksums_file = d.joinpath("package.sha1") - c.write(checksums_file) - - with checksums_file.open("r") as f: - doc = f.read() - - # One (hash, file) per line separated by a tab. - # - File paths must be relative to the checksum file. - # - Output in filename alphabetical order. - assert ( - dedent( - """\ - 109f4b3c50d7b0df729d299bc6f8e9ef9066971f\tpackage/test2.txt - 3ebfa301dc59196f18593c45e519287a23297589\tpackage/test3.txt - a94a8fe5ccb19ba61c4c0873d391e987982fbbd3\ttest1.txt - """ - ) - == doc - ) - - # After dumping to a file, read()'ing from the file should give us identical values. 
- c2 = verify.PackageChecksum() - c2.read(checksums_file) - original_items = set(c.items()) - loaded_items = set(c2.items()) - assert original_items == loaded_items - assert c == c2 - # ... and a sanity check of our equals method: - assert c != verify.PackageChecksum() - - # Verification should succeed: - verification_results = set(c2.iteratively_verify()) - expected_verification = { - (d.joinpath("test1.txt").absolute(), True), - (d.joinpath("package", "test3.txt").absolute(), True), - (d.joinpath("package", "test2.txt").absolute(), True), - } - assert expected_verification == verification_results - - # Corrupt a file, and expect it to fail verification. - with d.joinpath("package", "test3.txt").open("w") as f: - f.write("Deliberate corruption!") - - expected_verification = { - (d.joinpath("test1.txt").absolute(), True), - (d.joinpath("package", "test3.txt").absolute(), False), - (d.joinpath("package", "test2.txt").absolute(), True), - } - verification_results = set(c2.iteratively_verify()) - assert expected_verification == verification_results