diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9e20ea53..2e3de94c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -39,6 +39,7 @@ repos:
     - id: check-merge-conflict
     - id: check-symlinks # Symlinks that don't point to anything?
    - id: check-yaml # Check Yaml file syntax
+      args: [--allow-multiple-documents]
     - id: debug-statements # Avoid commiting debug/breakpoints
     - id: end-of-file-fixer # Normalise on exactly one newline
     - id: fix-byte-order-marker # No UTF-8 byte order marks
diff --git a/eo3/__init__.py b/eo3/__init__.py
index 209b3d08..c107b4a8 100644
--- a/eo3/__init__.py
+++ b/eo3/__init__.py
@@ -1,20 +1,15 @@
 from ._version import get_versions
-from .assemble import IncompleteDatasetError
-from .images import GridSpec, ValidDataMethod
-from .model import Eo3DatasetDocBase
-from .properties import Eo3DictBase
+from .fields import Range
+from .model import DatasetMetadata
 
-REPO_URL = "https://github.com/GeoscienceAustralia/eo-datasets.git"
+REPO_URL = "https://github.com/opendatacube/eo3.git"
 
 __version__ = get_versions()["version"]
 del get_versions
 
 __all__ = (
-    "Eo3DatasetDocBase",
-    "Eo3DictBase",
-    "GridSpec",
-    "IncompleteDatasetError",
+    "DatasetMetadata",
+    "Range",
     "REPO_URL",
-    "ValidDataMethod",
     "__version__",
 )
diff --git a/eo3/assemble.py b/eo3/assemble.py
deleted file mode 100644
index 90671b30..00000000
--- a/eo3/assemble.py
+++ /dev/null
@@ -1,146 +0,0 @@
-"""
-API for easily writing an ODC Dataset
-"""
-from pathlib import PosixPath
-from urllib.parse import urlsplit
-
-from eo3.uris import uri_resolve
-from eo3.validation_msg import ValidationMessage
-
-
-class AssemblyError(Exception):
-    pass
-
-
-class IncompleteDatasetError(Exception):
-    """
-    Raised when a dataset is missing essential things and so cannot be written.
-
-    (such as mandatory metadata)
-    """
-
-    def __init__(self, validation: ValidationMessage) -> None:
-        self.validation = validation
-
-
-class IncompleteDatasetWarning(UserWarning):
-    """A non-critical warning for invalid or incomplete metadata"""
-
-    def __init__(self, validation: ValidationMessage) -> None:
-        self.validation = validation
-
-    def __str__(self) -> str:
-        return str(self.validation)
-
-
-def _validate_property_name(name: str):
-    """
-    >>> _validate_property_name('eo:gsd')
-    >>> _validate_property_name('thumbnail:full_resolution')
-    >>> _validate_property_name('full resolution')
-    Traceback (most recent call last):
-    ...
-    ValueError: Not a valid property name 'full resolution' (must be alphanumeric with colons or underscores)
-    >>> _validate_property_name('Mr Sprinkles')
-    Traceback (most recent call last):
-    ...
-    ValueError: Not a valid property name 'Mr Sprinkles' (must be alphanumeric with colons or underscores)
-    """
-    if not name.replace(":", "").isidentifier():
-        raise ValueError(
-            f"Not a valid property name {name!r} "
-            "(must be alphanumeric with colons or underscores)"
-        )
-
-
-def _default_metadata_path(dataset_url: str):
-    """
-    The default metadata path for a given dataset location url.
- - By default, we put a sibling file with extension 'odc-metadata.yaml': - >>> _default_metadata_path('file:///tmp/ls7_nbar_20120403_c1/esri-scene.stac-item.json') - 'file:///tmp/ls7_nbar_20120403_c1/esri-scene.odc-metadata.yaml' - >>> _default_metadata_path('s3://deafrica-data/jaxa/alos_palsar_mosaic/2017/N05E040/N05E040_2017.tif') - 's3://deafrica-data/jaxa/alos_palsar_mosaic/2017/N05E040/N05E040_2017.odc-metadata.yaml' - >>> _default_metadata_path('file:///tmp/ls7_nbar_20120403_c1/my-dataset.tar.gz') - 'file:///tmp/ls7_nbar_20120403_c1/my-dataset.odc-metadata.yaml' - - Or, if a directory, we place one inside: - >>> _default_metadata_path('file:///tmp/ls7_nbar_20120403_c1/') - 'file:///tmp/ls7_nbar_20120403_c1/odc-metadata.yaml' - - If a tar/zip file, place it alongside. - >>> _default_metadata_path('tar:///g/data/v10/somewhere/my-dataset.tar!/') - 'file:///g/data/v10/somewhere/my-dataset.odc-metadata.yaml' - >>> _default_metadata_path('zip:///g/data/v10/landsat-dataset.zip!') - 'file:///g/data/v10/landsat-dataset.odc-metadata.yaml' - - Unless it's already a metadata path: - >>> _default_metadata_path('file:///tmp/ls7_nbar_20120403_c1/odc-metadata.yaml') - 'file:///tmp/ls7_nbar_20120403_c1/odc-metadata.yaml' - """ - # Already a metadata url? - if dataset_url.endswith("odc-metadata.yaml"): - return dataset_url - - # If a tar URL, convert to file before proceding. - u = urlsplit(dataset_url) - path = PosixPath(u.path) - if u.scheme in ("tar", "zip"): - dataset_url = f"file://{path.as_posix()}" - - # A directory, place a default name inside. - if dataset_url.endswith("/"): - return f"{dataset_url}odc-metadata.yaml" - - # Otherwise a sibling file to the dataset file. - base_url, file_name = dataset_url.rsplit("/", maxsplit=1) - file_stem = file_name.split(".")[0] - return uri_resolve(dataset_url, f"{base_url}/{file_stem}.odc-metadata.yaml") - - -def relative_url(base: str, offset: str, allow_absolute=False): - """ - >>> relative_url('file:///tmp/dataset/odc-metadata.yaml', 'file:///tmp/dataset/my-image.tif') - 'my-image.tif' - >>> relative_url('file:///tmp/dataset/odc-metadata.yaml', 'file:///tmp/dataset/images/my-image.tif') - 'images/my-image.tif' - >>> relative_url( - ... 'https://example.test/dataset/odc-metadata.yaml', - ... 'https://example.test/dataset/images/my-image.tif' - ... ) - 'images/my-image.tif' - >>> # Outside the base directory - >>> relative_url('https://example.test/dataset/odc-metadata.yaml', 'https://example.test/my-image.tif') - Traceback (most recent call last): - ... - ValueError: Absolute paths are not allowed, and file 'https://example.test/my-image.tif' is outside location \ -'https://example.test/dataset/odc-metadata.yaml' - >>> # Matching paths, different hosts. - >>> relative_url('https://example.test/odc-metadata.yaml', 'https://example2.test/my-image.tif') - Traceback (most recent call last): - ... 
- ValueError: Absolute paths are not allowed, and file 'https://example2.test/my-image.tif' is outside location \ -'https://example.test/odc-metadata.yaml' - """ - base_parts = urlsplit(base) - offset_parts = urlsplit(offset) - if not allow_absolute: - if (base_parts.hostname, base_parts.scheme) != ( - offset_parts.hostname, - offset_parts.scheme, - ): - raise ValueError( - f"Absolute paths are not allowed, and file {offset!r} is outside location {base!r}" - ) - - base_dir, _ = base_parts.path.rsplit("/", 1) - try: - return PosixPath(offset_parts.path).relative_to(base_dir).as_posix() - except ValueError: - if not allow_absolute: - raise ValueError( - f"Absolute paths are not allowed, and file {offset!r} is outside location {base!r}" - ) - # We can't make it relative, return the absolute. - return offset diff --git a/eo3/documents.py b/eo3/documents.py deleted file mode 100644 index c971c678..00000000 --- a/eo3/documents.py +++ /dev/null @@ -1,295 +0,0 @@ -""" -Common methods for UI code. -""" - -import gzip -import json -import os -import posixpath -from pathlib import Path, PurePath -from typing import Dict, Generator, Tuple -from urllib.parse import urlparse - -from boltons import iterutils - -from eo3 import serialise - -_DOCUMENT_EXTENSIONS = (".yaml", ".yml", ".json") -_COMPRESSION_EXTENSIONS = ("", ".gz") - -# Both compressed (*.gz) and uncompressed. -_ALL_SUPPORTED_EXTENSIONS = tuple( - doc_type + compression_type - for doc_type in _DOCUMENT_EXTENSIONS - for compression_type in _COMPRESSION_EXTENSIONS -) - -DEFAULT_SYSTEM_NAMES = ("odc-metadata", "agdc-md") - - -def is_supported_document_type(path): - """ - Does a document path look like a supported type? - :type path: pathlib.Path - :rtype: bool - >>> from pathlib import Path - >>> is_supported_document_type(Path('/tmp/something.yaml')) - True - >>> is_supported_document_type(Path('/tmp/something.YML')) - True - >>> is_supported_document_type(Path('/tmp/something.yaml.gz')) - True - >>> is_supported_document_type(Path('/tmp/something.tif')) - False - >>> is_supported_document_type(Path('/tmp/something.tif.gz')) - False - """ - return any( - [str(path).lower().endswith(suffix) for suffix in _ALL_SUPPORTED_EXTENSIONS] - ) - - -def find_metadata_path(dataset_path, system_names=None): - """ - Find a metadata path for a given input/dataset path. - - :type dataset_path: pathlib.Path - :rtype: Path - """ - - if system_names is None: - system_names = DEFAULT_SYSTEM_NAMES - # They may have given us a metadata file directly. - if dataset_path.is_file() and is_supported_document_type(dataset_path): - return dataset_path - - for system_name in system_names: - # Otherwise there may be a sibling file with appended suffix '.ga-md.yaml'. - expected_name = dataset_path.parent.joinpath( - f"{dataset_path.stem}.{system_name}" - ) - found = _find_any_metadata_suffix(expected_name) - if found: - return found - - if dataset_path.is_dir(): - # Eo3-style. - for m in dataset_path.glob("*.odc-metadata.*"): - return m - - for system_name in "agdc", "ga": - # Otherwise if it's a directory, there may be an 'ga-metadata.yaml' file describing all contained datasets. - expected_name = dataset_path.joinpath(system_name + "-metadata") - found = _find_any_metadata_suffix(expected_name) - if found: - return found - - return None - - -def _find_any_metadata_suffix(path): - """ - Find any metadata files that exist with the given file name/path. 
- (supported suffixes are tried on the name) - :type path: pathlib.Path - """ - existing_paths = list( - filter(is_supported_document_type, path.parent.glob(path.name + "*")) - ) - if not existing_paths: - return None - - if len(existing_paths) > 1: - raise ValueError(f"Multiple matched metadata files: {existing_paths!r}") - - return existing_paths[0] - - -def find_and_read_documents( - *paths: Path, system_names=None -) -> Generator[Tuple[Path, Dict], None, None]: - # TODO EODATASETS: default system_names no longer include 'ga-md' - # Scan all paths immediately so we can fail fast if some are wrong. - metadata_paths = [ - (path, find_metadata_path(path, system_names=system_names)) for path in paths - ] - - missing_paths = [path for (path, md) in metadata_paths if md is None] - if missing_paths: - raise ValueError( - f"No metadata found for input path{'s' if len(missing_paths) > 1 else ''}: " - f"{', '.join(map(str, missing_paths))}" - ) - - for input_path, metadata_path in metadata_paths: - yield from read_documents(metadata_path) - - -def read_documents(*paths: Path) -> Generator[Tuple[Path, Dict], None, None]: - """ - Read & parse documents from the filesystem (yaml or json). - - Note that a single yaml file can contain multiple documents. - """ - for path in paths: - suffix = path.suffix.lower() - - # If compressed, open as gzip stream. - opener = open - if suffix == ".gz": - suffix = path.suffixes[-2].lower() - opener = gzip.open - - with opener(str(path), "r") as f: - if suffix in (".yaml", ".yml"): - for parsed_doc in serialise.loads_yaml(f): - yield path, parsed_doc - elif suffix == ".json": - yield path, json.load(f) - else: - raise ValueError( - "Unknown document type for {}; expected one of {!r}.".format( - path.name, _ALL_SUPPORTED_EXTENSIONS - ) - ) - - -def docpath_set(doc, path, value): - """ - Set a value in a document using a path (sequence of keys). - - (It's designed to mirror `boltons.iterutils.get_path()` and related methods) - - >>> d = {'a': 1} - >>> docpath_set(d, ['a'], 2) - >>> d - {'a': 2} - >>> d = {'a':{'b':{'c': 1}}} - >>> docpath_set(d, ['a', 'b', 'c'], 2) - >>> d - {'a': {'b': {'c': 2}}} - >>> d = {} - >>> docpath_set(d, ['a'], 2) - >>> d - {'a': 2} - >>> d = {} - >>> docpath_set(d, ['a', 'b'], 2) - Traceback (most recent call last): - ... - KeyError: 'a' - >>> d - {} - >>> docpath_set(d, [], 2) - Traceback (most recent call last): - ... - ValueError: Cannot set a value to an empty path - """ - if not path: - raise ValueError("Cannot set a value to an empty path") - - d = doc - for part in path[:-1]: - d = d[part] - - d[path[-1]] = value - - -def make_paths_relative( - doc: Dict, base_directory: PurePath, allow_paths_outside_base=False -): - """ - Find all pathlib.Path values in a document structure and make them relative to the given path. - - >>> from copy import deepcopy - >>> base = PurePath('/tmp/basket') - >>> doc = {'id': 1, 'fruits': [{'apple': PurePath('/tmp/basket/fruits/apple.txt')}]} - >>> make_paths_relative(doc, base) - >>> doc - {'id': 1, 'fruits': [{'apple': 'fruits/apple.txt'}]} - >>> # No change if repeated. (relative paths still relative) - >>> previous = deepcopy(doc) - >>> make_paths_relative(doc, base) - >>> doc == previous - True - >>> # Relative pathlibs also become relative strings for consistency. 
- >>> doc = {'villains': PurePath('the-baron.txt')} - >>> make_paths_relative(doc, base) - >>> doc - {'villains': 'the-baron.txt'} - """ - for doc_path, value in iterutils.research( - doc, lambda p, k, v: isinstance(v, PurePath) - ): - value: PurePath - value = relative_path( - value, base_directory, allow_paths_outside_base=allow_paths_outside_base - ) - docpath_set(doc, doc_path, value.as_posix()) - - -def relative_url(value: str, base: str, allow_paths_outside_base=False) -> str: - """ - Make a single url relative to the base url if it is inside it. - - By default, will throw a ValueError if not able to make it relative to the path. - - - >>> relative_url('file:///g/data/v10/0/2015/blue.jpg', 'file:///g/data/v10/0/2015/odc-metadata.yaml') - 'blue.jpg' - >>> relative_url('https://example.test/2015/images/blue.jpg', 'https://example.test/2015/odc-metadata.yaml') - 'images/blue.jpg' - >>> relative_url('file:///g/data/v10/0/2018/blue.jpg', 'file:///g/data/v10/0/2015/odc-metadata.yaml') - Traceback (most recent call last): - ... - ValueError: Path 'file:///g/data/v10/0/2018/blue.jpg' is outside path 'file:///g/data/v10/0/2015/odc-metadata.yaml'\ - (allow_paths_outside_base=False) - """ - - if not value: - return value - - if not value.startswith(base) and not value.startswith(os.path.dirname(base)): - if not allow_paths_outside_base: - raise ValueError( - f"Path {value!r} is outside path {base!r} " - f"(allow_paths_outside_base={allow_paths_outside_base})" - ) - return value - - return _make_relurl(value, base) - - -def _make_relurl(target: str, base: str) -> str: - base = urlparse(base) - target = urlparse(target) - if base.netloc != target.netloc: - raise ValueError("target and base netlocs do not match") - base_dir = "." + posixpath.dirname(base.path) - target = "." + target.path - return posixpath.relpath(target, start=base_dir) - - -def relative_path( - value: PurePath, base_directory: PurePath, allow_paths_outside_base=False -) -> PurePath: - """ - Make a single path relative to the base directory if it is inside it. - - By default, will throw a ValueError if not able to make it relative to the path. 
- - >>> val = PurePath('/tmp/minimal-pkg/loch_ness_sightings_2019-07-04_blue.tif') - >>> base = PurePath('/tmp/minimal-pkg') - >>> relative_path(val, base).as_posix() - 'loch_ness_sightings_2019-07-04_blue.tif' - """ - if not value or not value.is_absolute(): - return value - - if base_directory not in value.parents: - if not allow_paths_outside_base: - raise ValueError( - f"Path {value.as_posix()!r} is outside path {base_directory.as_posix()!r} " - f"(allow_paths_outside_base={allow_paths_outside_base})" - ) - return value - return value.relative_to(base_directory) diff --git a/eo3/eo3_core.py b/eo3/eo3_core.py index 776fbf2a..efa849e4 100644 --- a/eo3/eo3_core.py +++ b/eo3/eo3_core.py @@ -1,6 +1,6 @@ """ Tools for working with EO3 metadata """ -# TODO CORE: copied from datacube.index.eo3 +import warnings from functools import reduce from typing import Any, Dict, Iterable, Optional, Tuple, Union from uuid import UUID @@ -16,10 +16,9 @@ polygon, ) -EO3_SCHEMA = "https://schemas.opendatacube.org/dataset" +from eo3.schema import ODC_DATASET_SCHEMA_URL -# This is should become eo3.models.GridDoc class EO3Grid: def __init__(self, grid: Dict[str, Any]) -> None: shape = grid.get("shape") @@ -28,6 +27,7 @@ def __init__(self, grid: Dict[str, Any]) -> None: if len(shape) != 2: raise ValueError("Grid shape must be two dimensional") self.shape: Tuple[int, int] = tuple(int(x) for x in shape) + xform = grid.get("transform") if xform is None: raise ValueError("Each grid must have a transform") @@ -35,11 +35,18 @@ def __init__(self, grid: Dict[str, Any]) -> None: raise ValueError("Grid transform must have 6 or 9 elements.") for elem in xform: if type(elem) not in (int, float): - raise ValueError("All grid transform elements must be numbers") + raise ValueError( + f"All grid transform elements must be numbers, got {type(elem)}" + ) if len(xform) == 9 and list(xform[6:]) != [0, 0, 1]: raise ValueError("Grid transform must be a valid Affine matrix") self.transform = Affine(*xform[:6]) + crs = grid.get("crs") + if crs is not None: + check_crs_epsg(crs) + self.crs = crs + def points(self, ring: bool = False) -> CoordList: ny, nx = (float(dim) for dim in self.shape) pts = [(0.0, 0.0), (nx, 0.0), (nx, ny), (0.0, ny)] @@ -52,6 +59,8 @@ def ref_points(self) -> Dict[str, Dict[str, float]]: return {n: dict(x=x, y=y) for n, (x, y) in zip(nn, self.points())} def polygon(self, crs: Optional[SomeCRS] = None) -> Geometry: + # use grid's own CRS if it was provided + crs = self.crs if self.crs is not None else crs return polygon(self.points(ring=True), crs=crs) @@ -66,7 +75,11 @@ def eo3_lonlat_bbox( return lonlat_bounds(valid_data, resolution=resolution) all_grids_extent = reduce( - lambda x, y: x.union(y), (grid.polygon(crs) for grid in grids) + lambda x, y: x.union(y), + ( + grid.polygon(grid.crs) if grid.crs is not None else grid.polygon(crs) + for grid in grids + ), ) return lonlat_bounds(all_grids_extent, resolution=resolution) @@ -119,6 +132,7 @@ def eo3_grid_spatial( crs = doc.get("crs", None) if crs is None or not gridspecs: raise ValueError("Input must have crs and grids.") + check_crs_epsg(crs) grids = {name: EO3Grid(grid_spec) for name, grid_spec in gridspecs.items()} grid = grids.get(grid_name) if not grid: @@ -154,7 +168,10 @@ def eo3_grid_spatial( def add_eo3_parts( doc: Dict[str, Any], resolution: Optional[float] = None ) -> Dict[str, Any]: - """Add spatial keys the DB requires to eo3 metadata""" + """Add spatial keys the DB required by eo3 metadata""" + # don't attempt to recalculate gs info if it already 
exists + if doc.get("grid_spatial"): + return doc return dict(**doc, **eo3_grid_spatial(doc, resolution=resolution)) @@ -174,7 +191,7 @@ def is_doc_eo3(doc: Dict[str, Any]) -> bool: if schema is None: return False - if schema == EO3_SCHEMA: + if schema == ODC_DATASET_SCHEMA_URL: return True # Otherwise it has an unknown schema. @@ -205,45 +222,57 @@ def is_doc_geo(doc: Dict[str, Any], check_eo3: bool = True) -> bool: def prep_eo3( - doc: Dict[str, Any], auto_skip: bool = False, resolution: Optional[float] = None + doc: Dict[str, Any], + resolution: Optional[float] = None, # can we remove this? + remap_lineage=True, ) -> Dict[str, Any]: """Modify spatial and lineage sections of eo3 metadata :param doc: input document - :param auto_skip: If true check if dataset is EO3 and if not - silently return input dataset without modifications + :param remap_lineage: If True (default) disambiguate lineage classifiers so that + source_id and classifier form a unique index (for indexes that DON'T + support external_lineage). + If False, leave lineage in the same format. """ if doc is None: return None - if auto_skip: - if not is_doc_eo3(doc): - return doc - def stringify(u: Optional[Union[str, UUID]]) -> Optional[str]: return u if isinstance(u, str) else str(u) if u else None doc["id"] = stringify(doc.get("id", None)) doc = add_eo3_parts(doc, resolution=resolution) - lineage = doc.pop("lineage", {}) - - def remap_lineage(name, uuids) -> Dict[str, Any]: - """Turn name, [uuid] -> {name: {id: uuid}}""" - if len(uuids) == 0: - return {} - if isinstance(uuids, dict) or isinstance(uuids[0], dict): - raise ValueError("Embedded lineage not supported for eo3 metadata types") - if len(uuids) == 1: - return {name: {"id": stringify(uuids[0])}} - - out = {} - for idx, uuid in enumerate(uuids, start=1): - out[name + str(idx)] = {"id": stringify(uuid)} - return out - - sources = {} - for name, uuids in lineage.items(): - sources.update(remap_lineage(name, uuids)) - - doc["lineage"] = dict(source_datasets=sources) + if remap_lineage: + lineage = doc.pop("lineage", {}) + + def lineage_remap(name, uuids) -> Dict[str, Any]: + """Turn name, [uuid] -> {name: {id: uuid}}""" + if len(uuids) == 0: + return {} + if isinstance(uuids, dict) or isinstance(uuids[0], dict): + raise ValueError( + "Embedded lineage not supported for eo3 metadata types" + ) + if len(uuids) == 1: + return {name: {"id": stringify(uuids[0])}} + + out = {} + for idx, uuid in enumerate(uuids, start=1): + out[name + str(idx)] = {"id": stringify(uuid)} + return out + + sources = {} + for name, uuids in lineage.items(): + sources.update(lineage_remap(name, uuids)) + + doc["lineage"] = dict(source_datasets=sources) return doc + + +def check_crs_epsg(crs): + """Check if CRS is WKT when it could be provided as EPSG (preferred)""" + crs = CRS(crs) + if crs.epsg is not None and not str(crs).startswith("EPSG"): + warnings.warn( + f"Prefer an EPSG code to a WKT when possible. (Can change CRS to 'epsg:{crs.epsg}')" + ) diff --git a/eo3/fields.py b/eo3/fields.py new file mode 100644 index 00000000..2b7a755c --- /dev/null +++ b/eo3/fields.py @@ -0,0 +1,245 @@ +# Core TODO: copied over from datacube.model.fields +"""Non-db specific implementation of metadata search fields. + +This allows extraction of fields of interest from dataset metadata document. 
+""" +import decimal +from collections import namedtuple +from typing import Any, Dict, List, Mapping + +import toolz # type: ignore[import] + +from eo3.utils import parse_time + +Range = namedtuple("Range", ("begin", "end")) + +# Allowed values for field 'type' (specified in a metadata type docuemnt) +_AVAILABLE_TYPE_NAMES = ( + "numeric-range", + "double-range", + "integer-range", + "datetime-range", + "string", + "numeric", + "double", + "integer", + "datetime", + "object", + # For backwards compatibility (alias for numeric-range) + "float-range", +) + +_TYPE_PARSERS = { + "string": str, + "double": float, + "integer": int, + "numeric": decimal.Decimal, + "datetime": parse_time, + "object": lambda x: x, +} + + +class Expression: + # No properties at the moment. These are built and returned by the + # DB driver (from Field methods), so they're mostly an opaque token. + + # A simple equals implementation for comparison in test code. + def __eq__(self, other) -> bool: + if self.__class__ != other.__class__: + return False + return self.__dict__ == other.__dict__ + + def evaluate(self, ctx): + raise NotImplementedError() + + +class SimpleEqualsExpression(Expression): + def __init__(self, field, value): + self.field = field + self.value = value + + def evaluate(self, ctx): + return self.field.extract(ctx) == self.value + + +class Field: + """ + A searchable field within a dataset/storage metadata document. + """ + + # type of field. + # If type is not specified, the field is a string + # This should always be one of _AVAILABLE_TYPE_NAMES + type_name = "string" + + def __init__(self, name: str, description: str): + self.name = name + + self.description = description + + # Does selecting this affect the output rows? + # (eg. Does this join other tables that aren't 1:1 with datasets.) + self.affects_row_selection = False + + if self.type_name not in _AVAILABLE_TYPE_NAMES: + raise ValueError(f"Invalid type name {self.type_name!r}") + + def __eq__(self, value) -> Expression: # type: ignore + """ + Is this field equal to a value? + + this returns an Expression object (hence type ignore above) + """ + raise NotImplementedError("equals expression") + + def between(self, low, high) -> Expression: + """ + Is this field in a range? 
+ """ + raise NotImplementedError("between expression") + + +class SimpleField(Field): + def __init__(self, offset, converter, type_name, name="", description=""): + self.offset = offset + self._converter = converter + self.type_name = type_name + super().__init__(name, description) + + def __eq__(self, value) -> Expression: # type: ignore[override] + return SimpleEqualsExpression(self, value) + + def extract(self, doc): + v = toolz.get_in(self.offset, doc, default=None) + if v is None: + return None + return self._converter(v) + + +class RangeField(Field): + def __init__( + self, min_offset, max_offset, base_converter, type_name, name="", description="" + ): + self.type_name = type_name + self._converter = base_converter + self.min_offset = min_offset + self.max_offset = max_offset + super().__init__(name, description) + + def extract(self, doc): + def extract_raw(paths): + vv = [toolz.get_in(p, doc, default=None) for p in paths] + return [self._converter(v) for v in vv if v is not None] + + v_min = extract_raw(self.min_offset) + v_max = extract_raw(self.max_offset) + + v_min = None if len(v_min) == 0 else min(v_min) + v_max = None if len(v_max) == 0 else max(v_max) + + if v_min is None and v_max is None: + return None + + return Range(v_min, v_max) + + +def parse_search_field(doc, name=""): + _type = doc.get("type", "string") + + if _type in _TYPE_PARSERS: + offset = doc.get("offset", None) + if offset is None: + raise ValueError("Missing offset") + + return SimpleField( + offset, + _TYPE_PARSERS[_type], + _type, + name=name, + description=doc.get("description", ""), + ) + + if not _type.endswith("-range"): + raise ValueError("Unsupported search field type: " + str(_type)) + + raw_type = _type.split("-")[0] + + if ( + raw_type == "float" + ): # float-range is supposed to be supported, but not just float? 
+ raw_type = "numeric" + _type = "numeric-range" + + if raw_type not in _TYPE_PARSERS: + raise ValueError("Unsupported search field type: " + str(_type)) + + min_offset = doc.get("min_offset", None) + max_offset = doc.get("max_offset", None) + + if min_offset is None or max_offset is None: + raise ValueError("Need to specify both min_offset and max_offset") + + return RangeField( + min_offset, + max_offset, + _TYPE_PARSERS[raw_type], + _type, + name=name, + description=doc.get("description", ""), + ) + + +def get_search_fields(metadata_definition: Mapping[str, Any]) -> Dict[str, Field]: + """Construct search fields dictionary not tied to any specific db implementation.""" + fields = toolz.get_in(["dataset", "search_fields"], metadata_definition, {}) + return {n: parse_search_field(doc, name=n) for n, doc in fields.items()} + + +def parse_offset_field(name="", offset=[]): + field_types = { + "id": "string", + "label": "string", + "format": "string", + "sources": "object", + "creation_dt": "datetime", + "grid_spatial": "object", + "measurements": "object", + } + + if name in field_types: + _type = field_types[name] + return SimpleField(offset, _TYPE_PARSERS[_type], _type, name=name) + + +def get_system_fields(metadata_definition: Mapping[str, Any]) -> Dict[str, Field]: + """Construct system fields dictionary not tied to any specific db implementation.""" + fields = metadata_definition.get("dataset") + return { + name: parse_offset_field(name, offset) + for name, offset in fields.items() + if name != "search_fields" + } + + +def get_all_fields(metadata_definition: Mapping[str, Any]) -> Dict[str, Field]: + """Construct dictionary of all fields""" + search_fields = { + name: field for name, field in get_search_fields(metadata_definition).items() + } + system_offsets = { + name: field for name, field in get_system_fields(metadata_definition).items() + } + return dict(**system_offsets, **search_fields) + + +def all_field_offsets(metadata_definition: Mapping[str, Any]) -> Dict[str, List[Any]]: + """Get a mapping of all field names -> offset""" + all_fields = get_all_fields(metadata_definition) + return { + name: ( + [field.offset] + if hasattr(field, "offset") + else field.min_offset + field.max_offset + ) + for name, field in all_fields.items() + } diff --git a/eo3/images.py b/eo3/images.py deleted file mode 100644 index 112b215a..00000000 --- a/eo3/images.py +++ /dev/null @@ -1,1330 +0,0 @@ -import math -import os -import string -import sys -import tempfile -from collections import defaultdict -from enum import Enum, auto -from pathlib import Path, PurePath -from typing import ( - Dict, - Generator, - Iterable, - List, - Mapping, - Optional, - Sequence, - Set, - Tuple, - Union, -) - -import attr -import numpy -import rasterio -import rasterio.features -import shapely -import shapely.affinity -import shapely.ops -import xarray -from affine import Affine -from rasterio import DatasetReader -from rasterio.coords import BoundingBox -from rasterio.crs import CRS -from rasterio.enums import Resampling -from rasterio.io import DatasetWriter, MemoryFile -from rasterio.shutil import copy as rio_copy -from rasterio.warp import calculate_default_transform, reproject -from scipy.ndimage import binary_fill_holes -from shapely.geometry import box -from shapely.geometry.base import CAP_STYLE, JOIN_STYLE, BaseGeometry - -from eo3.model import Eo3DatasetDocBase, GridDoc, MeasurementDoc -from eo3.properties import FileFormat - -DEFAULT_OVERVIEWS = (8, 16, 32) - -try: - import h5py -except ImportError: - h5py = 
None - - -class ValidDataMethod(Enum): - """ - How to calculate the valid data geometry for an image? - """ - - #: Vectorize the full valid pixel mask as-is. - #: - #: In some circumstances this can be very slow. - #: `filled` may be safer. - #: - thorough = auto() - - #: Fill holes in the valid pixel mask before vectorizing. - #: - #: (Potentially much faster than ``thorough`` if there's many small - #: nodata holes, as they will create many tiny polygons. - #: *slightly* slower if no holes exist.) - filled = auto() - - #: Take convex-hull of valid pixel mask before vectorizing. - #: - #: This is much slower than ``filled``, but will work in cases where - #: you have a lot of internal geometry that aren't holes. - #: Such as SLC-Off Landsat 7 data. - #: - #: Requires 'scikit-image' dependency. - convex_hull = auto() - - #: Use the image file bounds, ignoring actual pixel values. - bounds = auto() - - -@attr.s(auto_attribs=True, slots=True, hash=True, frozen=True) -class GridSpec: - """ - The grid spec defines the coordinates/transform and size of pixels of a - measurment. - - The easiest way to create one is use the ``GridSpec.from_*()`` class methods, such as - ``GridSpec.from_path(my_image_path)``. - - To create one manually: - - >>> from eo3 import GridSpec - >>> from affine import Affine - >>> from rasterio.crs import CRS - >>> g = GridSpec(shape=(7721, 7621), - ... transform=Affine(30.0, 0.0, 241485.0, 0.0, -30.0, -2281485.0), - ... crs=CRS.from_epsg(32656)) - >>> # Numbers copied from equivalent rio dataset.bounds call. - >>> g.bounds - BoundingBox(left=241485.0, bottom=-2513115.0, right=470115.0, top=-2281485.0) - >>> g.resolution_yx - (30.0, 30.0) - """ - - #: - shape: Tuple[int, int] - #: - transform: Affine - #: - crs: CRS = attr.ib( - metadata=dict(doc_exclude=True), default=None, hash=False, eq=False - ) - - @classmethod - def from_dataset_doc(cls, ds: Eo3DatasetDocBase, grid="default") -> "GridSpec": - """ - Create from an existing parsed metadata document - - :param grid: Grid name to read, if not the default. - """ - g = ds.grids[grid] - - if ds.crs.startswith("epsg:"): - crs = CRS.from_epsg(ds.crs[5:]) - else: - crs = CRS.from_wkt(ds.crs) - - return GridSpec(g.shape, g.transform, crs=crs) - - @classmethod - def from_rio(cls, dataset: rasterio.DatasetReader) -> "GridSpec": - """Create from an open rasterio dataset""" - return cls(shape=dataset.shape, transform=dataset.transform, crs=dataset.crs) - - @property - def resolution_yx(self): - return abs(self.transform[4]), abs(self.transform[0]) - - @classmethod - def from_odc_xarray(cls, dataset: xarray.Dataset) -> "GridSpec": - """Create from an ODC xarray""" - shape = {v.shape for v in dataset.data_vars.values()}.pop() - return cls( - shape=shape, - transform=dataset.geobox.transform, - crs=CRS.from_wkt(dataset.geobox.crs.crs_str), - ) - - @classmethod - def from_path(cls, path: str) -> "GridSpec": - """Create from the spec of a (rio-readable) filesystem path or url""" - with rasterio.open(path) as rio: - return GridSpec.from_rio(rio) - - @property - def bounds(self): - """ - Get bounding box. - """ - return BoundingBox( - *(self.transform * (0, self.shape[0])) - + (self.transform * (self.shape[1], 0)) - ) - - -def generate_tiles( - samples: int, lines: int, xtile: int = None, ytile: int = None -) -> Generator[Tuple[Tuple[int, int], Tuple[int, int]], None, None]: - """ - Generates a list of tile indices for a 2D array. - - :param samples: - An integer expressing the total number of samples in an array. 
- - :param lines: - An integer expressing the total number of lines in an array. - - :param xtile: - (Optional) The desired size of the tile in the x-direction. - Default is all samples - - :param ytile: - (Optional) The desired size of the tile in the y-direction. - Default is min(100, lines) lines. - - :return: - Each tuple in the generator contains - ((ystart,yend),(xstart,xend)). - - >>> import pprint - >>> tiles = generate_tiles(1624, 1567, xtile=1000, ytile=400) - >>> pprint.pprint(list(tiles)) - [((0, 400), (0, 1000)), - ((0, 400), (1000, 1624)), - ((400, 800), (0, 1000)), - ((400, 800), (1000, 1624)), - ((800, 1200), (0, 1000)), - ((800, 1200), (1000, 1624)), - ((1200, 1567), (0, 1000)), - ((1200, 1567), (1000, 1624))] - """ - - def create_tiles(samples, lines, xstart, ystart): - """ - Creates a generator object for the tiles. - """ - for ystep in ystart: - if ystep + ytile < lines: - yend = ystep + ytile - else: - yend = lines - for xstep in xstart: - if xstep + xtile < samples: - xend = xstep + xtile - else: - xend = samples - yield ((ystep, yend), (xstep, xend)) - - # check for default or out of bounds - if xtile is None or xtile < 0: - xtile = samples - if ytile is None or ytile < 0: - ytile = min(100, lines) - - xstart = numpy.arange(0, samples, xtile) - ystart = numpy.arange(0, lines, ytile) - - tiles = create_tiles(samples, lines, xstart, ystart) - - return tiles - - -def _common_suffix(names: Iterable[str]) -> str: - return os.path.commonprefix([s[::-1] for s in names])[::-1] - - -def _find_a_common_name( - group_of_names: Sequence[str], all_possible_names: Set[str] = None -) -> Optional[str]: - """ - If we have a list of band names, can we find a nice name for the group of them? - - (used when naming the grid for a set of bands) - - >>> _find_a_common_name(['nbar_blue', 'nbar_red']) - 'nbar' - >>> _find_a_common_name(['nbar_band08', 'nbart_band08']) - 'band08' - >>> _find_a_common_name(['nbar:band08', 'nbart:band08']) - 'band08' - >>> _find_a_common_name(['panchromatic']) - 'panchromatic' - >>> _find_a_common_name(['nbar_panchromatic']) - 'nbar_panchromatic' - >>> # It's ok to find nothing. - >>> _find_a_common_name(['nbar_blue', 'nbar_red', 'qa']) - >>> _find_a_common_name(['a', 'b']) - >>> # If a name is taken by non-group memebers, it shouldn't be chosen - >>> # (There's an 'nbar' prefix outside of the group, so shouldn't be found) - >>> all_names = {'nbar_blue', 'nbar_red', 'nbar_green', 'nbart_blue'} - >>> _find_a_common_name(['nbar_blue', 'nbar_red'], all_possible_names=all_names) - >>> _find_a_common_name(['nbar_blue', 'nbar_red', 'nbar_green'], all_possible_names=all_names) - 'nbar' - """ - options = [] - - non_group_names = (all_possible_names or set()).difference(group_of_names) - - # If all measurements have a common prefix (like 'nbar_') it makes a nice grid name. - prefix = os.path.commonprefix(group_of_names) - if not any(name.startswith(prefix) for name in non_group_names): - options.append(prefix) - - suffix = _common_suffix(group_of_names) - if not any(name.endswith(suffix) for name in non_group_names): - options.append(suffix) - - if not options: - return None - - options = [s.strip("_:") for s in options] - # Pick the longest candidate. 
- options.sort(key=len, reverse=True) - return options[0] or None - - -@attr.s(auto_attribs=True, slots=True) -class _MeasurementLocation: - path: Union[Path, str] - layer: str = None - - -_Measurements = Dict[str, _MeasurementLocation] - - -class MeasurementBundler: - """ - Incrementally record the information for a set of measurements/images to group into grids, - calculate geometry etc, suitable for metadata. - """ - - def __init__(self): - # The measurements grouped by their grid. - # (value is band_name->Path) - self._measurements_per_grid: Dict[GridSpec, _Measurements] = defaultdict(dict) - # Valid data mask per grid, in pixel coordinates. - self.mask_by_grid: Dict[GridSpec, numpy.ndarray] = {} - - def record_image( - self, - name: str, - grid: GridSpec, - path: Union[PurePath, str], - img: numpy.ndarray, - layer: Optional[str] = None, - nodata: Optional[Union[float, int]] = None, - expand_valid_data=True, - ): - for measurements in self._measurements_per_grid.values(): - if name in measurements: - raise ValueError( - f"Duplicate addition of band called {name!r}. " - f"Original at {measurements[name]} and now {path}" - ) - - self._measurements_per_grid[grid][name] = _MeasurementLocation(path, layer) - if expand_valid_data: - self._expand_valid_data_mask(grid, img, nodata) - - def _expand_valid_data_mask( - self, grid: GridSpec, img: numpy.ndarray, nodata: Union[float, int] - ): - if nodata is None: - nodata = float("nan") if numpy.issubdtype(img.dtype, numpy.floating) else 0 - - if math.isnan(nodata): - valid_values = numpy.isfinite(img) - else: - valid_values = img != nodata - - mask = self.mask_by_grid.get(grid) - if mask is None: - mask = valid_values - else: - mask |= valid_values - self.mask_by_grid[grid] = mask - - def _as_named_grids(self) -> Dict[str, Tuple[GridSpec, _Measurements]]: - """Get our grids with sensible (hopefully!), names.""" - - # Order grids from most to fewest measurements. - # PyCharm's typing seems to get confused by the sorted() call. - # noinspection PyTypeChecker - grids_by_frequency: List[Tuple[GridSpec, _Measurements]] = sorted( - self._measurements_per_grid.items(), key=lambda k: len(k[1]), reverse=True - ) - - # The largest group is the default. - default_grid = grids_by_frequency.pop(0) - - named_grids = {"default": default_grid} - - # No other grids? Nothing to do! - if not grids_by_frequency: - return named_grids - - # First try to name them via common prefixes, suffixes etc. - all_measurement_names = set(self.iter_names()) - for grid, measurements in grids_by_frequency: - if len(measurements) == 1: - grid_name = "_".join(measurements.keys()) - else: - grid_name = _find_a_common_name( - list(measurements.keys()), all_possible_names=all_measurement_names - ) - if not grid_name: - # Nothing useful found! - break - - if grid_name in named_grids: - # Clash of names! This strategy wont work. - break - - named_grids[grid_name] = (grid, measurements) - else: - # We finished without a clash. - return named_grids - - # Otherwise, try resolution names: - named_grids = {"default": default_grid} - for grid, measurements in grids_by_frequency: - res_y, res_x = grid.resolution_yx - if res_x > 1: - res_x = int(res_x) - grid_name = f"{res_x}" - if grid_name in named_grids: - # Clash of names! This strategy wont work. - break - - named_grids[grid_name] = (grid, measurements) - else: - # We finished without a clash. - return named_grids - - # No strategies worked! - # Enumerated, alphabetical letter names. Grid 'a', Grid 'b', etc... 
- grid_names = list(string.ascii_letters) - if len(grids_by_frequency) > len(grid_names): - raise NotImplementedError( - f"More than {len(grid_names)} grids that cannot be named!" - ) - return { - "default": default_grid, - **{ - grid_names[i]: (grid, measurements) - for i, (grid, measurements) in enumerate(grids_by_frequency) - }, - } - - def as_geo_docs(self) -> Tuple[CRS, Dict[str, GridDoc], Dict[str, MeasurementDoc]]: - """Calculate combined geo information for metadata docs""" - - if not self._measurements_per_grid: - return None, None, None - - grid_docs: Dict[str, GridDoc] = {} - measurement_docs: Dict[str, MeasurementDoc] = {} - crs = None - for grid_name, (grid, measurements) in self._as_named_grids().items(): - # Validate assumption: All grids should have same CRS - if crs is None: - crs = grid.crs - # CRS equality is tricky. This may not work. - # We're assuming a group of measurements specify their CRS - # the same way if they are the same. - elif grid.crs != crs: - raise ValueError( - f"Measurements have different CRSes in the same dataset:\n" - f"\t{crs.to_string()!r}\n" - f"\t{grid.crs.to_string()!r}\n" - ) - - grid_docs[grid_name] = GridDoc(grid.shape, grid.transform) - - for measurement_name, measurement_path in measurements.items(): - # No measurement groups in the doc: we replace with underscores. - measurement_name = measurement_name.replace(":", "_") - - measurement_docs[measurement_name] = MeasurementDoc( - path=measurement_path.path, - layer=measurement_path.layer, - grid=grid_name if grid_name != "default" else None, - ) - return crs, grid_docs, measurement_docs - - def consume_and_get_valid_data( - self, valid_data_method: ValidDataMethod = ValidDataMethod.thorough - ) -> BaseGeometry: - """ - Consume the stored grids and produce the valid data for them. - - (they are consumed in order to to minimise peak memory usage) - - :param valid_data_method: How to calculate the valid-data polygon? 
- """ - - geoms = [] - - while self.mask_by_grid: - grid, mask = self.mask_by_grid.popitem() - - if valid_data_method is ValidDataMethod.bounds: - geom = box(*grid.bounds) - elif valid_data_method is ValidDataMethod.filled: - mask = mask.astype("uint8") - binary_fill_holes(mask, output=mask) - geom = _grid_to_poly(grid, mask) - elif valid_data_method is ValidDataMethod.convex_hull: - # Requires optional dependency scikit-image - from skimage import morphology as morph - - geom = _grid_to_poly( - grid, morph.convex_hull_image(mask).astype("uint8") - ) - elif valid_data_method is ValidDataMethod.thorough: - geom = _grid_to_poly(grid, mask.astype("uint8")) - else: - raise NotImplementedError( - f"Unexpected valid data method: {valid_data_method}" - ) - geoms.append(geom) - return shapely.ops.unary_union(geoms) - - def iter_names(self) -> Generator[str, None, None]: - """All known measurement names""" - for grid, measurements in self._measurements_per_grid.items(): - for band_name, _ in measurements.items(): - yield band_name - - def iter_paths(self) -> Generator[Tuple[GridSpec, str, Path], None, None]: - """All current measurement paths on disk""" - for grid, measurements in self._measurements_per_grid.items(): - for band_name, meas_path in measurements.items(): - yield grid, band_name, meas_path.path - - -def _valid_shape(shape: BaseGeometry) -> BaseGeometry: - if shape.is_valid: - return shape - return shape.buffer(0) - - -def _grid_to_poly(grid: GridSpec, mask: numpy.ndarray) -> BaseGeometry: - shape = shapely.ops.unary_union( - [ - _valid_shape(shapely.geometry.shape(shape)) - for shape, val in rasterio.features.shapes(mask) - if val == 1 - ] - ) - shape_y, shape_x = mask.shape - del mask - # convex hull - geom = shape.convex_hull - # buffer by 1 pixel - geom = geom.buffer(1, cap_style=CAP_STYLE.square, join_style=JOIN_STYLE.bevel) - # simplify with 1 pixel radius - geom = geom.simplify(1) - # intersect with image bounding box - geom = geom.intersection(shapely.geometry.box(0, 0, shape_x, shape_y)) - # transform from pixel space into CRS space - geom = shapely.affinity.affine_transform( - geom, - ( - grid.transform.a, - grid.transform.b, - grid.transform.d, - grid.transform.e, - grid.transform.xoff, - grid.transform.yoff, - ), - ) - return geom - - -@attr.s(auto_attribs=True) -class WriteResult: - # path: Path - - # The value to put in 'odc:file_format' metadata field. - file_format: FileFormat - - # size_bytes: int - - -class FileWrite: - """ - Write COGs from arrays / files. - - This code is derived from the old eugl packaging code and can probably be improved. 
- """ - - PREDICTOR_DEFAULTS = { - "int8": 2, - "uint8": 2, - "int16": 2, - "uint16": 2, - "int32": 2, - "uint32": 2, - "int64": 2, - "uint64": 2, - "float32": 3, - "float64": 3, - } - - def __init__( - self, - gdal_options: Dict = None, - overview_blocksize: Optional[int] = None, - ) -> None: - super().__init__() - self.options = gdal_options or {} - self.overview_blocksize = overview_blocksize - - @classmethod - def from_existing( - cls, - shape: Tuple[int, int], - overviews: bool = True, - blocksize_yx: Optional[Tuple[int, int]] = None, - overview_blocksize: Optional[int] = None, - compress="deflate", - zlevel=4, - ) -> "FileWrite": - """Returns write_img options according to the source imagery provided - :param overviews: - (boolean) sets overview flags in gdal config options - :param blockxsize: - (int) override the derived base blockxsize in cogtif conversion - :param blockysize: - (int) override the derived base blockysize in cogtif conversion - - """ - options = {"compress": compress, "zlevel": zlevel} - - y_size, x_size = blocksize_yx or (512, 512) - # Do not set block sizes for small imagery - if shape[0] < y_size and shape[1] < x_size: - pass - else: - options["blockxsize"] = x_size - options["blockysize"] = y_size - options["tiled"] = "yes" - - if overviews: - options["copy_src_overviews"] = "yes" - - return FileWrite(options, overview_blocksize=overview_blocksize) - - def write_from_ndarray( - self, - array: numpy.ndarray, - out_filename: Path, - geobox: GridSpec = None, - nodata: int = None, - overview_resampling=Resampling.nearest, - overviews: Optional[Tuple[int, ...]] = DEFAULT_OVERVIEWS, - tags: Optional[Mapping[str, str]] = None, - ) -> WriteResult: - """ - Writes a 2D/3D image to disk using rasterio. - - :param array: - A 2D/3D NumPy array. - - :param out_filename: - A string containing the output file name. - - :param geobox: - An instance of a GriddedGeoBox object. - - :param nodata: - A value representing the no data value for the array. - - :param overview_resampling: - If levels is set, build overviews using a resampling method - from `rasterio.enums.Resampling` - Default is `Resampling.nearest`. - - :param tags: - File tags. - - :notes: - If array is an instance of a `h5py.Dataset`, then the output - file will include blocksizes based on the `h5py.Dataset's` - chunks. To override the blocksizes, specify them using the - `options` keyword. Eg {'blockxsize': 512, 'blockysize': 512}. - """ - if out_filename.exists(): - # Sanity check. Our measurements should have different names... - raise RuntimeError( - f"measurement output file already exists? {out_filename}" - ) - - if tags is None: - tags = {} - - dtype = array.dtype.name - - # Check for excluded datatypes - excluded_dtypes = ["int64", "int8", "uint64"] - if dtype in excluded_dtypes: - raise TypeError(f"Datatype not supported: {dtype}") - - # convert any bools to uin8 - if dtype == "bool": - array = numpy.uint8(array) - dtype = "uint8" - - ndims = array.ndim - shape = array.shape - - # Get the (z, y, x) dimensions (assuming BSQ interleave) - if ndims == 2: - samples = shape[1] - lines = shape[0] - bands = 1 - elif ndims == 3: - samples = shape[2] - lines = shape[1] - bands = shape[0] - else: - raise IndexError(f"Input array is not of 2 or 3 dimensions. 
Got {ndims}") - - transform = None - projection = None - if geobox is not None: - transform = geobox.transform - projection = geobox.crs - - rio_args = { - "count": bands, - "width": samples, - "height": lines, - "crs": projection, - "transform": transform, - "dtype": dtype, - "driver": "GTiff", - "predictor": self.PREDICTOR_DEFAULTS[dtype], - } - if nodata is not None: - rio_args["nodata"] = nodata - - if h5py is not None and isinstance(array, h5py.Dataset): - # TODO: if array is 3D get x & y chunks - if array.chunks[1] == array.shape[1]: - # GDAL doesn't like tiled or blocksize options to be set - # the same length as the columns (probably true for rows as well) - array = array[:] - else: - y_tile, x_tile = array.chunks - tiles = generate_tiles(samples, lines, x_tile, y_tile) - - if "tiled" in self.options: - rio_args["blockxsize"] = self.options.get("blockxsize", x_tile) - rio_args["blockysize"] = self.options.get("blockysize", y_tile) - - # the user can override any derived blocksizes by supplying `options` - # handle case where no options are provided - for key in self.options: - rio_args[key] = self.options[key] - - # Write to temp directory first so we can add levels afterwards with gdal. - with tempfile.TemporaryDirectory( - dir=out_filename.parent, prefix=".band_write" - ) as tmpdir: - unstructured_image = Path(tmpdir) / out_filename.name - """ - This is a wrapper around rasterio writing tiles to - enable writing to a temporary location before rearranging - the overviews within the file by gdal when required - """ - with rasterio.open(unstructured_image, "w", **rio_args) as outds: - if bands == 1: - if h5py is not None and isinstance(array, h5py.Dataset): - for tile in tiles: - idx = ( - slice(tile[0][0], tile[0][1]), - slice(tile[1][0], tile[1][1]), - ) - outds.write(array[idx], 1, window=tile) - else: - outds.write(array, 1) - else: - if h5py is not None and isinstance(array, h5py.Dataset): - for tile in tiles: - idx = ( - slice(tile[0][0], tile[0][1]), - slice(tile[1][0], tile[1][1]), - ) - subs = array[:, idx[0], idx[1]] - for i in range(bands): - outds.write(subs[i], i + 1, window=tile) - else: - for i in range(bands): - outds.write(array[i], i + 1) - if tags is not None: - outds.update_tags(**tags) - - # overviews/pyramids to disk - if overviews: - outds.build_overviews(overviews, overview_resampling) - - if overviews: - # Move the overviews to the start of the file, as required to be COG-compliant. - with rasterio.Env( - GDAL_TIFF_OVR_BLOCKSIZE=self.overview_blocksize or 512 - ): - rio_copy( - unstructured_image, - out_filename, - **{"copy_src_overviews": True, **rio_args}, - ) - else: - unstructured_image.rename(out_filename) - - return WriteResult(file_format=FileFormat.GeoTIFF) - - def create_thumbnail( - self, - rgb: Tuple[Path, Path, Path], - out: Path, - out_scale=10, - resampling=Resampling.average, - static_stretch: Tuple[int, int] = None, - percentile_stretch: Tuple[int, int] = (2, 98), - compress_quality: int = 85, - input_geobox: GridSpec = None, - ): - """ - Generate a thumbnail jpg image using the given three paths as red,green, blue. - - A linear stretch is performed on the colour. By default this is a dynamic 2% stretch - (the 2% and 98% percentile values of the input). The static_stretch parameter will - override this with a static range of values. - - If the input image has a valid no data value, the no data will - be set to 0 in the output image. - - Any non-contiguous data across the colour domain, will be set to - zero. 
- """ - # No aux.xml file with our jpeg. - with rasterio.Env(GDAL_PAM_ENABLED=False): - with tempfile.TemporaryDirectory( - dir=out.parent, prefix=".thumbgen-" - ) as tmpdir: - tmp_quicklook_path = Path(tmpdir) / "quicklook.tif" - - # We write an intensity-scaled, reprojected version of the dataset at full res. - # Then write a scaled JPEG verison. (TODO: can we do it in one step?) - ql_grid = _write_quicklook( - rgb, - tmp_quicklook_path, - resampling, - static_range=static_stretch, - percentile_range=percentile_stretch, - input_geobox=input_geobox, - ) - out_crs = ql_grid.crs - - # Scale and write as JPEG to the output. - ( - thumb_transform, - thumb_width, - thumb_height, - ) = calculate_default_transform( - out_crs, - out_crs, - ql_grid.shape[1], - ql_grid.shape[0], - *ql_grid.bounds, - dst_width=ql_grid.shape[1] // out_scale, - dst_height=ql_grid.shape[0] // out_scale, - ) - thumb_args = dict( - driver="JPEG", - quality=compress_quality, - height=thumb_height, - width=thumb_width, - count=3, - dtype="uint8", - nodata=0, - transform=thumb_transform, - crs=out_crs, - ) - with rasterio.open(tmp_quicklook_path, "r") as ql_ds: - ql_ds: DatasetReader - with rasterio.open(out, "w", **thumb_args) as thumb_ds: - thumb_ds: DatasetWriter - for index in thumb_ds.indexes: - thumb_ds.write( - ql_ds.read( - index, - out_shape=(thumb_height, thumb_width), - resampling=resampling, - ), - index, - ) - - def create_thumbnail_from_numpy( - self, - rgb: Tuple[numpy.array, numpy.array, numpy.array], - out_scale=10, - resampling=Resampling.average, - static_stretch: Tuple[int, int] = None, - percentile_stretch: Tuple[int, int] = (2, 98), - compress_quality: int = 85, - input_geobox: GridSpec = None, - nodata: int = -999, - ): - """ - Generate a thumbnail as numpy arrays. - - Unlike the default `create_thumbnail` function, this is done entirely in-memory. It will likely require more - memory but does not touch the filesystem. - - A linear stretch is performed on the colour. By default this is a dynamic 2% stretch - (the 2% and 98% percentile values of the input). The static_stretch parameter will - override this with a static range of values. - - Any non-contiguous data across the colour domain, will be set to zero. - """ - ql_grid, numpy_array_list, ql_write_args = _write_to_numpy_array( - rgb, - resampling, - static_range=static_stretch, - percentile_range=percentile_stretch, - input_geobox=input_geobox, - nodata=nodata, - ) - out_crs = ql_grid.crs - - # Scale and write as JPEG to the output. 
- ( - thumb_transform, - thumb_width, - thumb_height, - ) = calculate_default_transform( - out_crs, - out_crs, - ql_grid.shape[1], - ql_grid.shape[0], - *ql_grid.bounds, - dst_width=ql_grid.shape[1] // out_scale, - dst_height=ql_grid.shape[0] // out_scale, - ) - thumb_args = dict( - driver="JPEG", - quality=compress_quality, - height=thumb_height, - width=thumb_width, - count=3, - dtype="uint8", - nodata=0, - transform=thumb_transform, - crs=out_crs, - ) - - with MemoryFile() as mem_tif_file: - with mem_tif_file.open(**ql_write_args) as dataset: - for i, data in enumerate(numpy_array_list): - dataset.write(data, i + 1) - - with MemoryFile() as mem_jpg_file: - with mem_jpg_file.open(**thumb_args) as thumbnail: - for index in thumbnail.indexes: - thumbnail.write( # write the data from temp_tif to temp_jpg - dataset.read( - index, - out_shape=(thumb_height, thumb_width), - resampling=Resampling.average, - ), - index, - ) - - return_bytes = mem_jpg_file.read() - - return return_bytes - - def create_thumbnail_singleband( - self, - in_file: Path, - out_file: Path, - bit: int = None, - lookup_table: Dict[int, Tuple[int, int, int]] = None, - ): - """ - Write out a JPG thumbnail from a singleband image. - This takes in a path to a valid raster dataset and writes - out a file with only the values of the bit (integer) as white - """ - if bit is not None and lookup_table is not None: - raise ValueError( - "Please set either bit or lookup_table, and not both of them" - ) - if bit is None and lookup_table is None: - raise ValueError( - "Please set either bit or lookup_table, you haven't set either of them" - ) - - with rasterio.open(in_file) as dataset: - data = dataset.read() - out_data, stretch = self._filter_singleband_data(data, bit, lookup_table) - - meta = dataset.meta - meta["driver"] = "GTiff" - - with tempfile.TemporaryDirectory() as temp_dir: - if bit: - # Only use one file, three times - temp_file = Path(temp_dir) / "temp.tif" - - with rasterio.open(temp_file, "w", **meta) as tmpdataset: - tmpdataset.write(out_data) - self.create_thumbnail( - (temp_file, temp_file, temp_file), - out_file, - static_stretch=stretch, - ) - else: - # Use three different files - temp_files = tuple(Path(temp_dir) / f"temp_{i}.tif" for i in range(3)) - - for i in range(3): - with rasterio.open(temp_files[i], "w", **meta) as tmpdataset: - tmpdataset.write(out_data[i]) - self.create_thumbnail(temp_files, out_file, static_stretch=stretch) - - def create_thumbnail_singleband_from_numpy( - self, - input_data: numpy.array, - bit: int = None, - lookup_table: Dict[int, Tuple[int, int, int]] = None, - input_geobox: GridSpec = None, - nodata: int = -999, - ) -> bytes: - """ - Output a thumbnail ready bytes from the input numpy array. - This takes a valid raster data (numpy arrary) and return - out bytes with only the values of the bit (integer) as white. 
- """ - if bit is not None and lookup_table is not None: - raise ValueError( - "Please set either bit or lookup_table, and not both of them" - ) - if bit is None and lookup_table is None: - raise ValueError( - "Please set either bit or lookup_table, you haven't set either of them" - ) - - out_data, stretch = self._filter_singleband_data(input_data, bit, lookup_table) - - if bit: - rgb = [out_data, out_data, out_data] - else: - rgb = out_data - - return self.create_thumbnail_from_numpy( - rgb=rgb, - static_stretch=stretch, - input_geobox=input_geobox, - nodata=nodata, - ) - - def _filter_singleband_data( - self, - data: numpy.array, - bit: int = None, - lookup_table: Dict[int, Tuple[int, int, int]] = None, - ): - """ - Apply bit or lookup_table to filter the numpy array - and generate the thumbnail content. - """ - if bit is not None: - out_data = numpy.copy(data) - out_data[data != bit] = 0 - stretch = (0, bit) - if lookup_table is not None: - out_data = [ - numpy.full_like(data, 0), - numpy.full_like(data, 0), - numpy.full_like(data, 0), - ] - stretch = (0, 255) - - for value, rgb in lookup_table.items(): - for index in range(3): - out_data[index][data == value] = rgb[index] - return out_data, stretch - - -def _write_to_numpy_array( - rgb: Sequence[numpy.array], - resampling: Resampling, - static_range: Tuple[int, int], - percentile_range: Tuple[int, int] = (2, 98), - input_geobox: GridSpec = None, - nodata: int = -999, -) -> GridSpec: - """ - Write an intensity-scaled wgs84 image using the given files as bands. - """ - if input_geobox is None: - raise NotImplementedError("generating geobox from numpy is't yet supported") - - out_crs = CRS.from_epsg(4326) - ( - reprojected_transform, - reprojected_width, - reprojected_height, - ) = calculate_default_transform( - input_geobox.crs, - out_crs, - input_geobox.shape[1], - input_geobox.shape[0], - *input_geobox.bounds, - ) - reproj_grid = GridSpec( - (reprojected_height, reprojected_width), reprojected_transform, crs=out_crs - ) - ql_write_args = dict( - driver="GTiff", - dtype="uint8", - count=len(rgb), - width=reproj_grid.shape[1], - height=reproj_grid.shape[0], - transform=reproj_grid.transform, - crs=reproj_grid.crs, - nodata=0, - tiled="yes", - ) - - # Only set blocksize on larger imagery; enables reduced resolution processing - if reproj_grid.shape[0] > 512: - ql_write_args["blockysize"] = 512 - if reproj_grid.shape[1] > 512: - ql_write_args["blockxsize"] = 512 - - # Calculate combined nodata mask - valid_data_mask = numpy.ones(input_geobox.shape, dtype="bool") - calculated_range = read_valid_mask_and_value_range( - valid_data_mask, _iter_arrays(rgb, nodata=nodata), percentile_range - ) - - output_list = [] - - for band_no, (image, nodata) in enumerate( - _iter_arrays(rgb, nodata=nodata), start=1 - ): - reprojected_data = numpy.zeros(reproj_grid.shape, dtype=numpy.uint8) - reproject( - rescale_intensity( - image, - image_null_mask=~valid_data_mask, - in_range=(static_range or calculated_range), - out_range=(1, 255), - out_dtype=numpy.uint8, - ), - reprojected_data, - src_crs=input_geobox.crs, - src_transform=input_geobox.transform, - src_nodata=0, - dst_crs=reproj_grid.crs, - dst_nodata=0, - dst_transform=reproj_grid.transform, - resampling=resampling, - num_threads=2, - ) - output_list.append(reprojected_data) - del reprojected_data - - return reproj_grid, output_list, ql_write_args - - -def _write_quicklook( - rgb: Sequence[Path], - dest_path: Path, - resampling: Resampling, - static_range: Tuple[int, int], - percentile_range: 
Tuple[int, int] = (2, 98), - input_geobox: GridSpec = None, -) -> GridSpec: - """ - Write an intensity-scaled wgs84 image using the given files as bands. - """ - if input_geobox is None: - with rasterio.open(rgb[0]) as ds: - input_geobox = GridSpec.from_rio(ds) - - out_crs = CRS.from_epsg(4326) - ( - reprojected_transform, - reprojected_width, - reprojected_height, - ) = calculate_default_transform( - input_geobox.crs, - out_crs, - input_geobox.shape[1], - input_geobox.shape[0], - *input_geobox.bounds, - ) - reproj_grid = GridSpec( - (reprojected_height, reprojected_width), reprojected_transform, crs=out_crs - ) - ql_write_args = dict( - driver="GTiff", - dtype="uint8", - count=len(rgb), - width=reproj_grid.shape[1], - height=reproj_grid.shape[0], - transform=reproj_grid.transform, - crs=reproj_grid.crs, - nodata=0, - tiled="yes", - ) - - # Only set blocksize on larger imagery; enables reduced resolution processing - if reproj_grid.shape[0] > 512: - ql_write_args["blockysize"] = 512 - if reproj_grid.shape[1] > 512: - ql_write_args["blockxsize"] = 512 - - with rasterio.open(dest_path, "w", **ql_write_args) as ql_ds: - ql_ds: DatasetWriter - - # Calculate combined nodata mask - valid_data_mask = numpy.ones(input_geobox.shape, dtype="bool") - calculated_range = read_valid_mask_and_value_range( - valid_data_mask, _iter_images(rgb), percentile_range - ) - - for band_no, (image, nodata) in enumerate(_iter_images(rgb), start=1): - reprojected_data = numpy.zeros(reproj_grid.shape, dtype=numpy.uint8) - reproject( - rescale_intensity( - image, - image_null_mask=~valid_data_mask, - in_range=(static_range or calculated_range), - out_range=(1, 255), - out_dtype=numpy.uint8, - ), - reprojected_data, - src_crs=input_geobox.crs, - src_transform=input_geobox.transform, - src_nodata=0, - dst_crs=reproj_grid.crs, - dst_nodata=0, - dst_transform=reproj_grid.transform, - resampling=resampling, - num_threads=2, - ) - ql_ds.write(reprojected_data, band_no) - del reprojected_data - - return reproj_grid - - -LazyImages = Iterable[Tuple[numpy.ndarray, int]] - - -def _iter_images(rgb: Sequence[Path]) -> LazyImages: - """ - Lazily load a series of single-band images from a path. - - Yields the image array and nodata value. - """ - for path in rgb: - with rasterio.open(path) as ds: - ds: DatasetReader - if ds.count != 1: - raise NotImplementedError( - "multi-band measurement files aren't yet supported" - ) - yield ds.read(1), ds.nodata - - -def _iter_arrays(rgb: Sequence[numpy.array], nodata: int) -> LazyImages: - """ - Lazily load a series of single-band images from a path. - - Yields the image array and nodata value. - """ - for data in rgb: - yield data, nodata - - -def read_valid_mask_and_value_range( - valid_data_mask: numpy.ndarray, - images: LazyImages, - calculate_percentiles: Optional[Tuple[int, int]] = None, -) -> Optional[Tuple[int, int]]: - """ - Read the given images, filling in a valid data mask and optional pixel percentiles. - """ - calculated_range = (-sys.maxsize - 1, sys.maxsize) - for array, nodata in images: - valid_data_mask &= array != nodata - - if calculate_percentiles is not None: - the_data = array[valid_data_mask] - # Check if there's a non-empty array first - if the_data.any(): - # Numpy changed the 'interpolation' method, but we need to still support the - # older Python 3.6 module at NCI. 
- if numpy.__version__ < "1.22": - low, high = numpy.percentile( - the_data, calculate_percentiles, interpolation="nearest" - ) - else: - low, high = numpy.percentile( - the_data, calculate_percentiles, method="nearest" - ) - calculated_range = ( - max(low, calculated_range[0]), - min(high, calculated_range[1]), - ) - - return calculated_range - - -def rescale_intensity( - image: numpy.ndarray, - in_range: Tuple[int, int], - out_range: Optional[Tuple[int, int]] = None, - image_nodata: int = None, - image_null_mask: numpy.ndarray = None, - out_dtype=numpy.uint8, - out_nodata=0, -) -> numpy.ndarray: - """ - Based on scikit-image's rescale_intensity, but does fewer copies/allocations of the array. - - (and it saves us bringing in the entire dependency for one small method) - """ - if image_null_mask is None: - if image_nodata is None: - raise ValueError("Must specify either a null mask or a nodata val") - image_null_mask = image == image_nodata - - imin, imax = in_range - omin, omax = out_range or (numpy.iinfo(out_dtype).min, numpy.iinfo(out_dtype).max) - - # The intermediate calculation will need floats. - # We'll convert to it immediately to avoid modifying the input array - image = image.astype(numpy.float64) - - numpy.clip(image, imin, imax, out=image) - image -= imin - image /= float(imax - imin) - image *= omax - omin - image += omin - image = image.astype(out_dtype) - image[image_null_mask] = out_nodata - return image diff --git a/eo3/metadata/default-eo3-type.yaml b/eo3/metadata/default-eo3-type.yaml new file mode 100644 index 00000000..4e617fe7 --- /dev/null +++ b/eo3/metadata/default-eo3-type.yaml @@ -0,0 +1,104 @@ +--- +# Metadata Type +name: eo3 +description: Default EO3 with no custom fields +dataset: + id: + - id + label: + - label + format: + - properties + - odc:file_format + sources: + - lineage + - source_datasets + creation_dt: + - properties + - odc:processing_datetime + grid_spatial: + - grid_spatial + - projection + measurements: + - measurements + search_fields: + lat: + type: double-range + max_offset: + - - extent + - lat + - end + min_offset: + - - extent + - lat + - begin + description: Latitude range + lon: + type: double-range + max_offset: + - - extent + - lon + - end + min_offset: + - - extent + - lon + - begin + description: Longitude range + time: + type: datetime-range + max_offset: + - - properties + - dtr:end_datetime + - - properties + - datetime + min_offset: + - - properties + - dtr:start_datetime + - - properties + - datetime + description: Acquisition time range + crs_raw: + offset: + - crs + indexed: false + description: The raw CRS string as it appears in metadata + platform: + offset: + - properties + - eo:platform + indexed: false + description: Platform code + instrument: + offset: + - properties + - eo:instrument + indexed: false + description: Instrument name + cloud_cover: + type: double + offset: + - properties + - eo:cloud_cover + indexed: false + description: Cloud cover percentage [0, 100] + region_code: + offset: + - properties + - odc:region_code + description: "Spatial reference code from the provider. For Landsat region_code + is a scene path row:\n '{:03d}{:03d}.format(path,row)'.\nFor Sentinel + it is MGRS code. 
In general it is a unique string identifier that datasets + covering roughly the same spatial region share.\n" + product_family: + offset: + - properties + - odc:product_family + indexed: false + description: Product family code + dataset_maturity: + offset: + - properties + - dea:dataset_maturity + indexed: false + description: One of - final|interim|nrt (near real time) +... diff --git a/eo3/metadata/validate.py b/eo3/metadata/validate.py index 71360ec3..e2245726 100644 --- a/eo3/metadata/validate.py +++ b/eo3/metadata/validate.py @@ -2,7 +2,7 @@ from attr import define -from eo3 import serialise +from eo3 import schema from eo3.validation_msg import ValidationMessage, ValidationMessages @@ -158,7 +158,7 @@ def validate_metadata_type(doc: Dict) -> ValidationMessages: yield ValidationMessage.error("no_type_name", "Metadata type must have a name.") return # Validate it against ODC's schema (will be refused by ODC otherwise) - for error in serialise.METADATA_TYPE_SCHEMA.iter_errors(doc): + for error in schema.METADATA_TYPE_SCHEMA.iter_errors(doc): displayable_path = ".".join(map(str, error.absolute_path)) context = f"Error in {name}: ({displayable_path}) " if displayable_path else "" yield ValidationMessage.error("document_schema", f"{context}{error.message} ") diff --git a/eo3/model.py b/eo3/model.py index da951247..ac2c598e 100644 --- a/eo3/model.py +++ b/eo3/model.py @@ -1,22 +1,48 @@ +import warnings from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union -from uuid import UUID +from typing import Mapping, Optional -import affine import attr -from odc.geo import CoordList, Geometry, SomeCRS +import toolz +from odc.geo import CRS, Geometry from odc.geo.geom import polygon -from ruamel.yaml.comments import CommentedMap -from shapely.geometry.base import BaseGeometry +from pyproj.exceptions import CRSError +from ruamel.yaml.timestamp import TimeStamp as RuamelTimeStamp -from eo3.properties import Eo3DictBase, Eo3InterfaceBase +from eo3 import validate +from eo3.eo3_core import EO3Grid, prep_eo3 +from eo3.fields import Range, all_field_offsets, get_search_fields, get_system_fields +from eo3.metadata.validate import validate_metadata_type +from eo3.utils import default_utc, parse_time, read_file +from eo3.validation_msg import ContextualMessager, ValidationMessages DEA_URI_PREFIX = "https://collections.dea.ga.gov.au" -ODC_DATASET_SCHEMA_URL = "https://schemas.opendatacube.org/dataset" +DEFAULT_METADATA_TYPE = read_file( + Path(__file__).parent / "metadata" / "default-eo3-type.yaml" +) -# Either a local filesystem path or a string URI. -# (the URI can use any scheme supported by rasterio, such as tar:// or https:// or ...) -Location = Union[Path, str] + +def datetime_type(value): + # Ruamel's TimeZone class can become invalid from the .replace(utc) call. + # (I think it no longer matches the internal ._yaml fields.) + # Convert to a regular datetime. + if isinstance(value, RuamelTimeStamp): + value = value.isoformat() + else: + value = parse_time(value) + + # Store all dates with a timezone. + # yaml standard says all dates default to UTC. 
+    # (and ruamel normalises timezones to UTC itself)
+    return default_utc(value)
+
+
+BASE_NORMALISERS = {
+    "datetime": datetime_type,
+    "dtr:end_datetime": datetime_type,
+    "dtr:start_datetime": datetime_type,
+    "odc:processing_datetime": datetime_type,
+}
@@ -33,31 +59,6 @@ class ProductDoc:
     href: str = None
-@attr.s(auto_attribs=True, slots=True, hash=True)
-class GridDoc:
-    """The grid describing a measurement/band's pixels"""
-
-    shape: Tuple[int, int]
-    transform: affine.Affine
-    crs: Optional[str] = None
-
-    def points(self, ring: bool = False) -> CoordList:
-        ny, nx = (float(dim) for dim in self.shape)
-        pts = [(0.0, 0.0), (nx, 0.0), (nx, ny), (0.0, ny)]
-        if ring:
-            pts += pts[:1]
-        return [self.transform * pt for pt in pts]
-
-    def ref_points(self) -> Dict[str, Dict[str, float]]:
-        nn = ["ul", "ur", "lr", "ll"]
-        return {n: dict(x=x, y=y) for n, (x, y) in zip(nn, self.points())}
-
-    def polygon(self, crs: Optional[SomeCRS] = None) -> Geometry:
-        if not crs:
-            crs = self.crs
-        return polygon(self.points(ring=True), crs=crs)
-
-
 @attr.s(auto_attribs=True, slots=True)
 class MeasurementDoc:
     """
@@ -87,55 +88,298 @@ class AccessoryDoc:
     name: str = attr.ib(metadata=dict(doc_exclude=True), default=None)
-@attr.s(auto_attribs=True, slots=True)
-class Eo3DatasetDocBase(Eo3InterfaceBase):
+class DatasetMetadata:
     """
-    A minimally-validated EO3 dataset document
+    A representation of an EO3 dataset document that allows for easy metadata access and validation.
+
+    :param raw_dict: The document describing the dataset as a dictionary. Can also provide a path to the dictionary
+    file via the `from_path` class method.
+
+    :param mdt_definition: The metadata type definition dictionary. Dataset fields are accessed based on the offsets
+    defined in the metadata type definition. If no metadata type definition is provided, it will default to the simple
+    eo3 metadata type with no custom fields. It can be updated later using the `metadata_type` property.
-    Includes :class:`.Eo3InterfaceBase` methods for metadata access::
+    :param normalisers: A mapping of property normalisation functions, for any type or semantic normalisation that isn't
+    enforced by the dataset schema. By default it only normalises datetime strings to datetime.datetime objects
+    with a UTC timezone if no timezone is specified.
-        >>> p = Eo3DatasetDocBase()
-        >>> p.processed = '2018-04-03'
-        >>> p.properties['odc:processing_datetime']
-        datetime.datetime(2018, 4, 3, 0, 0, tzinfo=datetime.timezone.utc)
+    :param legacy_lineage: False if dataset uses external lineage.
+    DatasetMetadata also allows access to the raw document, the raw properties dictionary, and dataset properties
+    not defined within the metadata type, such as locations, geometry, grids, measurements, and accessories.
+
+    Validation against the schema and the metadata type definition is conducted by default, as is geometry validation
+    via the call to `prep_eo3`, which adds/modifies metadata sections required for an eo3 dataset.
     """
-    #: Dataset UUID
-    id: UUID = None
-    #: Human-readable identifier for the dataset
-    label: str = None
-    #: The product name (local) and/or url (global)
-    product: ProductDoc = None
-    #: Location(s) where this dataset is stored.
- #: - #: (ODC supports multiple locations when the same dataset is stored in multiple places) - #: - #: They are fully qualified URIs (``file://...`, ``https://...``, ``s3://...``) - #: - #: All other paths in the document (measurements, accessories) are relative to the - #: chosen location. - #: - #: If not supplied, the directory from which the metadata was read is treated as the root for the data. - locations: List[str] = None - - #: CRS string. Eg. ``epsg:3577`` - crs: str = None - #: Shapely geometry of the valid data coverage - #: - #: (it must contain all non-empty pixels of the image) - geometry: BaseGeometry = None - #: Grid specifications for measurements - grids: Dict[str, GridDoc] = None - #: Raw properties - properties: Eo3DictBase = attr.ib(factory=Eo3DictBase) - #: Loadable measurements of the dataset - measurements: Dict[str, MeasurementDoc] = None - #: References to accessory files - #: - #: Such as thumbnails, checksums, other kinds of metadata files. - #: - #: (any files included in the dataset that are not measurements) - accessories: Dict[str, AccessoryDoc] = attr.ib(factory=CommentedMap) - #: Links to source dataset uuids - lineage: Dict[str, List[UUID]] = attr.ib(factory=CommentedMap) + def __init__( + self, + raw_dict, + mdt_definition: Mapping = DEFAULT_METADATA_TYPE, + normalisers: Mapping = BASE_NORMALISERS, + legacy_lineage=True, + ): + try: + self.__dict__["_doc"] = prep_eo3(raw_dict, remap_lineage=legacy_lineage) + except CRSError: + raise validate.InvalidDatasetError( + f"invalid_crs: CRS {raw_dict.get('crs')} is not a valid CRS" + ) + except ValueError as e: + raise validate.InvalidDatasetError(f"incomplete_geometry: {e}") + + self.__dict__["_normalisers"] = normalisers + for key, val in self._doc["properties"].items(): + self._doc["properties"][key] = self.normalise(key, val) + + self.__dict__["_mdt_definition"] = mdt_definition + + # The user-configurable search fields for this dataset type. + self.__dict__["_search_fields"] = { + name: field for name, field in get_search_fields(mdt_definition).items() + } + # The field offsets that the datacube itself understands: id, format, sources etc. + # (See the metadata-type-schema.yaml or the comments in default-metadata-types.yaml) + self.__dict__["_system_offsets"] = { + name: field for name, field in get_system_fields(mdt_definition).items() + } + + self.__dict__["_all_offsets"] = all_field_offsets(mdt_definition) + + self.__dict__["_msg"] = ContextualMessager( + { + "type": mdt_definition.get("name"), + } + ) + + validate.handle_validation_messages(self.validate_base()) + + def __getattr__(self, name): + if name in self.fields.keys(): + return self.fields[name] + else: + raise AttributeError( + "Unknown field {!r}. Expected one of {!r}".format( + name, list(self.fields.keys()) + ) + ) + + def __setattr__(self, name, val): + offset = self._all_offsets.get(name) + if offset is None: + # check for a @property.setter first + if hasattr(self, name): + super().__setattr__(name, val) + return + raise AttributeError( + "Unknown field offset {!r}. 
Expected one of {!r}".format( + name, list(self._all_offsets.keys()) + ) + ) + + def _set_range_offset(name, val, offset, doc): + """Helper function for updating a field that expects a range""" + is_range = isinstance(val, Range) + # time can be a range or a single datetime + if name == "time": + if is_range: + doc = toolz.assoc_in( + doc, + ["properties", "dtr:start_datetime"], + self.normalise("dtr:start_datetime", val.begin), + ) + doc = toolz.assoc_in( + doc, + ["properties", "dtr:end_datetime"], + self.normalise("dtr:end_datetime", val.end), + ) + else: + doc = toolz.assoc_in( + doc, ["properties", "datetime"], self.normalise("datetime", val) + ) + # for all other range fields, value must be range + else: + if not is_range: + raise TypeError(f"The {name} field expects a Range value") + # this assumes that offsets are in min, max order + # and that there aren't multiple possible offsets for each + doc = toolz.assoc_in( + doc, offset[0], self.normalise(offset[0], val.begin) + ) + doc = toolz.assoc_in(doc, offset[1], self.normalise(offset[0], val.end)) + return doc + + # handle if there are multiple offsets + if len(offset) > 1: + self._doc = _set_range_offset(name, val, offset, self._doc) + # otherwise it's a simple field + else: + self._doc = toolz.assoc_in(self._doc, *offset, self.normalise(*offset, val)) + + def __dir__(self): + return list(self.fields) + + @property + def doc(self): + return self._doc + + @property + def search_fields(self): + return { + name: field.extract(self.doc) for name, field in self._search_fields.items() + } + + @property + def system_fields(self): + return { + name: field.extract(self.doc) + for name, field in self._system_offsets.items() + } + + @property + def fields(self): + return dict(**self.system_fields, **self.search_fields) + + @property + def properties(self): + return self.doc.get("properties") + + @property + def metadata_type(self): + return self._mdt_definition + + @metadata_type.setter + def metadata_type(self, val: Mapping): + validate.handle_validation_messages(validate_metadata_type(val)) + self._mdt_definition = val + self._search_fields = { + name: field for name, field in get_search_fields(val).items() + } + self._system_offsets = { + name: field for name, field in get_system_fields(val).items() + } + self._all_offsets = all_field_offsets(val) + self._msg.context["type"] = val.get("name") + + # Additional metadata not included in the metadata type + @property + def locations(self): + if self.doc.get("location"): + warnings.warn( + "`location` is deprecated and will be removed in a future release. Use `locations` instead." 
+ ) + return [self.doc.get("location")] + return self.doc.get("locations", None) + + @property + def product(self): + return ProductDoc(**self.doc.get("product")) + + @property + def geometry(self): + from shapely.geometry import shape + + return shape(self.doc.get("geometry")) + + @property + def grids(self): + return {key: EO3Grid(doc) for key, doc in self.doc.get("grids").items()} + + @property + def measurements(self): + return { + key: MeasurementDoc(**doc) + for key, doc in self.doc.get("measurements").items() + } + + @property + def accessories(self): + return { + key: AccessoryDoc(**doc) for key, doc in self.doc.get("accessories").items() + } + + @property + def crs(self) -> str: + # get doc crs as an actual CRS + return CRS(self._doc.get("crs")) + + # Core TODO: copied from datacube.model.Dataset + @property + def extent(self): + def xytuple(obj): + return obj["x"], obj["y"] + + projection = self.grid_spatial + valid_data = projection.get("valid_data") + geo_ref_points = projection.get("geo_ref_points") + if valid_data: + return Geometry(valid_data, crs=self.crs) + elif geo_ref_points: + return polygon( + [ + xytuple(geo_ref_points[key]) + for key in ("ll", "ul", "ur", "lr", "ll") + ], + crs=self.crs, + ) + + return None + + # Validation and other methods + def without_lineage(self): + return toolz.assoc(self._doc, "lineage", {}) + + def normalise(self, key, val): + """If property name is present in the normalisation mapping, apply the + normalisation function""" + # for easy dealing with offsets, such as when used in __setattr__ + if key[0] == "properties": + key = key[1] + normalise = self._normalisers.get(key, None) + if normalise: + return normalise(val) + return val + + def validate_to_product(self, product_definition: Mapping): + # Core TODO: replaces datacube.index.hl.check_dataset_consistent and check_consistent + self._msg.context["product"] = product_definition.get("name") + yield from validate.validate_ds_to_product( + self._doc, product_definition, self._msg + ) + + def validate_to_schema(self) -> ValidationMessages: + # don't error if properties 'extent' or 'grid_spatial' are present + doc = toolz.dissoc(self._doc, "extent", "grid_spatial") + yield from validate.validate_ds_to_schema(doc, self._msg) + + def validate_to_mdtype(self) -> ValidationMessages: + yield from validate.validate_ds_to_metadata_type( + self._doc, self._mdt_definition, self._msg + ) + + def validate_measurements(self) -> ValidationMessages: + """Check that measurement paths and grid references are valid""" + for name, measurement in self.measurements.items(): + grid_name = measurement.grid + if grid_name != "default" or self.grids: + if grid_name not in self.grids: + yield self._msg.error( + "invalid_grid_ref", + f"Measurement {name!r} refers to unknown grid {grid_name!r}", + ) + yield from validate.validate_measurement_path( + name, measurement.path, self._msg + ) + + def validate_base(self) -> ValidationMessages: + """Basic validations that can be done with information present at initialisation""" + yield from self.validate_to_schema() + yield from self.validate_to_mdtype() + # measurements are not mandatory + if self.measurements: + yield from self.validate_measurements() + + @classmethod + def from_path(cls, path): + # Create DatasetMetadata from filepath + return cls(read_file(path)) diff --git a/eo3/names.py b/eo3/names.py deleted file mode 100644 index 0febe048..00000000 --- a/eo3/names.py +++ /dev/null @@ -1,64 +0,0 @@ -from pathlib import Path -from urllib.parse import unquote, urlparse - 
-from eo3.model import Location -from eo3.uris import is_url, is_vsipath, normalise_path, register_scheme - -# Needed when packaging zip or tar files. -register_scheme("zip", "tar") - - -def _strip_major_version(version: str) -> str: - """ - >>> _strip_major_version('1.2.3') - '2.3' - >>> _strip_major_version('01.02.03') - '02.03' - >>> _strip_major_version('30.40') - '40' - >>> _strip_major_version('40') - '' - """ - return ".".join(version.split(".")[1:]) - - -class MissingRequiredFields(ValueError): - ... - - -def resolve_location(path: Location) -> str: - """ - Make sure a dataset location is a URL, suitable to be - the dataset_location in datacube indexing. - - Users may specify a pathlib.Path(), and we'll convert it as needed. - """ - if isinstance(path, str): - if not is_url(path) and not is_vsipath(path): - raise ValueError( - "A string location is expected to be a URL or VSI path. " - "Perhaps you want to give it as a local pathlib.Path()?" - ) - return path - - path = normalise_path(path) - if ".tar" in path.suffixes: - return f"tar:{path}!/" - elif ".zip" in path.suffixes: - return f"zip:{path}!/" - else: - uri = unquote(path.as_uri()) - # Base paths specified as directories must end in a slash, - # so they will be url joined as subfolders. (pathlib strips them) - if path.is_dir(): - return f"{uri}/" - return uri - - -def _as_path(url: str) -> Path: - """Try to convert the given URL to a local Path""" - parts = urlparse(url) - if not parts.scheme == "file": - raise ValueError(f"Expected a filesystem path, got a URL! {url!r}") - - return Path(parts.path) diff --git a/eo3/product/validate.py b/eo3/product/validate.py index aae97e61..4540ff99 100644 --- a/eo3/product/validate.py +++ b/eo3/product/validate.py @@ -6,8 +6,8 @@ from odc.geo import CRS from pyproj.exceptions import CRSError -from eo3 import serialise -from eo3.utils import _is_nan +from eo3 import schema +from eo3.utils.utils import _is_nan from eo3.validation_msg import ValidationMessage, ValidationMessages @@ -18,7 +18,7 @@ def validate_product(doc: Dict) -> ValidationMessages: # Validate it against ODC's product schema. 
has_doc_errors = False - for error in serialise.PRODUCT_SCHEMA.iter_errors(doc): + for error in schema.PRODUCT_SCHEMA.iter_errors(doc): has_doc_errors = True displayable_path = ".".join(map(str, error.absolute_path)) context = f"({displayable_path}) " if displayable_path else "" @@ -79,13 +79,13 @@ def validate_product(doc: Dict) -> ValidationMessages: def validate_product_metadata(template: Dict, name: str) -> ValidationMessages: for key, value in template.items(): if key == "product": - for prod_key, prod_val in template["product"].items(): + for prod_key, prod_val in value.items(): if prod_key == "name": - if template["product"]["name"] != name: + if prod_val != name: yield ValidationMessage.error( "product_name_mismatch", "If specified, metadata::product::name must match the product name " - f"(Expected {name}, got {template['product']['name']})", + f"(Expected {name}, got {prod_val})", ) else: yield ValidationMessage.warning( @@ -98,7 +98,7 @@ def validate_product_metadata(template: Dict, name: str) -> ValidationMessages: f"Only the name field is permitted in metadata::product::name ({prod_key})", ) elif key == "properties": - for prop_key, prop_val in template["properties"].items(): + for prop_key, prop_val in value.items(): if isinstance(prop_val, dict): yield ValidationMessage.error( "nested_metadata", @@ -386,7 +386,7 @@ def numpy_value_fits_dtype(value, dtype): if _is_nan(value): return np.issubdtype(dtype, np.floating) else: - return np.all(np.array([value], dtype=dtype) == [value]) + return np.all(np.array([value]).astype(dtype) == [value]) def _find_duplicates(values: Iterable[str]) -> Generator[str, None, None]: diff --git a/eo3/properties.py b/eo3/properties.py index cfda89a5..71cc93b1 100644 --- a/eo3/properties.py +++ b/eo3/properties.py @@ -1,6 +1,5 @@ import collections.abc import warnings -from abc import abstractmethod from collections import defaultdict from datetime import datetime from enum import Enum, EnumMeta @@ -9,8 +8,7 @@ import ciso8601 from ruamel.yaml.timestamp import TimeStamp as RuamelTimeStamp -from eo3.utils import _is_nan, default_utc -from eo3.validation_msg import ContextualMessager, ValidationMessage, ValidationMessages +from eo3.utils import default_utc class FileFormat(Enum): @@ -61,6 +59,15 @@ def datetime_type(value): return default_utc(value) +def degrees_type(value): + value = float(value) + + if not (-360.0 <= value <= 360.0): + raise ValueError("Expected degrees between -360,+360") + + return value + + def of_enum_type( vals: Union[EnumMeta, Tuple[str, ...]] = None, lower=False, upper=False, strict=True ) -> Callable[[str], str]: @@ -87,31 +94,37 @@ def normalise(v: str): return normalise -def percent_type(value): - value = float(value) - - if not (0.0 <= value <= 100.0): - raise ValueError("Expected percent between 0,100") +def producer_check(value): + if "." not in value: + warnings.warn( + "Property 'odc:producer' is expected to be a domain name, " + "eg 'usgs.gov' or 'ga.gov.au'" + ) return value -def degrees_type(value): - value = float(value) - - if not (-360.0 <= value <= 360.0): - raise ValueError("Expected degrees between -360,+360") - - return value +def normalise_platforms(value: Union[str, list, set]): + """ + >>> normalise_platforms('LANDSAT_8') + 'landsat-8' + >>> # Multiple can be comma-separated. They're normalised independently and sorted. + >>> normalise_platforms('LANDSAT_8,Landsat-5,landsat-7') + 'landsat-5,landsat-7,landsat-8' + >>> # Can be given as a list. 
+ >>> normalise_platforms(['sentinel-2b','SENTINEL-2a']) + 'sentinel-2a,sentinel-2b' + >>> # Deduplicated too + >>> normalise_platforms('landsat-5,landsat-5,LANDSAT-5') + 'landsat-5' + """ + if not isinstance(value, (list, set, tuple)): + value = value.split(",") + platforms = sorted({s.strip().lower().replace("_", "-") for s in value if s}) + if not platforms: + return None -def identifier_type(v: str): - v = v.replace("-", "_") - if not v.isidentifier() or not v.islower(): - warnings.warn( - f"{v!r} is expected to be an identifier " - "(alphanumeric with underscores, typically lowercase)" - ) - return v + return ",".join(platforms) # The primitive types allowed as stac values. @@ -142,8 +155,7 @@ class Eo3DictBase(collections.abc.MutableMapping): the input dictionary on creation, but you can disable this with `normalise_input=False`. """ - # Every property we know about. Subclasses should extend this mapping. - # TODO: Really need to add at least dataset maturity and region code + # Every property we know about. Subclasses should extend this mapping. KNOWN_PROPERTIES: Mapping[str, Optional[NormaliseValueFn]] = { "datetime": datetime_type, "dtr:end_datetime": datetime_type, @@ -151,8 +163,23 @@ class Eo3DictBase(collections.abc.MutableMapping): "odc:file_format": of_enum_type(FileFormat, strict=False), "odc:processing_datetime": datetime_type, "odc:product": None, + "dea:dataset_maturity": of_enum_type(("final", "interim", "nrt"), lower=True), + "odc:region_code": None, + "odc:producer": producer_check, + # Common STAC properties + "eo:gsd": None, + "eo:instrument": None, + "eo:platform": normalise_platforms, + "eo:constellation": None, + "eo:off_nadir": float, + "eo:azimuth": float, + "eo:sun_azimuth": degrees_type, + "eo:sun_elevation": degrees_type, } + # Required properties whose presence will be enforced. + REQUIRED_PROPERTIES = ["datetime", "odc:processing_datetime"] + def __init__(self, properties: Mapping = None, normalise_input=True) -> None: if properties is None: properties = {} @@ -213,7 +240,7 @@ def normalise_and_set(self, key, value, allow_override=True, expect_override=Fal :argument expect_override: We expect to overwrite a property, so don't produce a warning or error. """ if key not in self.KNOWN_PROPERTIES: - warnings.warn(f"Unknown Stac property {key!r}. ") + warnings.warn(f"Unknown Stac property {key!r}.") if value is not None: normalise = self.KNOWN_PROPERTIES.get(key) @@ -243,151 +270,17 @@ def normalise_and_set(self, key, value, allow_override=True, expect_override=Fal def nested(self): return nest_properties(self._props) - def validate_eo3_properties(self, msg: ContextualMessager) -> ValidationMessages: - for name, value in self.items(): - yield from self.validate_eo3_property(name, value, msg) - - # ODC requires this - if not self.get("odc:file_format"): - yield msg.error( - "global_file_format", - "Property 'odc:file_format' is empty", - hint="Usually 'GeoTIFF'", + def validate_properties(self): + # Enforce presence of properties identified as required + missing_required = [] + for prop in self.REQUIRED_PROPERTIES: + if self._props.get(prop) is None: + missing_required.append(prop) + if missing_required: + raise KeyError( + f"The following required properties are missing or None: {', '.join(missing_required)}" ) - def validate_eo3_property( - self, name, value, msg: ContextualMessager - ) -> ValidationMessages: - # Everything has already been through normalise_and_set above, so - # most of these errors are untriggerable? 
- if name in self.KNOWN_PROPERTIES: - normaliser = self.KNOWN_PROPERTIES.get(name) - if normaliser and value is not None: - try: - normalised_value = normaliser(value) - # A normaliser can return two values, the latter adding extra extracted fields. - if isinstance(normalised_value, tuple): - normalised_value = normalised_value[0] - - # It's okay for datetimes to be strings - # .. since ODC's own loader does that. - if isinstance(normalised_value, datetime) and isinstance( - value, str - ): - value = ciso8601.parse_datetime(value) - - # Special case for dates, as "no timezone" and "utc timezone" are treated identical. - if isinstance(value, datetime): - value = default_utc(value) - - if not isinstance(value, type(normalised_value)): - yield msg.warning( - "property_type", - f"Value {value} expected to be " - f"{type(normalised_value).__name__!r} (got {type(value).__name__!r})", - ) - elif normalised_value != value: - if _is_nan(normalised_value) and _is_nan(value): - # Both are NaNs, ignore. - pass - else: - yield ValidationMessage.warning( - "property_formatting", - f"Property {value!r} expected to be {normalised_value!r}", - ) - except ValueError as e: - yield msg.error("invalid_property", f"{name!r}: {e.args[0]}") - if name == "odc:producer": - # We use domain name to avoid arguing about naming conventions ('ga' vs 'geoscience-australia' vs ...) - if "." not in self["odc:producer"]: - yield msg.warning( - "producer_domain", - "Property 'odc:producer' should be the organisation's domain name. Eg. 'ga.gov.au'", - ) - class PropertyOverrideWarning(UserWarning): """A warning that a property was set twice with different values.""" - - -class Eo3InterfaceBase: - """ - These are convenience properties for common metadata fields. They are available - on DatasetAssemblers and within other naming APIs. - - (This is abstract. If you want one of these of your own, you probably want to create - an :class:`eo3.DatasetDoc`) - - """ - - @property - @abstractmethod - def properties(self) -> Eo3DictBase: - raise NotImplementedError - - @property - def product_name(self) -> Optional[str]: - """ - The ODC product name - """ - return self.properties.get("odc:product") - - @product_name.setter - def product_name(self, value: str): - self.properties["odc:product"] = value - - @property - def datetime_range(self) -> Tuple[datetime, datetime]: - """ - An optional date range for the dataset. - - The ``datetime`` is still mandatory when this is set. - - This field is a shorthand for reading/setting the datetime-range - stac 0.6 extension properties: ``dtr:start_datetime`` and ``dtr:end_datetime`` - """ - return ( - self.properties.get("dtr:start_datetime"), - self.properties.get("dtr:end_datetime"), - ) - - @datetime_range.setter - def datetime_range(self, val: Tuple[datetime, datetime]): - # TODO: string type conversion, better validation/errors - start, end = val - self.properties["dtr:start_datetime"] = start - self.properties["dtr:end_datetime"] = end - - @property - def processed(self) -> datetime: - """When the dataset was created (Defaults to UTC if not specified) - - Shorthand for the ``odc:processing_datetime`` field - """ - return self.properties.get("odc:processing_datetime") - - @processed.setter - def processed(self, value: Union[str, datetime]): - self.properties["odc:processing_datetime"] = value - - def processed_now(self): - """ - Shorthand for when the dataset was processed right now on the current system. 
- """ - self.properties["odc:processing_datetime"] = datetime.utcnow() - - # Note that giving a method the name 'datetime' will override the 'datetime' type - # for class-level declarations (ie, for any types on functions!) - # So we make an alias: - from datetime import datetime as datetime_ - - @property - def datetime(self) -> datetime_: - """ - The searchable date and time of the assets. (Default to UTC if not specified) - """ - return self.properties.get("datetime") - - @datetime.setter - def datetime(self, val: datetime_): - self.properties["datetime"] = val diff --git a/eo3/schema/__init__.py b/eo3/schema/__init__.py new file mode 100644 index 00000000..4ffe8114 --- /dev/null +++ b/eo3/schema/__init__.py @@ -0,0 +1,10 @@ +from .schema import DATASET_SCHEMA, METADATA_TYPE_SCHEMA, PRODUCT_SCHEMA + +ODC_DATASET_SCHEMA_URL = "https://schemas.opendatacube.org/dataset" + +__all__ = ( + "DATASET_SCHEMA", + "PRODUCT_SCHEMA", + "METADATA_TYPE_SCHEMA", + "ODC_DATASET_SCHEMA_URL", +) diff --git a/eo3/schema/dataset.schema.yaml b/eo3/schema/dataset.schema.yaml index 636028ff..a4895693 100644 --- a/eo3/schema/dataset.schema.yaml +++ b/eo3/schema/dataset.schema.yaml @@ -23,6 +23,7 @@ properties: # Should match name field in product schema. (alphanumeric plus underscore and hyphen) pattern: '^\w+$' href: + # Optional but recommended type: string format: url required: @@ -59,7 +60,7 @@ properties: minItems: 6 maxItems: 9 crs: - # Optional - defaults to dataset crs, described abobe. + # Optional - defaults to dataset crs, described above. # Either an epsg code ('epsg:1234') (preferred!) or a WKT string if no EPSG is possible. type: string required: @@ -81,6 +82,15 @@ properties: # TODO: "string" type is problematic as they're currently parsed directly into datetime objects... # type: string format: date-time + + dea:dataset_maturity: + type: string + nullable: true + enum: + - final + - interim + - nrt + - null required: - odc:processing_datetime - datetime diff --git a/eo3/schema/ingestor-config-type-schema.yaml b/eo3/schema/ingestor-config-type-schema.yaml deleted file mode 100644 index 0102c39e..00000000 --- a/eo3/schema/ingestor-config-type-schema.yaml +++ /dev/null @@ -1,172 +0,0 @@ -"$schema": "http://json-schema.org/draft-04/schema#" -# TODO CORE Copied from datacube/models/schema -description: Schema for ingestor configuration. 
-type: object -properties: - source_type: - type: string - output_type: - type: string - description: - type: string - location: - type: string - file_path_template: - type: string - global_attributes: - type: object - properties: - title: - type: string - summary: - type: string - source: - type: string - history: - type: string - institution: - type: string - instrument: - type: string - cdm_data_type: - type: string - keywords: - type: string - keywords_vocabulary: - type: string - platform: - type: string - product_version: - type: [number, string] - publisher_email: - type: string - publisher_name: - type: string - publisher_url: - type: string - product_suite: - type: string - project: - type: string - coverage_content_type: - type: string - references: - type: string - license: - type: string - naming_authority: - type: string - acknowkledgment: - type: string - ingestion_bounds: - type: object - properties: - left: - type: number - bottom: - type: number - right: - type: number - top: - type: number - storage: - "$ref": "#/definitions/storage" - measurements: - type: array - additionalProperties: true - items: - "$ref": "#/definitions/measurement" -required: - - output_type - - location - - file_path_template - - global_attributes - - storage - - measurements -additionalProperties: true - -definitions: - dtype: - enum: ["float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "complex64", "complex128"] - measurement: - type: object - properties: - name: - type: string - pattern: '^\w+$' - dtype: - "$ref": "#/definitions/dtype" - nodata: - oneOf: - - type: number - - enum: [NaN, Inf, -Inf] - resampling_method: - type: string - src_varname: - type: string - zlib: - type: boolean - units: - type: string - aliases: - type: array - items: - type: string - spectral_definition: - type: object - properties: - wavelength: - type: array - items: - type: number - response: - type: array - items: - type: number - flags_definition: - type: object - patternProperties: - ".*": - required: [bits, values] - properties: - bits: - type: [number, array] - values: - type: object - properties: - description: - type: string - attrs: - type: object - properties: - long_name: - type: string - alias: - type: string - required: - - name - - dtype - - nodata - - src_varname - additionalProperties: true - - storage: - type: object - properties: - chunking: - type: object - crs: - type: string - dimension_order: - type: array - resolution: - type: object - tile_size: - type: object - origin: - type: object - driver: - type: string - bucket: - type: string - additionalProperties: true diff --git a/eo3/schema/metadata-type-schema.yaml b/eo3/schema/metadata-type-schema.yaml index 3780638f..0f30d081 100644 --- a/eo3/schema/metadata-type-schema.yaml +++ b/eo3/schema/metadata-type-schema.yaml @@ -1,4 +1,4 @@ -"$schema": "http://json-schema.org/draft-04/schema#" +"$schema": "http://json-schema.org/draft-07/schema#" # TODO CORE Copied from datacube/models/schema description: Schema for metadata types. type: object diff --git a/eo3/schema/product-schema.yaml b/eo3/schema/product-schema.yaml index 147b3af7..8e0c45f7 100644 --- a/eo3/schema/product-schema.yaml +++ b/eo3/schema/product-schema.yaml @@ -1,4 +1,4 @@ -"$schema": "http://json-schema.org/draft-04/schema#" +"$schema": "http://json-schema.org/draft-07/schema#" # TODO CORE Copied from datacube/models/schema description: Schema for dataset types. 
type: object diff --git a/eo3/schema/schema.py b/eo3/schema/schema.py new file mode 100644 index 00000000..ff20d87b --- /dev/null +++ b/eo3/schema/schema.py @@ -0,0 +1,57 @@ +from pathlib import Path + +import jsonschema +import referencing + +from eo3.utils import read_file + + +def _is_json_array(checker, instance) -> bool: + """ + By default, jsonschema only allows a json array to be a Python list. + Let's allow it to be a tuple too. + """ + return isinstance(instance, (list, tuple)) + + +def _load_schema_validator(p: Path) -> jsonschema.Draft7Validator: + """ + Create a schema instance for the file. + + (Assumes they are trustworthy. Only local schemas!) + """ + if not p.is_file(): + raise ValueError(f"Can only load local schemas. Could not find file {str(p)}") + if p.suffix.lower() not in (".yaml", ".yml"): + raise ValueError(f"Unexpected file type {p.suffix}. Expected yaml") + schema = read_file(p) + + # Allow schemas to reference other schemas relatively + def doc_reference(path): + path = p.parent.joinpath(path) + if not path.exists(): + raise ValueError(f"Reference not found: {path}") + referenced_schema = read_file(path) + return referencing.Resource(referenced_schema, referencing.jsonschema.DRAFT7) + + if p.parent: + registry = referencing.Registry(retrieve=doc_reference) + else: + registry = referencing.Registry() + + jsonschema.Draft7Validator.check_schema(schema) + validator = jsonschema.validators.extend( + jsonschema.Draft7Validator, + type_checker=jsonschema.Draft7Validator.TYPE_CHECKER.redefine( + "array", _is_json_array + ), + ) + return validator(schema, registry=registry) + + +SCHEMAS_PATH = Path(__file__).parent +DATASET_SCHEMA = _load_schema_validator(SCHEMAS_PATH / "dataset.schema.yaml") +PRODUCT_SCHEMA = _load_schema_validator(SCHEMAS_PATH / "product-schema.yaml") +METADATA_TYPE_SCHEMA = _load_schema_validator( + SCHEMAS_PATH / "metadata-type-schema.yaml" +) diff --git a/eo3/scripts/tostac.py b/eo3/scripts/tostac.py index e4d641ea..8fcfc084 100644 --- a/eo3/scripts/tostac.py +++ b/eo3/scripts/tostac.py @@ -12,10 +12,16 @@ from click import echo, style import eo3.stac as eo3stac -from eo3 import serialise -from eo3.model import Eo3DatasetDocBase -from eo3.ui import PathPath -from eo3.utils import jsonify_document +from eo3.model import DatasetMetadata +from eo3.utils import jsonify_document, normalise_path + + +class PathPath(click.Path): + """ + A Click argument that returns a normalised (absolute) pathlib Path""" + + def convert(self, value, param, ctx): + return Path(normalise_path(super().convert(value, param, ctx))) @click.command(help=__doc__) @@ -40,7 +46,7 @@ def run( validate: bool, ): for input_metadata in odc_metadata_files: - dataset = serialise.from_path(input_metadata) + dataset = DatasetMetadata.from_path(input_metadata) name = input_metadata.stem.replace(".odc-metadata", "") output_path = input_metadata.with_name(f"{name}.stac-item.json") @@ -66,7 +72,7 @@ def run( def dc_to_stac( - dataset: Eo3DatasetDocBase, + dataset: DatasetMetadata, input_metadata: Path, output_path: Path, stac_base_url: str, diff --git a/eo3/serialise.py b/eo3/serialise.py index b1131dd6..f662a08d 100644 --- a/eo3/serialise.py +++ b/eo3/serialise.py @@ -1,30 +1,14 @@ -import uuid from datetime import datetime -from functools import partial from pathlib import Path, PurePath -from typing import IO, Dict, Iterable, Mapping, Tuple, Union +from typing import Mapping from uuid import UUID -import attr -import cattr -import ciso8601 -import click -import jsonschema import numpy 
-import shapely -import shapely.affinity -import shapely.ops -from affine import Affine from ruamel.yaml import YAML, Representer from ruamel.yaml.comments import CommentedMap, CommentedSeq -from shapely.geometry import shape -from shapely.geometry.base import BaseGeometry -from eo3.model import ODC_DATASET_SCHEMA_URL, Eo3DatasetDocBase, Eo3DictBase +from eo3.model import DatasetMetadata from eo3.properties import FileFormat -from eo3.utils import read_documents - -converter = cattr.Converter() def _format_representer(dumper, data: FileFormat): @@ -117,195 +101,9 @@ def dumps_yaml(stream, *docs: Mapping) -> None: return yml.dump_all(docs, stream=stream) -def load_yaml(p: Path) -> Dict: - with p.open() as f: - return _yaml().load(f) - - -def _yaml(): - return YAML(typ="safe") - - -def loads_yaml(stream: Union[str, IO]) -> Iterable[Dict]: - """Dump yaml through a stream, using the default deserialisation settings.""" - return _yaml().load_all(stream) - - -def from_path(path: Path, skip_validation=False) -> Eo3DatasetDocBase: - """ - Parse an EO3 document from a filesystem path - - :param path: Filesystem path - :param skip_validation: Optionally disable validation (it's faster, but I hope your - doc is structured correctly) - """ - if path.suffix.lower() not in (".yaml", ".yml"): - raise ValueError(f"Unexpected file type {path.suffix}. Expected yaml") - - return from_doc(load_yaml(path), skip_validation=skip_validation) - - -class InvalidDataset(Exception): - def __init__(self, path: Path, error_code: str, reason: str) -> None: - self.path = path - self.error_code = error_code - self.reason = reason - - -def _is_json_array(checker, instance) -> bool: - """ - By default, jsonschema only allows a json array to be a Python list. - Let's allow it to be a tuple too. - """ - return isinstance(instance, (list, tuple)) - - -def _load_schema_validator(p: Path) -> jsonschema.Draft6Validator: - """ - Create a schema instance for the file. - - (Assumes they are trustworthy. Only local schemas!) - """ - with p.open() as f: - schema = _yaml().load(f) - validator = jsonschema.validators.validator_for(schema) - validator.check_schema(schema) - - # Allow schemas to reference other schemas relatively - def doc_reference(path): - path = p.parent.joinpath(path) - if not path.exists(): - raise ValueError(f"Reference not found: {path}") - referenced_schema = next(iter(read_documents(path)))[1] - return referenced_schema - - ref_resolver = jsonschema.RefResolver.from_schema( - schema, handlers={"": doc_reference} - ) - custom_validator = jsonschema.validators.extend( - validator, type_checker=validator.TYPE_CHECKER.redefine("array", _is_json_array) - ) - return custom_validator(schema, resolver=ref_resolver) - - -SCHEMAS_PATH = Path(__file__).parent / "schema" -DATASET_SCHEMA = _load_schema_validator(SCHEMAS_PATH / "dataset.schema.yaml") -PRODUCT_SCHEMA = _load_schema_validator(SCHEMAS_PATH / "product-schema.yaml") -METADATA_TYPE_SCHEMA = _load_schema_validator( - SCHEMAS_PATH / "metadata-type-schema.yaml" -) - - -def from_doc(doc: Dict, skip_validation=False) -> Eo3DatasetDocBase: - """ - Parse a dictionary into an EO3 dataset. - - By default it will validate it against the schema, which will result in far more - useful error messages if fields are missing. 
- - :param doc: A dictionary, such as is returned from yaml.load or json.load - :param skip_validation: Optionally disable validation (it's faster, but I hope your - doc is structured correctly) - """ - doc = doc.copy() - if not skip_validation: - # don't error if properties 'extent' or 'grid_spatial' are present - if doc.get("extent"): - del doc["extent"] - if doc.get("grid_spatial"): - del doc["grid_spatial"] - DATASET_SCHEMA.validate(doc) - - location = doc.pop("location", None) - if location: - doc["locations"] = [location] - - return converter.structure(doc, Eo3DatasetDocBase) - - -def _structure_as_uuid(d, t): - return uuid.UUID(str(d)) - - -def _structure_as_stac_props(d, t, normalise_properties=False): - """ - :param normalise_properties: - We don't normalise properties by default as we usually want it to reflect the original file. - - """ - return Eo3DictBase( - # The passed-in dictionary is stored internally, so we want to make a copy of it - # so that our serialised output is fully separate from the input. - dict(d), - normalise_input=normalise_properties, - ) - - -def _structure_as_affine(d: Tuple, t): - if len(d) not in [6, 9]: - raise ValueError(f"Expected 6 or 9 coefficients in transform. Got {d!r}") - - if len(d) == 9: - if tuple(d[-3:]) != (0.0, 0.0, 1.0): - raise ValueError( - f"Nine-element affine should always end in [0, 0, 1]. Got {d!r}" - ) - d = [*d[:-3]] - - return Affine(*d) - - -def _unstructure_as_stac_props(v: Eo3DictBase): - return v._props - - -def _structure_as_shape(d, t): - return shape(d) - - -converter.register_structure_hook(uuid.UUID, _structure_as_uuid) -converter.register_structure_hook(BaseGeometry, _structure_as_shape) -converter.register_structure_hook( - Eo3DictBase, - partial(_structure_as_stac_props, normalise_properties=False), -) -converter.register_structure_hook(Affine, _structure_as_affine) -converter.register_unstructure_hook(Eo3DictBase, _unstructure_as_stac_props) - - -def to_doc(d: Eo3DatasetDocBase) -> Dict: - """ - Serialise a DatasetDoc to a dict - - If you plan to write this out as a yaml file on disk, you're - better off with one of our formatted writers: :func:`.to_stream`, :func:`.to_path`. - """ - doc = attr.asdict( - d, - recurse=True, - dict_factory=dict, - # Exclude fields that are the default. - filter=lambda attr, value: "doc_exclude" not in attr.metadata - and value != attr.default - # Exclude any fields set to None. The distinction should never matter in our docs. - and value is not None, - retain_collection_types=False, - ) - doc["$schema"] = ODC_DATASET_SCHEMA_URL - if d.geometry is not None: - doc["geometry"] = shapely.geometry.mapping(d.geometry) - doc["id"] = str(d.id) - doc["properties"] = dict(d.properties) - - if len(doc.get("locations", [])) == 1: - doc["location"] = doc.pop("locations")[0] - - return doc - - -def to_formatted_doc(d: Eo3DatasetDocBase) -> CommentedMap: - """Serialise a DatasetDoc to a yaml-serialisation-ready dict""" - doc = prepare_formatting(to_doc(d)) +def to_formatted_doc(d: DatasetMetadata) -> CommentedMap: + """Serialise to a yaml-serialisation-ready dict""" + doc = prepare_formatting(d.doc) # Add user-readable names for measurements as a comment if present. 
if d.measurements: for band_name, band_doc in d.measurements.items(): @@ -315,7 +113,7 @@ def to_formatted_doc(d: Eo3DatasetDocBase) -> CommentedMap: return doc -def to_path(path: Path, *ds: Eo3DatasetDocBase): +def to_path(path: Path, *ds: DatasetMetadata): """ Output dataset(s) as a formatted YAML to a local path @@ -324,7 +122,7 @@ def to_path(path: Path, *ds: Eo3DatasetDocBase): dump_yaml(path, *(to_formatted_doc(d) for d in ds)) -def to_stream(stream, *ds: Eo3DatasetDocBase): +def to_stream(stream, *ds: DatasetMetadata): """ Output dataset(s) as a formatted YAML to an output stream @@ -436,30 +234,3 @@ def _add_space_before(d: CommentedMap, *keys): """Add an empty line to the document before a section (key)""" for key in keys: d.yaml_set_comment_before_after_key(key, before="\n") - - -class ClickDatetime(click.ParamType): - """ - Take a datetime parameter, supporting any ISO8601 date/time/timezone combination. - """ - - name = "date" - - def convert(self, value, param, ctx): - if value is None: - return value - - if isinstance(value, datetime): - return value - - try: - return ciso8601.parse_datetime(value) - except ValueError: - self.fail( - ( - "Invalid date string {!r}. Expected any ISO date/time format " - '(eg. "2017-04-03" or "2014-05-14 12:34")'.format(value) - ), - param, - ctx, - ) diff --git a/eo3/stac.py b/eo3/stac.py index b99f2792..ef248120 100644 --- a/eo3/stac.py +++ b/eo3/stac.py @@ -18,8 +18,9 @@ from pystac.extensions.view import ViewExtension from pystac.utils import datetime_to_str -from eo3.model import Eo3DatasetDocBase, GridDoc -from eo3.uris import uri_resolve +from eo3.eo3_core import EO3Grid +from eo3.model import DatasetMetadata +from eo3.utils import uri_resolve # Mapping between EO3 field names and STAC properties object field names MAPPING_EO3_TO_STAC = { @@ -103,7 +104,7 @@ def _asset_title_fields(asset_name: str) -> Optional[str]: return None -def _proj_fields(grid: Dict[str, GridDoc], grid_name: str = "default") -> Dict: +def _proj_fields(grid: Dict[str, EO3Grid], grid_name: str = "default") -> Dict: """ Get any proj (Stac projection extension) fields if we have them for the grid. """ @@ -126,7 +127,9 @@ def _lineage_fields(lineage: Dict) -> Dict: """ if lineage: lineage_dict = { - key: [str(uuid) for uuid in value] for key, value in lineage.items() + # there will only ever be one lineage id per level + key: [str(value["id"])] + for key, value in lineage.items() } return {"odc:lineage": lineage_dict} @@ -136,7 +139,7 @@ def _lineage_fields(lineage: Dict) -> Dict: def _odc_links( explorer_base_url: str, - dataset: Eo3DatasetDocBase, + dataset: DatasetMetadata, collection_url: Optional[str], ) -> List[Link]: """ @@ -173,24 +176,17 @@ def _odc_links( warnings.warn("No collection provided for Stac Item.") -def _get_projection(dataset: Eo3DatasetDocBase) -> Tuple[Optional[int], Optional[str]]: +def _get_projection(dataset: DatasetMetadata) -> Tuple[Optional[int], Optional[str]]: if dataset.crs is None: return None, None - crs_l = dataset.crs.lower() - epsg = None - wkt = None - if crs_l.startswith("epsg:"): - epsg = int(crs_l.lstrip("epsg:")) - else: - wkt = dataset.crs + epsg = dataset.crs.epsg + wkt = None if epsg is not None else dataset.crs.wkt return epsg, wkt -def eo3_to_stac_properties( - dataset: Eo3DatasetDocBase, crs: Optional[str] = None, title: str = None -) -> Dict: +def eo3_to_stac_properties(dataset: DatasetMetadata, title: str = None) -> Dict: """ Convert EO3 properties dictionary to the Stac equivalent. 
""" @@ -207,7 +203,7 @@ def eo3_to_stac_properties( def to_pystac_item( - dataset: Eo3DatasetDocBase, + dataset: DatasetMetadata, stac_item_destination_url: str, dataset_location: Optional[str] = None, odc_dataset_metadata_url: Optional[str] = None, @@ -231,7 +227,7 @@ def to_pystac_item( """ if dataset.geometry is not None: - geom = Geometry(dataset.geometry, CRS(dataset.crs)) + geom = Geometry(dataset.geometry, dataset.crs) wgs84_geometry = geom.to_crs(CRS("epsg:4326"), math.inf) geometry = wgs84_geometry.json @@ -241,7 +237,7 @@ def to_pystac_item( bbox = None properties = eo3_to_stac_properties(dataset, title=dataset.label) - properties.update(_lineage_fields(dataset.lineage)) + properties.update(_lineage_fields(dataset.sources)) dt = properties["datetime"] del properties["datetime"] @@ -349,7 +345,7 @@ def to_pystac_item( def to_stac_item( - dataset: Eo3DatasetDocBase, + dataset: DatasetMetadata, stac_item_destination_url: str, dataset_location: Optional[str] = None, odc_dataset_metadata_url: Optional[str] = None, diff --git a/eo3/ui.py b/eo3/ui.py deleted file mode 100644 index 786f2bcc..00000000 --- a/eo3/ui.py +++ /dev/null @@ -1,90 +0,0 @@ -import os -import urllib.parse -from pathlib import Path -from typing import Optional, Union -from urllib.parse import parse_qsl, urljoin, urlparse - -import click - -from eo3.uris import normalise_path - - -class PathPath(click.Path): - """ - A Click argument that returns a normalised (absolute) pathlib Path""" - - def convert(self, value, param, ctx): - return Path(normalise_path(super().convert(value, param, ctx))) - - -def uri_resolve(base: Union[str, Path], path: Optional[str]) -> str: - """ - Backport of datacube.utils.uris.uri_resolve() - """ - if path: - p = Path(path) - if p.is_absolute(): - return p.as_uri() - - if isinstance(base, Path): - base = base.absolute().as_uri() - return urljoin(base, path) - - -def bool_style(b, color=True) -> str: - if b: - return click.style("✓", fg=color and "green") - else: - return click.style("✗", fg=color and "yellow") - - -def is_absolute(url): - """ - >>> is_absolute('LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - False - >>> is_absolute('data/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - False - >>> is_absolute('/g/data/somewhere/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - True - >>> is_absolute('file:///g/data/v10/somewhere/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - True - >>> is_absolute('http://example.com/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - True - >>> is_absolute('tar:///g/data/v10/somewhere/dataset.tar#LC08_L1TP_108078_20151203_20170401_01_T1.TIF') - True - """ - location = urlparse(url) - return bool(location.scheme or location.netloc) or os.path.isabs(location.path) - - -def get_part(url): - """ - >>> get_part('path/to/file.tif') - >>> get_part('path/to/file.tif#page=2') - >>> get_part('path/to/file.tif#part=3') - 3 - >>> get_part('path/to/file.tif#part=one') - 'one' - """ - opts = dict(parse_qsl(urlparse(url).fragment)) - part = opts.get("part") - if part is None: - return None - try: - return int(part) - except ValueError: - return part - - -def register_scheme(*schemes): - """ - Register additional uri schemes as supporting relative offsets (etc), so that band/measurement paths can be - calculated relative to the base uri. 
- """ - urllib.parse.uses_netloc.extend(schemes) - urllib.parse.uses_relative.extend(schemes) - urllib.parse.uses_params.extend(schemes) - - -register_scheme("tar") -register_scheme("s3") diff --git a/eo3/utils/__init__.py b/eo3/utils/__init__.py new file mode 100644 index 00000000..878cba55 --- /dev/null +++ b/eo3/utils/__init__.py @@ -0,0 +1,47 @@ +from .uris import ( + as_url, + get_part_from_uri, + is_absolute, + is_url, + is_vsipath, + mk_part_uri, + normalise_path, + uri_resolve, + uri_to_local_path, +) +from .utils import ( + InvalidDocException, + contains, + default_utc, + flatten_dict, + jsonify_document, + netcdf_extract_string, + parse_time, + read_documents, + read_file, + read_strings_from_netcdf, + thread_local_cache, +) + +__all__ = ( + "is_url", + "uri_to_local_path", + "get_part_from_uri", + "mk_part_uri", + "is_vsipath", + "normalise_path", + "uri_resolve", + "as_url", + "is_absolute", + "default_utc", + "jsonify_document", + "InvalidDocException", + "read_documents", + "read_strings_from_netcdf", + "netcdf_extract_string", + "contains", + "thread_local_cache", + "parse_time", + "read_file", + "flatten_dict", +) diff --git a/eo3/aws.py b/eo3/utils/aws.py similarity index 70% rename from eo3/aws.py rename to eo3/utils/aws.py index 71024bdb..90acda4e 100644 --- a/eo3/aws.py +++ b/eo3/utils/aws.py @@ -2,51 +2,18 @@ Helper methods for working with AWS """ import os -import threading -import time -from types import SimpleNamespace from typing import Any, Dict, Optional, Tuple, Union from urllib.parse import urlparse from urllib.request import urlopen import botocore import botocore.session -from botocore.credentials import Credentials, ReadOnlyCredentials +from botocore.credentials import ReadOnlyCredentials from botocore.session import Session -# TODO CORE: Copy of datacube.utils.generic.py -_LCL = threading.local() - - -def thread_local_cache( - name: str, initial_value: Any = None, purge: bool = False -) -> Any: - """Define/get thread local object with a given name. 
- - :param name: name for this cache - :param initial_value: Initial value if not set for this thread - :param purge: If True delete from cache (returning what was there previously) - - Returns - ------- - value previously set in the thread or `initial_value` - """ - absent = object() - cc = getattr(_LCL, name, absent) - absent = cc is absent - - if absent: - cc = initial_value - - if purge: - if not absent: - delattr(_LCL, name) - else: - if absent: - setattr(_LCL, name, cc) - - return cc +from eo3.utils import thread_local_cache +# TODO: ideally this file would eventually be moved to a lower-level utils package # TODO CORE: Copy of datacube.utils.aws.__init__.py ByteRange = Union[slice, Tuple[int, int]] # pylint: disable=invalid-name @@ -61,7 +28,6 @@ def thread_local_cache( "ec2_current_region", "botocore_default_region", "auto_find_region", - "get_creds_with_retry", "mk_boto_session", ) @@ -168,26 +134,6 @@ def auto_find_region( return default -def get_creds_with_retry( - session: Session, max_tries: int = 10, sleep: float = 0.1 -) -> Optional[Credentials]: - """Attempt to obtain credentials upto `max_tries` times with back off - :param session: botocore session, see mk_boto_session - :param max_tries: number of attempt before failing and returing None - :param sleep: number of seconds to sleep after first failure (doubles on every consecutive failure) - """ - for i in range(max_tries): - if i > 0: - time.sleep(sleep) - sleep = min(sleep * 2, 10) - - creds = session.get_credentials() - if creds is not None: - return creds - - return None - - def mk_boto_session( profile: Optional[str] = None, creds: Optional[ReadOnlyCredentials] = None, @@ -375,68 +321,3 @@ def s3_open( bucket, key = s3_url_parse(url) oo = s3.get_object(Bucket=bucket, Key=key, **kwargs) # type: ignore[attr-defined] return oo["Body"] - - -def s3_head_object(url: str, s3: MaybeS3 = None, **kwargs) -> Optional[Dict[str, Any]]: - """ - Head object, return object metadata. - - :param url: s3://bucket/path/to/object - :param s3: pre-configured s3 client, see make_s3_client() - :param kwargs: are passed on to ``s3.head_object(..)`` - """ - from botocore.exceptions import ClientError - - s3 = s3 or s3_client() - bucket, key = s3_url_parse(url) - - try: - oo = s3.head_object(Bucket=bucket, Key=key, **kwargs) # type: ignore[attr-defined] - except ClientError: - return None - - meta = oo.pop("ResponseMetadata", {}) - code = meta.get("HTTPStatusCode", 0) - if 200 <= code < 300: - return oo - - # it actually raises exceptions when http code is in the "fail" range - return None # pragma: no cover - - -def obtain_new_iam_auth_token( - url: str, region_name: str = "auto", profile_name: Optional[str] = None -) -> str: - # Boto3 is not core requirement, but ImportError is probably the right exception to throw anyway. - from boto3.session import Session as Boto3Session - - session = Boto3Session(profile_name=profile_name) - client = session.client("rds", region_name=region_name) - return client.generate_db_auth_token( - DBHostname=url.host, Port=url.port, DBUsername=url.username, Region=region_name - ) - - -# TODO CORE: Copy from datacube.utils.rio.rio -_CFG_LOCK = threading.Lock() -_CFG = SimpleNamespace(aws=None, cloud_defaults=False, kwargs={}, epoch=0) - - -def set_default_rio_config(aws=None, cloud_defaults=False, **kwargs): - """Setup default configuration for rasterio/GDAL. - - Doesn't actually activate one, just stores configuration for future - use from IO threads. 
- - :param aws: Dictionary of options for rasterio.session.AWSSession - OR 'auto' -- session = rasterio.session.AWSSession() - - :param cloud_defaults: When True inject settings for reading COGs - :param **kwargs: Passed on to rasterio.Env(..) constructor - """ - global _CFG # pylint: disable=global-statement - - with _CFG_LOCK: - _CFG = SimpleNamespace( - aws=aws, cloud_defaults=cloud_defaults, kwargs=kwargs, epoch=_CFG.epoch + 1 - ) diff --git a/eo3/uris.py b/eo3/utils/uris.py similarity index 79% rename from eo3/uris.py rename to eo3/utils/uris.py index c9ed689e..a8850bd1 100644 --- a/eo3/uris.py +++ b/eo3/utils/uris.py @@ -4,9 +4,11 @@ import urllib.parse from pathlib import Path from typing import Optional, Union -from urllib.parse import urljoin, urlparse +from urllib.parse import parse_qsl, urljoin, urlparse from urllib.request import url2pathname +# TODO: ideally this file would eventually be moved to a lower-level utils package + # CORE TODO: forked from datacube.utils.uris @@ -171,6 +173,44 @@ def as_url(maybe_uri: str) -> str: return pathlib.Path(maybe_uri).absolute().as_uri() +def is_absolute(url): + """ + >>> is_absolute('LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + False + >>> is_absolute('data/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + False + >>> is_absolute('/g/data/somewhere/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + True + >>> is_absolute('file:///g/data/v10/somewhere/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + True + >>> is_absolute('http://example.com/LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + True + >>> is_absolute('tar:///g/data/v10/somewhere/dataset.tar#LC08_L1TP_108078_20151203_20170401_01_T1.TIF') + True + """ + location = urlparse(url) + return bool(location.scheme or location.netloc) or os.path.isabs(location.path) + + +def get_part_from_uri(url): + """ + >>> get_part_from_uri('path/to/file.tif') + >>> get_part_from_uri('path/to/file.tif#page=2') + >>> get_part_from_uri('path/to/file.tif#part=3') + 3 + >>> get_part_from_uri('path/to/file.tif#part=one') + 'one' + """ + opts = dict(parse_qsl(urlparse(url).fragment)) + part = opts.get("part") + if part is None: + return None + try: + return int(part) + except ValueError: + return part + + def register_scheme(*schemes): """ Register additional uri schemes as supporting relative offsets (etc), so that band/measurement paths can be diff --git a/eo3/utils.py b/eo3/utils/utils.py similarity index 75% rename from eo3/utils.py rename to eo3/utils/utils.py index 01870996..3ed2a336 100644 --- a/eo3/utils.py +++ b/eo3/utils/utils.py @@ -5,30 +5,27 @@ import math import os import re +import threading from collections import OrderedDict from contextlib import contextmanager from datetime import date, datetime, timezone from decimal import Decimal from pathlib import Path -from typing import Any, Dict, Iterable, Mapping, Sequence, Tuple, Union +from typing import Any, Iterable, Mapping, Sequence, Tuple, Union from urllib.parse import urlparse from urllib.request import urlopen from uuid import UUID -import yaml - -try: - from yaml import CSafeLoader as SafeLoader # type: ignore -except ImportError: - from yaml import SafeLoader # type: ignore - import ciso8601 import click +import dateutil.parser import numpy +from ruamel.yaml import YAML, YAMLError -from eo3.uris import as_url, mk_part_uri +from .uris import as_url, mk_part_uri, uri_to_local_path -EO3_SCHEMA = "https://schemas.opendatacube.org/dataset" +# TODO: ideally the functions marked as 'general util' (originally copied +# over from core) would 
eventually be moved to a lower-level utils package class ItemProvider(enum.Enum): @@ -161,33 +158,6 @@ def get_collection_number( ) -def is_doc_eo3(doc: Dict[str, Any]) -> bool: - """Is this document eo3? - - :param doc: Parsed ODC Dataset metadata document - - :returns: - False if this document is a legacy dataset - True if this document is eo3 - - :raises ValueError: For an unsupported document - """ - schema = doc.get("$schema") - # All legacy documents had no schema at all. - if schema is None: - return False - - if schema == EO3_SCHEMA: - return True - - # Otherwise it has an unknown schema. - # - # Reject it for now. - # We don't want future documents (like Stac items, or "eo4") to be quietly - # accepted as legacy eo. - raise ValueError(f"Unsupported dataset schema: {schema!r}") - - def flatten_dict( d: Mapping, prefix: str = None, separator: str = "." ) -> Iterable[Tuple[str, Any]]: @@ -212,49 +182,26 @@ def flatten_dict( # CORE TODO: from datacube.utils.documents +# TODO: general util @contextmanager def _open_from_s3(url): o = urlparse(url) if o.scheme != "s3": raise RuntimeError("Abort abort I don't know how to open non s3 urls") - from .aws import s3_open + from eo3.utils.aws import s3_open yield s3_open(url) # CORE TODO: from datacube.utils.documents +# TODO: general util def _open_with_urllib(url): return urlopen(url) # nosec B310 # CORE TODO: from datacube.utils.documents -class NoDatesSafeLoader(SafeLoader): # pylint: disable=too-many-ancestors - @classmethod - def remove_implicit_resolver(cls, tag_to_remove): - """ - Removes implicit resolvers for a particular tag - - Takes care not to modify resolvers in super classes. - - We want to load datetimes as strings, not dates. We go on to - serialise as json which doesn't have the advanced types of - yaml, and leads to slightly different objects down the track. - """ - if "yaml_implicit_resolvers" not in cls.__dict__: - cls.yaml_implicit_resolvers = cls.yaml_implicit_resolvers.copy() - - for first_letter, mappings in cls.yaml_implicit_resolvers.items(): - cls.yaml_implicit_resolvers[first_letter] = [ - (tag, regexp) for tag, regexp in mappings if tag != tag_to_remove - ] - - -# CORE TODO: from datacube.utils.documents -NoDatesSafeLoader.remove_implicit_resolver("tag:yaml.org,2002:timestamp") - - -# CORE TODO: from datacube.utils.documents +# TODO: general util _PROTOCOL_OPENERS = { "s3": _open_from_s3, "ftp": _open_with_urllib, @@ -265,16 +212,57 @@ def remove_implicit_resolver(cls, tag_to_remove): # CORE TODO: from datacube.utils.documents -def load_from_yaml(handle, parse_dates=False): - loader = SafeLoader if parse_dates else NoDatesSafeLoader - yield from yaml.load_all(handle, Loader=loader) # noqa: DUO109 +# TODO: general util +def load_from_yaml(handle): + yield from YAML(typ="safe").load_all(handle) # noqa: DUO109 # CORE TODO: from datacube.utils.documents +# TODO: general util def load_from_json(handle): yield json.load(handle) +# TODO: general util +def load_from_netcdf(path): + for doc in read_strings_from_netcdf(path, variable="dataset"): + yield YAML(typ="safe").load(doc) + + +# TODO: general util +def netcdf_extract_string(chars): + """ + Convert netcdf S|U chars to Unicode string. 
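+
+    Used by read_strings_from_netcdf() below to decode the 'dataset' variable
+    stored in ODC NetCDF files (see load_from_netcdf()).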
+ """ + import netCDF4 # type: ignore[import] + + if isinstance(chars, str): + return chars + + chars = netCDF4.chartostring(chars) + if chars.dtype.kind == "U": + return str(chars) + else: + return str(numpy.char.decode(chars)) + + +# TODO: general util +def read_strings_from_netcdf(path, variable): + """ + Load all of the string encoded data from a variable in a NetCDF file. + + By 'string', the CF conventions mean ascii. + + Useful for loading dataset metadata information. + """ + import netCDF4 + + with netCDF4.Dataset(str(path)) as ds: + for chars in ds[variable]: + yield netcdf_extract_string(chars) + + +# TODO: general util _PARSERS = { ".yaml": load_from_yaml, ".yml": load_from_yaml, @@ -282,6 +270,7 @@ def load_from_json(handle): } +# TODO: general util def transform_object_tree(f, o, key_transform=lambda k: k): """ Apply a function (f) on all the values in the given document tree (o), returning a new document of @@ -312,6 +301,7 @@ def recur(o_): return f(o) +# TODO: general util def jsonify_document(doc): """ Make a document ready for serialisation as JSON. @@ -340,6 +330,7 @@ def fixup_value(v): return transform_object_tree(fixup_value, doc, key_transform=str) +# TODO: general util def load_documents(path): """ Load document/s from the specified path. @@ -348,7 +339,7 @@ def load_documents(path): - JSON and YAML locally and remotely. - Compressed JSON and YAML locally - - Data Cube Dataset Documents inside local NetCDF files. # CORE TODO: stripped out for now??? + - Data Cube Dataset Documents inside local NetCDF files. :param path: path or URI to load documents from :return: generator of dicts @@ -359,28 +350,30 @@ def load_documents(path): scheme = urlparse(url).scheme compressed = url[-3:] == ".gz" - # if scheme == 'file' and path[-3:] == '.nc': - # path = uri_to_local_path(url) - # yield from load_from_netcdf(path) - # lse: - with _PROTOCOL_OPENERS[scheme](url) as fh: - if compressed: - fh = gzip.open(fh) - path = path[:-3] + if scheme == "file" and path[-3:] == ".nc": + path = uri_to_local_path(url) + yield from load_from_netcdf(path) + else: + with _PROTOCOL_OPENERS[scheme](url) as fh: + if compressed: + fh = gzip.open(fh) + path = path[:-3] - suffix = Path(path).suffix + suffix = Path(path).suffix - parser = _PARSERS[suffix] + parser = _PARSERS[suffix] - yield from parser(fh) + yield from parser(fh) # CORE TODO: from datacube.utils.documents +# TODO: general util class InvalidDocException(Exception): # noqa: N818 pass # CORE TODO: from datacube.utils.generic +# TODO: general util def map_with_lookahead(it, if_one=None, if_many=None): """ It's like normal map: creates a new generator by applying a function to every @@ -408,6 +401,7 @@ def map_with_lookahead(it, if_one=None, if_many=None): yield proc(v) +# TODO: general util def read_documents(*paths, uri=False): """ Read and parse documents from the filesystem or remote URLs (yaml or json). @@ -451,13 +445,20 @@ def add_uri_with_part(x): yield from process_file(path) except InvalidDocException as e: raise e - except (yaml.YAMLError, ValueError) as e: + except (YAMLError, ValueError) as e: raise InvalidDocException(f"Failed to load {path}: {e}") except Exception as e: raise InvalidDocException(f"Failed to load {path}: {e}") +# TODO: general util +def read_file(p: Path): + """Shorthand for when you just need to get the dict representation of 1 file""" + return next(iter(read_documents(p)))[1] + + # CORE TODO: from datacube.utils.changes +# TODO: general util # Type that can be checked for changes. 
# (MyPy approximation without recursive references) Changable = Union[str, int, None, Sequence[Any], Mapping[str, Any]] @@ -496,3 +497,61 @@ def _is_nan(v): if isinstance(v, str): return v == "NaN" return isinstance(v, float) and math.isnan(v) + + +# CORE TODO: from datacube.utils.dates +# TODO: general util +def parse_time(time: Union[str, datetime]) -> datetime: + """Convert string to datetime object + + This function deals with ISO8601 dates fast, and fallbacks to python for + other formats. + + Calling this on datetime object is a no-op. + """ + if isinstance(time, str): + try: + from ciso8601 import ( # pylint: disable=wrong-import-position # noqa: F401 + parse_datetime, + ) + + return parse_datetime(time) + except (ImportError, ValueError): # pragma: no cover + return dateutil.parser.parse(time) + + return time + + +# CORE TODO: from datacube.utils.generic.py +# TODO: general util +_LCL = threading.local() + + +def thread_local_cache( + name: str, initial_value: Any = None, purge: bool = False +) -> Any: + """Define/get thread local object with a given name. + + :param name: name for this cache + :param initial_value: Initial value if not set for this thread + :param purge: If True delete from cache (returning what was there previously) + + Returns + ------- + value previously set in the thread or `initial_value` + """ + absent = object() + cc = getattr(_LCL, name, absent) + absent = cc is absent + + if absent: + cc = initial_value + + if purge: + if not absent: + delattr(_LCL, name) + else: + if absent: + setattr(_LCL, name, cc) + + return cc diff --git a/eo3/validate.py b/eo3/validate.py index ca7748e4..126b94d0 100644 --- a/eo3/validate.py +++ b/eo3/validate.py @@ -1,350 +1,28 @@ """ Validate ODC dataset documents """ -import enum -from datetime import datetime -from pathlib import Path +import warnings from textwrap import indent -from typing import ( - Dict, - Generator, - Iterable, - List, - Mapping, - MutableMapping, - Optional, - Sequence, - Set, - Tuple, - Union, -) -from urllib.parse import urlparse -from uuid import UUID - -import attr -import cattr -import ciso8601 -import rasterio -import toolz -from attr import Factory, define, field, frozen -from cattrs import ClassValidationError -from click import echo -from rasterio import DatasetReader -from rasterio.crs import CRS -from rasterio.errors import CRSError -from shapely.validation import explain_validity - -from eo3 import model, serialise, utils -from eo3.eo3_core import prep_eo3 -from eo3.metadata.validate import validate_metadata_type -from eo3.model import AccessoryDoc, Eo3DatasetDocBase -from eo3.product.validate import validate_product -from eo3.ui import get_part, is_absolute, uri_resolve -from eo3.uris import is_url -from eo3.utils import ( - EO3_SCHEMA, - InvalidDocException, - _is_nan, - contains, - default_utc, - load_documents, - read_documents, -) -from eo3.validation_msg import ( - ContextualMessager, - Level, - ValidationMessage, - ValidationMessages, -) - -DEFAULT_NULLABLE_FIELDS = ("label",) -DEFAULT_OPTIONAL_FIELDS = ( - # Older product do not have this field at all, and when not specified it is considered stable. - "dataset_maturity", -) - - -class DocKind(enum.Enum): - # EO3 datacube dataset. 
- dataset = 1 - # Datacube product - product = 2 - # Datacube Metadata Type - metadata_type = 3 - # Stac Item - stac_item = 4 - # Legacy datacube ("eo1") dataset - legacy_dataset = 5 - # Legacy product config for ingester - ingestion_config = 6 - - @property - def is_legacy(self): - return self in (self.legacy_dataset, self.ingestion_config) - - -# What kind of document each suffix represents. -# (full suffix will also have a doc type: .yaml, .json, .yaml.gz etc) -# Example: "my-test-dataset.odc-metadata.yaml" -SUFFIX_KINDS = { - ".odc-metadata": DocKind.dataset, - ".odc-product": DocKind.product, - ".odc-type": DocKind.metadata_type, -} -# Inverse of above -DOC_TYPE_SUFFIXES = {v: k for k, v in SUFFIX_KINDS.items()} - - -def filename_doc_kind(path: Union[str, Path]) -> Optional["DocKind"]: - """ - Get the expected file type for the given filename. - - Returns None if it does not follow any naming conventions. - - >>> filename_doc_kind('LC8_2014.odc-metadata.yaml').name - 'dataset' - >>> filename_doc_kind('/tmp/something/water_bodies.odc-metadata.yaml.gz').name - 'dataset' - >>> filename_doc_kind(Path('/tmp/something/ls8_fc.odc-product.yaml')).name - 'product' - >>> filename_doc_kind(Path('/tmp/something/ls8_wo.odc-product.json.gz')).name - 'product' - >>> filename_doc_kind(Path('/tmp/something/eo3_gqa.odc-type.yaml')).name - 'metadata_type' - >>> filename_doc_kind(Path('/tmp/something/some_other_file.yaml')) - """ - - for suffix in reversed(Path(path).suffixes): - suffix = suffix.lower() - if suffix in SUFFIX_KINDS: - return SUFFIX_KINDS[suffix] - - return None +from typing import Dict, Iterable, List, Mapping, Set, Tuple +import toolz -def guess_kind_from_contents(doc: Dict): - """ - What sort of document do the contents look like? - """ - if "$schema" in doc and doc["$schema"] == EO3_SCHEMA: - return DocKind.dataset - if "metadata_type" in doc: - if "source_type" in doc: - return DocKind.ingestion_config - return DocKind.product - if ("dataset" in doc) and ("search_fields" in doc["dataset"]): - return DocKind.metadata_type - if "id" in doc: - if ("lineage" in doc) and ("platform" in doc): - return DocKind.legacy_dataset - - if ("properties" in doc) and ("datetime" in doc["properties"]): - return DocKind.stac_item - - return None - - -@frozen(init=True) -class ValidationExpectations: - """ - What expectations do we have when validating this dataset? - """ - - #: Allow these extra measurement names to be included in the dataset. - #: (ODC allows unlisted measurement names, but it's usually a mistake) - allow_extra_measurements: Sequence[str] = () - - #: Do we expect full geometry information in every dataset? - #: (It's optional in ODC, but often a mistake to miss it) - require_geometry: bool = True - - #: Are any of the configured fields nullable? - allow_nullable_fields: Sequence[str] = field( - default=Factory(lambda: DEFAULT_NULLABLE_FIELDS) - ) - #: Can any of the fields be completely omitted from the document? - allow_missing_fields: Sequence[str] = field( - default=Factory(lambda: DEFAULT_OPTIONAL_FIELDS) - ) - - def with_document_overrides(self, doc: Dict): - """ - Return an instance with any overrides from the given document. - - (TODO: Overrides are passed in in "default_allowances" section of product or metadata - document but are not part of the schema, so using them renders the document - invalid. Bad API design, IMO.) 
- """ - if "default_allowances" not in doc: - return self - - overridden_values = {**attr.asdict(self), **doc["default_allowances"]} - # Merge, don't replace, these lists. - overridden_values["allow_nullable_fields"] = list( - {*overridden_values["allow_nullable_fields"], *self.allow_nullable_fields} - ) - overridden_values["allow_missing_fields"] = list( - {*overridden_values["allow_missing_fields"], *self.allow_missing_fields} - ) - overridden_values["allow_extra_measurements"] = list( - { - *overridden_values["allow_extra_measurements"], - *self.allow_extra_measurements, - } - ) - return cattr.structure(overridden_values, self.__class__) +from eo3 import schema, utils +from eo3.fields import all_field_offsets +from eo3.utils import contains, get_part_from_uri, is_absolute +from eo3.validation_msg import ContextualMessager, Level, ValidationMessages -def validate_dataset( - doc: Dict, - product_definition: Optional[Dict] = None, - product_definitions: Optional[Dict] = None, - metadata_type_definition: Optional[Mapping[str, Dict]] = None, - thorough: bool = False, - readable_location: Union[str, Path] = None, - expect: ValidationExpectations = None, +def validate_ds_to_schema( + doc: Dict, msg: ContextualMessager = None ) -> ValidationMessages: - """ - Validate a dataset document, optionally against the given product. - - By default this will only look at the metadata, run with thorough=True to - open the data files too. - - :param product_definition: Optionally check that the dataset matches this product definition. - :param thorough: Open the imagery too, to check that data types etc match. - :param readable_location: Dataset location to use, if not the metadata path. - :param expect: Where can we be lenient in validation? - """ - # Prepare validation context and contextual message builder - expect = expect or ValidationExpectations() - validation_context = {} - if metadata_type_definition is not None: - expect = expect.with_document_overrides(metadata_type_definition) - validation_context["type"] = metadata_type_definition["name"] - if product_definition is not None: - expect = expect.with_document_overrides(product_definition) - validation_context["product"] = product_definition["name"] - elif product_definitions is not None: - product_name = doc.get("product", {}).get("name") - if product_name and product_name in product_definitions: - product_definition = product_definitions[product_name] - expect = expect.with_document_overrides(product_definition) - validation_context["product"] = product_name - - msg = ContextualMessager(validation_context) - - if expect.allow_extra_measurements: - yield msg.warning("extra_measurements", "Extra measurements are deprecated") - - if thorough and not product_definition: - yield msg.error( - "no_product", "Must supply product definition for thorough validation" - ) - - # Validate against schema and deserialise to a (base eo3) dataset doc - yield from _validate_ds_to_schema(doc, msg) - if msg.errors: - return - - # Validate Lineage before serialisation for clearer error reporting. (Get incomprehensible error messages - # for invalid UUIDs) - yield from _validate_lineage(doc.get("lineage", {}), msg) - if msg.errors: - return - - # TODO: How to make this step more extensible? 
- try: - dataset = serialise.from_doc(doc, skip_validation=True) - except ClassValidationError as e: - - def expand(err: ClassValidationError) -> str: - expanded = err.message - try: - for sub_err in err.exceptions: - expanded += expand(sub_err) - except AttributeError: - pass - return expanded - - yield msg.error("serialisation_failure", f"Serialisation failed: {expand(e)}") - return - - # non-schema basic validation - if not dataset.product.href: - yield msg.info("product_href", "A url (href) is recommended for products") - - if doc.get("location"): - yield msg.warning( - "dataset_location", - "Location is deprecated and will be removed in a future release. Use 'locations' instead.", - ) - - # Validate geometry - yield from _validate_geo(dataset, msg, expect_geometry=expect.require_geometry) - if msg.errors: - return - - # Previously a dataset could have no measurements (eg. telemetry data). - if expect.require_geometry: - if dataset.measurements: - yield from _validate_measurements(dataset, msg) - if msg.errors: - return - - # Base properties - # Validation is implemented in Eo3DictBase so it can be extended - yield from dataset.properties.validate_eo3_properties(msg) - - # Accessories - for acc_name, accessory in dataset.accessories.items(): - yield from _validate_accessory(acc_name, accessory, msg) - - required_measurements: Dict[str, ExpectedMeasurement] = {} - - # Validate dataset against product and metadata type definitions - if product_definition is not None: - yield from _validate_ds_to_product( - dataset, - required_measurements, - product_definition, - allow_extra_measurements=expect.allow_extra_measurements, - msg=msg, - ) - if msg.errors: - return - - if metadata_type_definition: - yield from _validate_ds_to_metadata_type( - doc, metadata_type_definition, expect, msg - ) - - if thorough: - # Validate contents of actual data against measurement metadata - yield from _validate_ds_against_data( - dataset, readable_location, required_measurements, msg - ) - - -def _validate_ds_to_schema(doc: Dict, msg: ContextualMessager) -> ValidationMessages: """ Validate against eo3 schema """ - schema = doc.get("$schema") - if schema is None: - yield msg.error( - "no_schema", - f"No $schema field. " - f"You probably want an ODC dataset schema {model.ODC_DATASET_SCHEMA_URL!r}", - ) - return - if schema != model.ODC_DATASET_SCHEMA_URL: - yield msg.error( - "unknown_doc_type", - f"Unknown doc schema {schema!r}. 
Only ODC datasets are supported ({model.ODC_DATASET_SCHEMA_URL!r})", - ) - return + if msg is None: + msg = ContextualMessager() - for error in serialise.DATASET_SCHEMA.iter_errors(doc): + for error in schema.DATASET_SCHEMA.iter_errors(doc): displayable_path = ".".join(error.absolute_path) hint = None @@ -354,331 +32,141 @@ def _validate_ds_to_schema(doc: Dict, msg: ContextualMessager) -> ValidationMess context = f"({displayable_path}) " if displayable_path else "" yield msg.error("structure", f"{context}{error.message} ", hint=hint) - -def _validate_measurements(dataset: Eo3DatasetDocBase, msg: ContextualMessager): - for name, measurement in dataset.measurements.items(): - grid_name = measurement.grid - if grid_name != "default" or dataset.grids: - if grid_name not in dataset.grids: - yield msg.error( - "invalid_grid_ref", - f"Measurement {name!r} refers to unknown grid {grid_name!r}", - ) - - if is_absolute(measurement.path): - yield msg.warning( - "absolute_path", - f"measurement {name!r} has an absolute path: {measurement.path!r}", - ) - - part = get_part(measurement.path) - if part is not None: + # properties detailed in the schema that are optional but recommended + recommended = [["product", "href"], ["properties", "dea:dataset_maturity"]] + for r in recommended: + if toolz.get_in(r, doc) is None: yield msg.warning( - "uri_part", - f"measurement {name!r} has a part in the path. (Use band and/or layer instead)", - ) - if isinstance(part, int): - if part < 0: - yield msg.error( - "uri_invalid_part", - f"measurement {name!r} has an invalid part (less than zero) in the path ({part})", - ) - elif isinstance(part, str): - yield msg.error( - "uri_invalid_part", - f"measurement {name!r} has an invalid part (non-integer) in the path ({part})", + "recommended_field", f"Field {'->'.join(r)} is optional but recommended" ) -def _validate_accessory(name: str, accessory: AccessoryDoc, msg: ContextualMessager): - accessory.name = name - if is_absolute(accessory.path): - yield msg.warning( - "absolute_path", - f"Accessory {accessory.name!r} has an absolute path: {accessory.path!r}", - ) - - -def _validate_lineage(lineage, msg): - for label, parent_ids in lineage.items(): - if len(parent_ids) > 1: - yield msg.info( - "nonflat_lineage", - f"Lineage label {label} has multiple sources and may get flattened on indexing " - "depending on the index driver", - ) - for parent_id in parent_ids: - try: - UUID(parent_id) - except ValueError: - yield msg.error( - "invalid_source_id", - f"Lineage id in {label} is not a valid UUID {parent_id}", - ) - - -def _validate_ds_to_product( - dataset: Eo3DatasetDocBase, - required_measurements: MutableMapping[str, "ExpectedMeasurement"], +def validate_ds_to_product( + doc: Dict, product_definition: Mapping, - allow_extra_measurements: Sequence[str], - msg: ContextualMessager, + msg: ContextualMessager = None, ): - required_measurements.update( - { - m.name: m - for m in map( - ExpectedMeasurement.from_definition, - product_definition.get("measurements") or (), - ) - } - ) - product_name = product_definition.get("name") - if product_name and product_name != dataset.product.name: + """Validate dataset is consistent with product definition""" + if msg is None: + msg = ContextualMessager({"product": product_definition.get("name")}) + + product_name = msg.context.get("product") + ds_product_name = doc.get("product").get("name") + if product_name and product_name != ds_product_name: yield msg.error( "product_mismatch", - f"Dataset product name {dataset.product.name!r} " - f"does 
not match the given product ({product_name!r}", + f"Dataset product name {ds_product_name!r} " + f"does not match the given product {product_name!r}", ) - ds_props = dict(dataset.properties) + ds_props = doc.get("properties") prod_props = product_definition["metadata"].get("properties", {}) if not contains(ds_props, prod_props): diffs = tuple(_get_printable_differences(ds_props, prod_props)) difference_hint = _differences_as_hint(diffs) yield msg.error( "metadata_mismatch", - "Dataset template does not match product document template.", + f"Dataset template does not match product document template for product {product_name!r}.", hint=difference_hint, ) - for name in required_measurements: - if name not in dataset.measurements.keys(): + product_measurement_names = [ + m["name"] for m in product_definition.get("measurements") + ] + doc_measurements = doc.get("measurements").keys() + for name in product_measurement_names: + if name not in doc_measurements: yield msg.error( "missing_measurement", f"Product {product_name} expects a measurement {name!r})", ) - measurements_not_in_product = set(dataset.measurements.keys()).difference( + measurements_not_in_product = set(doc_measurements).difference( {m["name"] for m in product_definition.get("measurements") or ()} ) - # Remove the measurements that are allowed to be extra. - measurements_not_in_product.difference_update(allow_extra_measurements or set()) if measurements_not_in_product: things = ", ".join(sorted(measurements_not_in_product)) yield msg.warning( "extra_measurements", f"Dataset has measurements not present in product definition for {product_name!r}: {things}", - hint="This may be valid, as it's allowed by ODC. Set `expect_extra_measurements` to mute this.", ) -def _validate_ds_to_metadata_type( +def validate_ds_to_metadata_type( doc: Dict, metadata_type_definition: Dict, - expect: ValidationExpectations, - msg: ContextualMessager, + msg: ContextualMessager = None, ): - # Datacube does certain transforms on an eo3 doc before storage. - # We need to do the same, as the fields will be read from the storage. - prepared_doc = prep_eo3(doc) + """ + Validate against the metadata type definition. A dataset doesn't have to include + all metadata type fields, but users should be warned that there are missing fields. + """ + if msg is None: + msg = ContextualMessager() - all_nullable_fields = tuple(expect.allow_nullable_fields) + tuple( - expect.allow_missing_fields - ) - for field_name, offsets in _get_field_offsets( - metadata_type=metadata_type_definition - ): - if ( - # If a field is required... - (field_name not in expect.allow_missing_fields) - and - # ... and none of its offsets are in the document - not any(_has_offset(prepared_doc, offset) for offset in offsets) + for field_name, offsets in _get_field_offsets(metadata_type_definition): + # If none of a field's offsets are in the document - ignore for lineage + if field_name != "sources" and not any( + _has_offset(doc, offset) for offset in offsets ): # ... warn them. 
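+            # (A missing field is not fatal: the field will simply be
+            # unpopulated when the dataset is indexed, hence a warning
+            # rather than an error.)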
readable_offsets = " or ".join("->".join(offset) for offset in offsets) yield msg.warning( "missing_field", f"Dataset is missing field {field_name!r} " - f"for type {metadata_type_definition['name']!r}", - hint=f"Expected at {readable_offsets}", + f"expected by metadata type {metadata_type_definition['name']!r}", + hint=f"Expected at offset {readable_offsets}", ) continue - if field_name not in all_nullable_fields: - value = None - for offset in offsets: - value = toolz.get_in(offset, prepared_doc) - if value is None: - yield msg.info( - "null_field", - f"Value is null for configured field {field_name!r}", - ) - - -def _validate_ds_against_data( - dataset: Eo3DatasetDocBase, - readable_location: str, - required_measurements: Dict[str, "ExpectedMeasurement"], - msg: ContextualMessager, -): - # For each measurement, try to load it. - # If loadable, validate measurements exist and match expectations. - dataset_location = dataset.locations[0] if dataset.locations else readable_location - for name, measurement in dataset.measurements.items(): - full_path = uri_resolve(dataset_location, measurement.path) - expected_measurement = required_measurements.get(name) - - band = measurement.band or 1 - with rasterio.open(full_path) as ds: - ds: DatasetReader - - if band not in ds.indexes: - yield msg.error( - "incorrect_band", - f"Measurement {name!r} file contains no rio index {band!r}.", - hint=f"contains indexes {ds.indexes!r}", - ) - continue - - if not expected_measurement: - # The measurement is not in the product definition - # - # This is only informational because a product doesn't have to define all - # measurements that the datasets contain. - # - # This is historically because dataset documents reflect the measurements that - # are stored on disk, which can differ. But products define the set of measurments - # that are mandatory in every dataset. - # - # (datasets differ when, for example, sensors go offline, or when there's on-disk - # measurements like panchromatic that GA doesn't want in their product definitions) - if required_measurements: - yield msg.info( - "unspecified_measurement", - f"Measurement {name} is not in the product", - ) - else: - expected_dtype = expected_measurement.dtype - band_dtype = ds.dtypes[band - 1] - if expected_dtype != band_dtype: - yield ValidationMessage.error( - "different_dtype", - f"{name} dtype: " - f"product {expected_dtype!r} != dataset {band_dtype!r}", - ) - - ds_nodata = ds.nodatavals[band - 1] - - # If the dataset is missing 'nodata', we can allow anything in product 'nodata'. - # (In ODC, nodata might be a fill value for loading data.) - if ds_nodata is None: - continue - - # Otherwise check that nodata matches. - expected_nodata = expected_measurement.nodata - if expected_nodata != ds_nodata and not ( - _is_nan(expected_nodata) and _is_nan(ds_nodata) - ): - yield msg.error( - "different_nodata", - f"{name} nodata: " - f"product {expected_nodata !r} != dataset {ds_nodata !r}", - ) + +def validate_measurement_path( + name, path, msg: ContextualMessager = None +) -> ValidationMessages: + if msg is None: + msg = ContextualMessager() + + if is_absolute(path): + yield msg.warning( + "absolute_path", + f"measurement {name!r} has an absolute path: {path!r}", + ) + + part = get_part_from_uri(path) + if part is not None: + yield msg.warning( + "uri_part", + f"measurement {name!r} has a part in the path. 
(Use band and/or layer instead)", + ) + if isinstance(part, int): + if part < 0: + yield msg.error( + "uri_invalid_part", + f"measurement {name!r} has an invalid part (less than zero) in the path ({part})", + ) + elif isinstance(part, str): + yield msg.error( + "uri_invalid_part", + f"measurement {name!r} has an invalid part (non-integer) in the path ({part})", + ) def _has_offset(doc: Dict, offset: List[str]) -> bool: """ Is the given offset present in the document? """ - for key in offset: - if key not in doc: - return False - doc = doc[key] - return True - - -@define -class ExpectedMeasurement: - name: str - dtype: str - nodata: int - - @classmethod - def from_definition(cls, doc: Dict): - return ExpectedMeasurement(doc["name"], doc.get("dtype"), doc.get("nodata")) + try: + toolz.get_in(offset, doc, no_default=True) + return True + except (KeyError, IndexError): + return False # Name of a field and its possible offsets in the document. -FieldNameOffsetS = Tuple[str, Set[List[str]]] - - -def validate_paths( - paths: List[str], - thorough: bool = False, - product_definitions: Dict[str, Dict] = None, - metadata_type_definitions: Dict[str, Dict] = None, - expect: ValidationExpectations = None, -) -> Generator[Tuple[str, List[ValidationMessage]], None, None]: - """Validate the list of paths. Product documents can be specified before their datasets.""" - - products = dict(product_definitions or {}) - metadata_types = dict(metadata_type_definitions or {}) - - for url, doc, was_specified_by_user in read_paths(paths): - messages = [] - kind = filename_doc_kind(url) - if kind is None: - kind = guess_kind_from_contents(doc) - if kind and (kind in DOC_TYPE_SUFFIXES): - # It looks like an ODC doc but doesn't have the standard suffix. - messages.append( - ValidationMessage.warning( - "missing_suffix", - f"Document looks like a {kind.name} but does not have " - f'filename extension "{DOC_TYPE_SUFFIXES[kind]}{_readable_doc_extension(url)}"', - ) - ) - - if kind == DocKind.product: - messages.extend(validate_product(doc)) - if "name" in doc: - products[doc["name"]] = doc - elif kind == DocKind.dataset: - messages.extend( - validate_eo3_doc( - doc, - url, - products, - metadata_types, - thorough, - expect=expect, - ) - ) - elif kind == DocKind.metadata_type: - messages.extend(validate_metadata_type(doc)) - if "name" in doc: - metadata_types[doc["name"]] = doc - - # Otherwise it's a file we don't support. - # If the user gave us the path explicitly, it seems to be an error. - # (if they didn't -- it was found via scanning directories -- we don't care.) - elif was_specified_by_user: - if kind is None: - raise ValueError(f"Unknown document type for {url}") - else: - raise NotImplementedError( - f"Cannot currently validate {kind.name} files" - ) - else: - # Not a doc type we recognise, and the user didn't specify it. Skip it. - continue - - yield url, messages +FieldNameOffsets = Tuple[str, Set[List[str]]] -def _get_field_offsets(metadata_type: Dict) -> Iterable[FieldNameOffsetS]: +def _get_field_offsets(metadata_type: Dict) -> Iterable[FieldNameOffsets]: """ Yield all fields and their possible document-offsets that are expected for this metadata type. @@ -688,141 +176,7 @@ def _get_field_offsets(metadata_type: Dict) -> Iterable[FieldNameOffsetS]: (Properties can have multiple offsets, where ODC will choose the first non-null one, hence the return of multiple offsets for each field.) 
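+
+    For example (illustrative only), a metadata type whose search_fields
+    define a 'platform' field at offset [properties, eo:platform] would
+    yield ('platform', [['properties', 'eo:platform']]) here.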
""" - dataset_section = metadata_type["dataset"] - search_fields = dataset_section["search_fields"] - - # The fixed fields of ODC. 'id', 'label', etc. - for field_ in dataset_section: - if field_ == "search_fields": - continue - - offset = dataset_section[field_] - if offset is not None: - yield field_, [offset] - - # The configurable search fields. - for field_, spec in search_fields.items(): - offsets = [] - if "offset" in spec: - offsets.append(spec["offset"]) - offsets.extend(spec.get("min_offset", [])) - offsets.extend(spec.get("max_offset", [])) - - yield field_, offsets - - -def _readable_doc_extension(uri: str): - """ - >>> _readable_doc_extension('something.json.gz') - '.json.gz' - >>> _readable_doc_extension('something.yaml') - '.yaml' - >>> _readable_doc_extension('apple.odc-metadata.yaml.gz') - '.yaml.gz' - >>> _readable_doc_extension('products/tmad/tmad_product.yaml#part=1') - '.yaml' - >>> _readable_doc_extension('/tmp/human.06.tall.yml') - '.yml' - >>> # Not a doc, even though it's compressed. - >>> _readable_doc_extension('db_dump.gz') - >>> _readable_doc_extension('/tmp/nothing') - """ - path = urlparse(uri).path - compression_formats = (".gz",) - doc_formats = ( - ".yaml", - ".yml", - ".json", - ) - suffix = "".join( - s.lower() - for s in Path(path).suffixes - if s.lower() in doc_formats + compression_formats - ) - # If it's only compression, no doc format, it's not valid. - if suffix in compression_formats: - return None - return suffix or None - - -def read_paths( - input_paths: Iterable[str], -) -> Generator[Tuple[str, Union[Dict, str], bool], None, None]: - """ - Read the given input paths, returning a URL, document, and whether - it was explicitly given by the user. - - When a local directory is specified, inner readable docs are returned, but will - be marked as not explicitly specified. - """ - for input_ in input_paths: - for uri, was_specified in expand_paths_as_uris([input_]): - try: - for full_uri, doc in read_documents(uri, uri=True): - yield full_uri, doc, was_specified - except InvalidDocException as e: - if was_specified: - raise - else: - echo(e, err=True) - - -def expand_paths_as_uris( - input_paths: Iterable[str], -) -> Generator[Tuple[Path, bool], None, None]: - """ - For any paths that are directories, find inner documents that are known. - - Returns Tuples: path as a URL, and whether it was specified explicitly by user. 
- """ - for input_ in input_paths: - if is_url(input_): - yield input_, True - else: - path = Path(input_).resolve() - if path.is_dir(): - for found_path in path.rglob("*"): - if _readable_doc_extension(found_path.as_uri()) is not None: - yield found_path.as_uri(), False - else: - yield path.as_uri(), True - - -def validate_eo3_doc( - doc: Dict, - location: Union[str, Path], - products: Dict[str, Dict], - metadata_types: Dict[str, Dict], - thorough: bool = False, - expect: ValidationExpectations = None, -) -> List[ValidationMessage]: - messages = [] - - matched_product = None - - metadata_type = None - if metadata_types and matched_product: - metadata_type = matched_product["metadata_type"] - if metadata_type not in metadata_types: - messages.append( - ValidationMessage( - Level.error if thorough else Level.info, - "no_metadata_type", - f"Metadata type not provided {metadata_type}: not validating fields", - ) - ) - - messages.extend( - validate_dataset( - doc, - product_definitions=products, - readable_location=location, - thorough=thorough, - metadata_type_definition=metadata_types.get(metadata_type), - expect=expect, - ) - ) - return messages + yield from all_field_offsets(metadata_type).items() def _get_printable_differences(dict1: Dict, dict2: Dict): @@ -838,156 +192,31 @@ def _get_printable_differences(dict1: Dict, dict2: Dict): yield f"{path}: {v1!r} != {v2!r}" -def _get_product_mismatch_reasons(dataset_doc: Dict, product_definition: Dict): - """ - Which fields don't match the given dataset doc to a product definition? - - Gives human-readable lines of text. - """ - yield from _get_printable_differences(dataset_doc, product_definition["metadata"]) - - def _differences_as_hint(product_diffs): return indent("\n".join(product_diffs), prefix="\t") -def _validate_eo3_properties(dataset: Eo3DatasetDocBase, msg: ContextualMessager): - for name, value in dataset.properties.items(): - if name in dataset.properties.KNOWN_PROPERTIES: - normaliser = dataset.properties.KNOWN_PROPERTIES.get(name) - if normaliser and value is not None: - try: - normalised_value = normaliser(value) - # A normaliser can return two values, the latter adding extra extracted fields. - if isinstance(normalised_value, tuple): - normalised_value = normalised_value[0] - - # It's okay for datetimes to be strings - # .. since ODC's own loader does that. - if isinstance(normalised_value, datetime) and isinstance( - value, str - ): - value = ciso8601.parse_datetime(value) - - # Special case for dates, as "no timezone" and "utc timezone" are treated identical. - if isinstance(value, datetime): - value = default_utc(value) - - if not isinstance(value, type(normalised_value)): - yield msg.warning( - "property_type", - f"Value {value} expected to be " - f"{type(normalised_value).__name__!r} (got {type(value).__name__!r})", - ) - elif normalised_value != value: - if _is_nan(normalised_value) and _is_nan(value): - # Both are NaNs, ignore. - pass - else: - yield ValidationMessage.warning( - "property_formatting", - f"Property {value!r} expected to be {normalised_value!r}", - ) - except ValueError as e: - yield msg.error("invalid_property", f"{name!r}: {e.args[0]}") - # else: warning for unknown property? - if "odc:producer" in dataset.properties: - producer = dataset.properties["odc:producer"] - # We use domain name to avoid arguing about naming conventions ('ga' vs 'geoscience-australia' vs ...) - if "." not in producer: - yield msg.warning( - "producer_domain", - "Property 'odc:producer' should be the organisation's domain name. Eg. 
'ga.gov.au'", - ) - - # This field is a little odd, but is expected by the current version of ODC. - # (from discussion with Kirill) - if not dataset.properties.get("odc:file_format"): - yield msg.warning( - "global_file_format", - "Property 'odc:file_format' is empty", - hint="Usually 'GeoTIFF'", - ) - - -def _validate_geo( - dataset: Eo3DatasetDocBase, msg: ContextualMessager, expect_geometry: bool = True -): - # If we're not expecting geometry, and there's no geometry, then there's nothing to see here. - if not expect_geometry and ( - dataset.geometry is None and not dataset.grids and not dataset.crs - ): - yield msg.info("non_geo", "No geo information in dataset") - return - - # Geometry is recommended but not required - if dataset.geometry is None: - if expect_geometry: - yield msg.info( - "incomplete_geo", "Dataset has some geo fields but no geometry" - ) - elif not dataset.geometry.is_valid: - yield msg.error( - "invalid_geometry", - f"Geometry is not a valid shape: {explain_validity(dataset.geometry)!r}", - ) - return - - # CRS required - if not dataset.crs: - yield msg.error("incomplete_crs", "Dataset has some geo fields but no crs") - else: - # We only officially support epsg code (recommended) or wkt. - # TODO Anything supported by odc-geo - yield from _validate_crs(dataset.crs, msg) - - # Grids is validated by schema - but is required - if not dataset.grids: - yield msg.error("incomplete_grids", "Dataset has some geo fields but no grids") - else: - yield from _validate_grids(dataset.grids, dataset.crs, msg) - - return - - -def _validate_crs(crs, msg): - if crs.lower().startswith("epsg:"): - try: - CRS.from_string(crs) - except CRSError as e: - yield msg.error("invalid_crs_epsg", e.args[0]) - - if crs.lower() != crs: - yield msg.warning("mixed_crs_case", "Recommend lowercase 'epsg:' prefix") - else: - wkt_crs = None - try: - wkt_crs = CRS.from_wkt(crs) - except CRSError as e: - yield msg.error( - "invalid_crs", - f"Expect either an epsg code or a WKT string: {e.args[0]}", - ) - - if wkt_crs and wkt_crs.is_epsg_code: - yield msg.warning( - "non_epsg", - f"Prefer an EPSG code to a WKT when possible. (Can change CRS to 'epsg:{wkt_crs.to_epsg()}')", - ) - - -def _validate_grids(grids, default_crs, msg): - for grid_name, grid_def in grids.items(): - sub_msg = msg.sub_msg(grid=grid_name) - if not grid_def.crs: - grid_def.crs = default_crs - else: - yield from _validate_crs(grid_def.crs, sub_msg) +class InvalidDatasetError(Exception): + """ + Raised when a dataset is missing essential things (such as mandatory metadata) + or contains invalid values and so cannot be written. 
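+
+    Typically raised by handle_validation_messages() below when any of the
+    supplied validation messages has level 'error'; for example (illustrative,
+    assuming 'doc' is a dataset document dict):
+
+        handle_validation_messages(validate_ds_to_schema(doc))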
+ """ -def _has_some_geo(dataset: Eo3DatasetDocBase) -> bool: - return dataset.geometry is not None or dataset.grids or dataset.crs +class InvalidDatasetWarning(UserWarning): + """A non-critical warning for invalid or incomplete metadata""" -def _load_doc(url): - return list(load_documents(url)) +def handle_validation_messages(messages: ValidationMessages): + """Capture multiple errors or warning messages and raise them as one""" + warns = [] + errors = [] + for msg in messages: + if msg.level == Level.warning: + warns.append(str(msg)) + if msg.level == Level.error: + errors.append(str(msg)) + if warns: + warnings.warn(InvalidDatasetWarning("\n".join(warns))) + if errors: + raise InvalidDatasetError("\n".join(errors)) diff --git a/eo3/validation_msg.py b/eo3/validation_msg.py index 9c8278c0..3af32d88 100644 --- a/eo3/validation_msg.py +++ b/eo3/validation_msg.py @@ -22,12 +22,8 @@ class ValidationMessage: def __str__(self) -> str: hint = "" if self.hint: - hint = f" (Hint: {self.hint})" - if self.context: - context_str = ",".join(f"{k}: {v}" for k, v in self.context.items()) - return f"{self.code} in [{context_str}]: {self.reason}{hint}" - else: - return f"{self.code}: {self.reason}{hint}" + hint = f"(Hint: {self.hint})" + return f"{self.code}: {self.reason} {hint}" @classmethod def info( @@ -54,7 +50,7 @@ def error( class ContextualMessager: - def __init__(self, context: dict): + def __init__(self, context: dict = {}): self.context = context self.errors = 0 diff --git a/eo3/verify.py b/eo3/verify.py deleted file mode 100644 index ef0aee2c..00000000 --- a/eo3/verify.py +++ /dev/null @@ -1,238 +0,0 @@ -import binascii -import hashlib -import logging -import os -import typing -from distutils import spawn -from pathlib import Path -from urllib.parse import urlparse - -import boto3 - -_LOG = logging.getLogger(__name__) - - -def is_s3_uri(uri): - parsed_uri = urlparse(uri) - return parsed_uri.scheme == "s3" - - -def get_bucket_key(s3_key): - """ - Return bucket name and key from a s3 key - """ - o = urlparse(s3_key) - bucket = o.netloc - key = o.path - # Remove the leading slash from the prefix/key - return bucket, key[1:] - - -def find_exe(name: str): - """ - Find the location of the given executable. - - :return: the absolute path to the executable. - :rtype: str - """ - executable = spawn.find_executable(name) - if not executable: - raise Exception(f"No {name!r} command found.") - - return executable - - -def calculate_file_sha1(filename): - """ - :type filename: str or Path - :rtype: str - """ - return calculate_file_hash(filename, hash_fn=hashlib.sha1) - - -def calculate_file_hash(filename, hash_fn=hashlib.sha1, block_size=4096): - """ - Calculate the hash of the contents of a given file path. - :type filename: str or Path - :param block_size: Number of bytes to read at a time. (for performance: doesn't affect result) - :param hash_fn: hashlib function to use. (typically sha1 or md5) - :return: String of hex characters. 
- :rtype: str - """ - if is_s3_uri(str(filename)): - bucket, key = get_bucket_key(filename) - try: - region_name = os.environ["AWS_DEFAULT_REGION"] - except Exception as exp: - raise ValueError( - "Failed to find AWS_DEFAULT_REGION in the environment variables" - ) from exp - s3client = boto3.client("s3", region_name=region_name) - fileobj = s3client.get_object(Bucket=bucket, Key=key) - f = fileobj["Body"].read() - return calculate_hash(f, hash_fn, block_size) - else: - with Path(filename).open("rb") as f: - return calculate_hash(f, hash_fn, block_size) - - -def calculate_hash(f, hash_fn=hashlib.sha1, block_size=4096): - m = hash_fn() - - while True: - d = f.read(block_size) - if not d: - break - m.update(d) - - return binascii.hexlify(m.digest()).decode("ascii") - - -# 16K seems to be the sweet spot in performance on my machine. -def calculate_file_crc32(filename, block_size=1024 * 16): - """ - Calculate the crc32 of the contents of a given file path. - :type filename: str or Path - :param block_size: Number of bytes to read at a time. (for performance: doesn't affect result) - :return: String of hex characters. - :rtype: str - """ - m = 0 - with Path(filename).open("rb") as f: - while True: - d = f.read(block_size) - if not d: - break - m = binascii.crc32(d, m) - - return f"{m & 0xFFFFFFFF:08x}" - - -class PackageChecksum: - """ - Incrementally build a checksum file for a package. - - (By building incrementally we can better take advantage of filesystem caching) - """ - - def __init__(self): - self._file_hashes = {} - - def add_file(self, file_path): - """ - Add files to the checksum list (recursing into directories.) - :type file_path: Path - :rtype: None - """ - - if is_s3_uri(str(file_path)): - try: - region_name = os.environ["AWS_DEFAULT_REGION"] - except Exception as exp: - raise ValueError( - "Failed to find AWS_DEFAULT_REGION in the environment variables" - ) from exp - - s3client = boto3.client("s3", region_name) - bucket, key = get_bucket_key(file_path) - response_obj = s3client.list_objects_v2(Bucket=bucket, Prefix=key) - objs = [obj["Key"] for obj in response_obj["Contents"]] - if len(objs) > 1: - for file_path in objs: - hash_ = self._checksum("s3://{bucket}/{file_path}") - self._append_hash(file_path, hash_) - else: - hash_ = self._checksum(file_path) - self._append_hash(file_path, hash_) - return - - if file_path.is_dir(): - self.add_files(file_path.iterdir()) - else: - hash_ = self._checksum(file_path) - self._append_hash(file_path, hash_) - - def add(self, fd: typing.IO, name=None): - """ - Add a checksum, reading the data from an open file descriptor. - """ - name = name or fd.name - if not name: - raise ValueError("No usable name for checksummed file descriptor") - - _LOG.info("Checksumming %r", name) - hash_ = calculate_hash(fd) - _LOG.debug("%r -> %r", name, hash_) - self._append_hash(name, hash_) - - def _checksum(self, file_path): - _LOG.info("Checksumming %r", file_path) - hash_ = calculate_file_hash(file_path) - _LOG.debug("%r -> %r", file_path, hash_) - return hash_ - - def _append_hash(self, file_path, hash_): - self._file_hashes[Path(file_path).absolute()] = hash_ - - def add_files(self, file_paths): - for path in file_paths: - self.add_file(path) - - def write(self, output_file: typing.Union[Path, str]): - """ - Write checksums to the given file. 
- :type output_file: Path or str - """ - output_file = Path(output_file) - with output_file.open("wb") as f: - f.writelines( - ( - "{}\t{}\n".format( - str(hash_), str(filename.relative_to(output_file.parent)) - ).encode("utf-8") - for filename, hash_ in sorted(self._file_hashes.items()) - ) - ) - - def read(self, checksum_path): - """ - Read checksum values from the given checksum file - :type checksum_path: Path or str - """ - checksum_path = Path(checksum_path) - with checksum_path.open("r") as f: - for line in f.readlines(): - hash_, path = str(line).strip().split("\t") - self._append_hash( - checksum_path.parent.joinpath(*path.split("/")), hash_ - ) - - def items(self): - return self._file_hashes.items() - - def __len__(self): - return len(self._file_hashes) - - def iteratively_verify(self): - """ - Lazily yield each file and whether it matches the known checksum. - - :rtype: [(Path, bool)] - """ - for path, hash_ in self.items(): - calculated_hash = self._checksum(path) - yield path, calculated_hash == hash_ - - def __bool__(self): - return bool(self._file_hashes) - - def __eq__(self, other): - if isinstance(other, self.__class__): - # pylint 1.6.4 isn't smart enough to know that this is protected access of the same class - # pylint: disable=protected-access - return self._file_hashes == other._file_hashes - - return False - - def __hash__(self) -> int: - return hash(self._file_hashes) diff --git a/requirements/deployment.txt b/requirements/deployment.txt index 921a147d..cdbc1a18 100644 --- a/requirements/deployment.txt +++ b/requirements/deployment.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --extra=deployment --extra=docker --output-file=requirements/deployment.txt @@ -9,12 +9,13 @@ affine==2.3.1 # eo3 (setup.py) # odc-geo # rasterio -attrs==22.1.0 +attrs==23.1.0 # via # cattrs # eo3 (setup.py) # jsonschema # rasterio + # referencing boltons==21.0.0 # via eo3 (setup.py) boto3==1.24.94 @@ -46,9 +47,7 @@ cligj==0.7.2 # via rasterio defusedxml==0.7.1 # via eo3 (setup.py) -exceptiongroup==1.0.0rc9 - # via cattrs -gdal==3.3.2 +gdal==3.6.3 # via eo3 (setup.py) h5py==3.7.0 # via eo3 (setup.py) @@ -56,8 +55,10 @@ jmespath==1.0.1 # via # boto3 # botocore -jsonschema==4.16.0 +jsonschema==4.18.0 # via eo3 (setup.py) +jsonschema-specifications==2023.7.1 + # via jsonschema numpy==1.23.4 # via # eo3 (setup.py) @@ -82,8 +83,6 @@ pyproj==3.4.0 # via # eo3 (setup.py) # odc-geo -pyrsistent==0.18.1 - # via jsonschema pystac==1.7.3 # via eo3 (setup.py) python-dateutil==2.8.2 @@ -97,10 +96,17 @@ pytz==2022.5 # via pandas rasterio==1.3.3 # via eo3 (setup.py) +referencing==0.30.2 + # via + # eo3 (setup.py) + # jsonschema + # jsonschema-specifications +rpds-py==0.10.2 + # via + # jsonschema + # referencing ruamel-yaml==0.17.21 # via eo3 (setup.py) -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml s3transfer==0.6.0 # via boto3 scipy==1.9.3 diff --git a/requirements/setup.txt b/requirements/setup.txt index 3e02c3fc..99d417d8 100644 --- a/requirements/setup.txt +++ b/requirements/setup.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --output-file=requirements/setup.txt --strip-extras @@ -9,12 +9,13 @@ affine==2.4.0 # eo3 (setup.py) # odc-geo # rasterio -attrs==22.1.0 +attrs==23.1.0 # via # cattrs # eo3 (setup.py) # jsonschema # 
rasterio + # referencing boltons==23.0.0 # via eo3 (setup.py) boto3==1.26.129 @@ -46,16 +47,16 @@ cligj==0.7.2 # via rasterio defusedxml==0.7.1 # via eo3 (setup.py) -exceptiongroup==1.1.1 - # via cattrs h5py==3.8.0 # via eo3 (setup.py) jmespath==1.0.1 # via # boto3 # botocore -jsonschema==4.17.3 +jsonschema==4.18.0 # via eo3 (setup.py) +jsonschema-specifications==2023.7.1 + # via jsonschema numpy==1.24.3 # via # eo3 (setup.py) @@ -81,8 +82,6 @@ pyproj==3.4.0 # via # eo3 (setup.py) # odc-geo -pyrsistent==0.19.3 - # via jsonschema pystac==1.7.3 # via eo3 (setup.py) python-dateutil==2.8.2 @@ -96,6 +95,15 @@ pytz==2023.3 # via pandas rasterio==1.3.6 # via eo3 (setup.py) +referencing==0.30.2 + # via + # eo3 (setup.py) + # jsonschema + # jsonschema-specifications +rpds-py==0.10.2 + # via + # jsonschema + # referencing ruamel-yaml==0.17.24 # via eo3 (setup.py) ruamel-yaml-clib==0.2.7 diff --git a/requirements/test.txt b/requirements/test.txt index 0f1dec58..fe9fa0f8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --extra=docker --extra=test --output-file=requirements/test.txt @@ -11,7 +11,7 @@ affine==2.3.1 # rasterio alabaster==0.7.12 # via sphinx -attrs==22.1.0 +attrs==23.1.0 # via # cattrs # eo3 (setup.py) @@ -19,16 +19,20 @@ attrs==22.1.0 # morecantile # pytest # rasterio + # referencing babel==2.10.3 # via sphinx boltons==21.0.0 # via eo3 (setup.py) boto3==1.24.94 - # via eo3 (setup.py) + # via + # eo3 (setup.py) + # moto botocore==1.27.94 # via # boto3 # eo3 (setup.py) + # moto # s3transfer cachetools==5.2.0 # via odc-geo @@ -39,6 +43,8 @@ certifi==2022.12.7 # pyproj # rasterio # requests +cffi==1.15.1 + # via cryptography cftime==1.6.2 # via netcdf4 charset-normalizer==2.1.1 @@ -58,6 +64,10 @@ click-plugins==1.1.1 # via rasterio cligj==0.7.2 # via rasterio +coverage[toml]==7.3.1 + # via pytest-cov +cryptography==41.0.3 + # via moto deepdiff==6.2.1 # via eo3 (setup.py) defusedxml==0.7.1 @@ -66,8 +76,6 @@ docutils==0.17.1 # via # sphinx # sphinx-rtd-theme -exceptiongroup==1.0.0rc9 - # via cattrs flake8==5.0.4 # via pep8-naming gdal==3.6.3 @@ -83,21 +91,29 @@ imagesize==1.4.1 iniconfig==1.1.1 # via pytest jinja2==3.1.2 - # via sphinx + # via + # moto + # sphinx jmespath==1.0.1 # via # boto3 # botocore -jsonschema==4.16.0 +jsonschema==4.18.0 # via eo3 (setup.py) +jsonschema-specifications==2023.7.1 + # via jsonschema markupsafe==2.1.1 - # via jinja2 + # via + # jinja2 + # werkzeug mccabe==0.7.0 # via flake8 mock==4.0.3 # via eo3 (setup.py) morecantile==3.1.2 # via rio-cogeo +moto==4.2.2 + # via eo3 (setup.py) netcdf4==1.6.1 # via eo3 (setup.py) networkx==2.8.7 @@ -143,6 +159,8 @@ py==1.11.0 # via pytest pycodestyle==2.9.1 # via flake8 +pycparser==2.21 + # via cffi pydantic==1.10.2 # via # morecantile @@ -160,15 +178,20 @@ pyproj==3.4.0 # eo3 (setup.py) # morecantile # odc-geo -pyrsistent==0.18.1 - # via jsonschema pystac==1.7.3 # via eo3 (setup.py) pytest==7.1.3 + # via + # eo3 (setup.py) + # pytest-cov +pytest-cov==4.1.0 + # via eo3 (setup.py) +pytest-httpserver==1.0.8 # via eo3 (setup.py) python-dateutil==2.8.2 # via # botocore + # moto # pandas # pystac python-rapidjson==1.9 @@ -179,18 +202,32 @@ pytz==2022.5 # pandas pywavelets==1.4.1 # via scikit-image +pyyaml==6.0.1 + # via responses rasterio==1.3.3 # via # eo3 (setup.py) # rio-cogeo -requests==2.28.1 - # via sphinx +referencing==0.30.2 + # 
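Alongside the lock-file churn, the test requirements now pull in moto (and pytest-httpserver), so S3-touching code paths can be exercised without real AWS credentials. A hedged sketch of the usual moto 4.x pattern; the test name, bucket and key are placeholders rather than fixtures from this repo:

    import boto3
    from moto import mock_s3

    @mock_s3
    def test_reads_object_from_fake_s3():
        # moto intercepts boto3's S3 calls in-process; no credentials or network access needed.
        s3 = boto3.client("s3", region_name="us-east-1")
        s3.create_bucket(Bucket="example-bucket")
        s3.put_object(Bucket="example-bucket", Key="scene/odc-metadata.yaml", Body=b"id: example")
        body = s3.get_object(Bucket="example-bucket", Key="scene/odc-metadata.yaml")["Body"].read()
        assert body == b"id: example"
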
via + # eo3 (setup.py) + # jsonschema + # jsonschema-specifications +requests==2.31.0 + # via + # moto + # responses + # sphinx +responses==0.23.3 + # via moto rio-cogeo==3.4.1 # via eo3 (setup.py) +rpds-py==0.10.2 + # via + # jsonschema + # referencing ruamel-yaml==0.17.21 # via eo3 (setup.py) -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml s3transfer==0.6.0 # via boto3 scikit-image==0.19.3 @@ -237,14 +274,23 @@ tomli==2.0.1 # via pytest toolz==0.12.0 # via eo3 (setup.py) +types-pyyaml==6.0.12.11 + # via responses typing-extensions==4.4.0 # via pydantic urllib3==1.26.12 # via # botocore # requests + # responses +werkzeug==2.3.7 + # via + # moto + # pytest-httpserver xarray==2022.10.0 # via eo3 (setup.py) +xmltodict==0.13.0 + # via moto # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/setup.py b/setup.py index f8e9ee37..6375a879 100755 --- a/setup.py +++ b/setup.py @@ -19,9 +19,11 @@ "pep8-naming", "pytest", "pytest-cov", + "pytest-httpserver", "rio_cogeo", "sphinx-autodoc-typehints", "sphinx_rtd_theme", + "moto", ] EXTRAS_REQUIRE = { @@ -78,7 +80,8 @@ "click", "defusedxml", "h5py", - "jsonschema>=3", # We want a Draft6Validator + "jsonschema==4.18.0", # We want a Draft7Validator, but 4.18.0 is the only version that works + "referencing", "numpy>=1.15.4", "odc-geo", "pyproj", diff --git a/test_file.txt b/test_file.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/common.py b/tests/common.py index 1897666a..52088e70 100644 --- a/tests/common.py +++ b/tests/common.py @@ -1,127 +1,14 @@ import operator -from pathlib import Path -from textwrap import indent -from typing import Dict, Iterable, Mapping, Sequence, Union +from typing import Dict, Iterable, Mapping, Sequence -import pytest import rapidjson from click.testing import CliRunner, Result from deepdiff import DeepDiff from deepdiff.model import DiffLevel -from ruamel import yaml -from shapely.geometry import shape -from shapely.geometry.base import BaseGeometry -from eo3 import Eo3DatasetDocBase, serialise from eo3.validation_msg import Level, ValidationMessage, ValidationMessages -def check_prepare_outputs( - invoke_script, - run_args, - expected_doc: Dict, - expected_metadata_path: Path, - ignore_fields=(), -): - """Call a prepare script and check for an expected output document.""" - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - res = run_prepare_cli(invoke_script, *run_args) - - try: - assert_expected_eo3_path(expected_doc, expected_metadata_path, ignore_fields) - except AssertionError: - print(f'Output:\n{indent(res.output, " ")}') - raise - - -def assert_expected_eo3_path( - expected_doc: Dict, - expected_path: Path, - ignore_fields=(), -): - """ - Check an output path of an EO3 dataset matches an expected document. - - This is slightly smarter about doing geometry equality etc within the document. - """ - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - assert ( - expected_path.exists() - ), f"Expected output EO3 path doesn't exist: {expected_path}" - assert_same_as_file( - expected_doc, - expected_path, - # We check the geometry below - ignore_fields=("geometry",) + tuple(ignore_fields), - ) - - if "geometry" not in ignore_fields: - # Compare geometry after parsing, rather than comparing the raw dict values. 
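The setup.py change above pins jsonschema==4.18.0 and adds referencing: 4.18 is the release in which jsonschema replaced its pyrsistent-based internals and RefResolver with the referencing/rpds-py stack, which is why pyrsistent drops out of every lock file in this diff. A minimal Draft-7 validation sketch against that version; the schema and document below are toy placeholders, not the real eo3 dataset schema:

    from jsonschema import Draft7Validator

    schema = {
        "type": "object",
        "required": ["$schema", "id"],
        "properties": {"id": {"type": "string"}},
    }
    Draft7Validator.check_schema(schema)  # raises SchemaError if the schema itself is malformed

    doc = {"$schema": "https://schemas.opendatacube.org/dataset", "id": "example-id"}
    for error in Draft7Validator(schema).iter_errors(doc):
        print(f"{list(error.path)}: {error.message}")
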
- produced_dataset = serialise.from_path(expected_path) - expected_dataset = serialise.from_doc(expected_doc, skip_validation=True) - if expected_dataset.geometry is None: - assert produced_dataset.geometry is None, ( - f"Expected a null geometry, " - f"but output included one: {produced_dataset.geometry.__geo_interface__!r}" - ) - else: - assert_shapes_mostly_equal( - produced_dataset.geometry, - expected_dataset.geometry, - # Typically meters -- this is easily good enough accuracy. - 0.0001, - ) - - -def assert_expected_eo3( - expected_doc: Eo3DatasetDocBase, - given_doc: Eo3DatasetDocBase, - *, - ignore_fields=(), -): - """ - Do the two DatasetDocs match? - - (Unlike equality, gives reasonable error message of differences, and - compares geometry more intelligently.) - """ - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - if expected_doc.geometry is None: - assert given_doc.geometry is None, "Expected no geometry" - else: - assert_shapes_mostly_equal( - given_doc.geometry, expected_doc.geometry, 0.00000001 - ) - e = serialise.to_doc(expected_doc) - g = serialise.to_doc(given_doc) - for f in ("geometry",) + ignore_fields: - e.pop(f) - g.pop(f) - assert_same(g, e) - - -def assert_shapes_mostly_equal( - shape1: Union[BaseGeometry, dict], - shape2: Union[BaseGeometry, dict], - threshold: float, -): - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - - if isinstance(shape1, dict): - shape1 = shape(shape1) - if isinstance(shape2, dict): - shape2 = shape(shape2) - - # Check area first, as it's a nicer error message when they're wildly different. - assert shape1.area == pytest.approx( - shape2.area, abs=threshold - ), f"Shapes have different areas: {shape1.area} != {shape2.area}" - - s1 = shape1.simplify(tolerance=threshold) - s2 = shape2.simplify(tolerance=threshold) - assert (s1 - s2).area < threshold, f"{s1} is not mostly equal to {s2}" - - def assert_same(expected_doc: Dict, generated_doc: Dict): """ Assert two documents are the same, ignoring trivial float differences @@ -131,27 +18,6 @@ def assert_same(expected_doc: Dict, generated_doc: Dict): assert doc_diffs == {}, "\n".join(format_doc_diffs(expected_doc, generated_doc)) -def assert_same_as_file(expected_doc: Dict, generated_file: Path, ignore_fields=()): - """Assert a file contains the given document content (after normalisation etc)""" - __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError) - - assert generated_file.exists(), f"Expected file to exist {generated_file.name}" - - with generated_file.open("r") as f: - generated_doc = yaml.YAML(typ="safe").load(f) - - expected_doc = dict(expected_doc) - for field in ignore_fields: - if field in generated_doc: - del generated_doc[field] - if field in expected_doc: - del expected_doc[field] - - expected_doc = dump_roundtrip(expected_doc) - generated_doc = dump_roundtrip(generated_doc) - assert_same(generated_doc, expected_doc) - - def run_prepare_cli(invoke_script, *args, expect_success=True) -> Result: """Run the prepare script as a command-line command""" __tracebackhide__ = True diff --git a/tests/data/multi_doc.nc b/tests/data/multi_doc.nc new file mode 100644 index 00000000..ea38eb32 Binary files /dev/null and b/tests/data/multi_doc.nc differ diff --git a/tests/data/multi_doc.yml b/tests/data/multi_doc.yml new file mode 100644 index 00000000..53cd5bb9 --- /dev/null +++ b/tests/data/multi_doc.yml @@ -0,0 +1,18 @@ +description: Document 1 of 3 +data: + number: 1 + list: [1,2,3] + +--- + +description: Document 2 
of 3 +data: + number: 2 + list: [1,2,3] + +--- + +description: Document 3 of 3 +data: + number: 3 + list: [1,2,3] diff --git a/tests/data/multi_doc.yml.gz b/tests/data/multi_doc.yml.gz new file mode 100644 index 00000000..f28be8be Binary files /dev/null and b/tests/data/multi_doc.yml.gz differ diff --git a/tests/data/sample.json b/tests/data/sample.json new file mode 100644 index 00000000..afdfa7ff --- /dev/null +++ b/tests/data/sample.json @@ -0,0 +1,11 @@ +{ + "description": "File containing json document", + "data": { + "number": 1, + "list": [ + 1, + 2, + 3 + ] + } +} diff --git a/tests/data/single_doc.yaml b/tests/data/single_doc.yaml new file mode 100644 index 00000000..129776d4 --- /dev/null +++ b/tests/data/single_doc.yaml @@ -0,0 +1,4 @@ +description: File containing single yaml document +data: + number: 1 + list: [1,2,3] diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 73b0c3b3..7ce956a9 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -2,12 +2,12 @@ import shutil from datetime import datetime from pathlib import Path -from typing import Callable, Dict +from typing import Dict import pytest from eo3 import serialise -from eo3.model import Eo3DatasetDocBase +from eo3.model import DatasetMetadata # from eo3.prepare.landsat_l1_prepare import normalise_nci_symlinks @@ -50,18 +50,6 @@ WOFS_PATH: Path = Path(__file__).parent / "data" / "wofs" -# def path_offset(base: Path, offset: str): -# return str(normalise_nci_symlinks(base.absolute().joinpath(offset))) - - -# def tar_offset(tar: Path, offset: str): -# return "tar:" + str(normalise_nci_symlinks(tar.absolute())) + "!" + offset - - -def relative_offset(base, offset): - return offset - - @pytest.fixture def sentinel1_eo3() -> Path: with open(S1_EO3_PATH) as f: @@ -79,22 +67,12 @@ def l1_c2_ls8_folder(tmp_path: Path) -> Path: @pytest.fixture -def l1_ls8_metadata_path( - l1_ls8_folder: Path, l1_ls8_dataset: Eo3DatasetDocBase -) -> Path: +def l1_ls8_metadata_path(l1_ls8_folder: Path, l1_ls8_dataset: DatasetMetadata) -> Path: path = l1_ls8_folder / f"{l1_ls8_dataset.label}.odc-metadata.yaml" serialise.to_path(path, l1_ls8_dataset) return path -@pytest.fixture -def l1_ls8_dataset_path(l1_ls8_folder: Path, l1_ls8_metadata_path: Path) -> Path: - """ - A prepared L1 dataset with an EO3 metadata file. 
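The new multi_doc.yml fixture above (together with its .gz and .nc siblings) exercises reading several YAML documents from a single file. A minimal sketch of doing that with ruamel.yaml, which is already a dependency; load_all_documents is an illustrative helper and not necessarily how eo3's own reader is implemented:

    from ruamel.yaml import YAML

    def load_all_documents(path):
        # Yield every document in a (possibly multi-document) YAML file.
        yaml = YAML(typ="safe")
        with open(path) as f:
            yield from yaml.load_all(f)

    docs = list(load_all_documents("tests/data/multi_doc.yml"))
    assert [d["data"]["number"] for d in docs] == [1, 2, 3]
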
- """ - return l1_ls8_folder - - @pytest.fixture def l1_ls7_tarball(tmp_path: Path) -> Path: return _make_copy(L71GT_TARBALL_PATH, tmp_path) @@ -120,30 +98,13 @@ def _make_copy(input_path, tmp_path): @pytest.fixture -def l1_ls8_dataset(l1_ls8_folder_md_expected: Dict) -> Eo3DatasetDocBase: - return serialise.from_doc(l1_ls8_folder_md_expected) +def l1_ls8_dataset(l1_ls8_folder_md_expected: Dict) -> DatasetMetadata: + return DatasetMetadata(l1_ls8_folder_md_expected) @pytest.fixture def l1_ls8_folder_md_expected(l1_ls8_folder) -> Dict: - return expected_l1_ls8_folder(l1_ls8_folder, relative_offset) - - -@pytest.fixture -def l1_ls8_ga_expected(l1_ls8_folder) -> Dict: - return expected_l1_ls8_folder( - l1_ls8_folder, - relative_offset, - organisation="ga.gov.au", - collection="3", - # the id in the ls8_telemetry_path fixture - lineage={"satellite_telemetry_data": ["30841328-89c2-4693-8802-a3560a6cf67a"]}, - ) - - -# @pytest.fixture -# def l1_ls8_folder_md_expected_absolute(l1_ls8_folder) -> Dict: -# return expected_l1_ls8_folder(l1_ls8_folder, path_offset) + return expected_l1_ls8_folder(l1_ls8_folder) @pytest.fixture @@ -174,7 +135,6 @@ def example_metadata( def expected_l1_ls8_folder( l1_ls8_folder: Path, - offset: Callable[[Path, str], str] = relative_offset, organisation="usgs.gov", collection="1", l1_collection="1", @@ -187,26 +147,15 @@ def expected_l1_ls8_folder( """ org_code = organisation.split(".")[0] product_name = f"{org_code}_ls8c_level1_{collection}" - if collection == "2": - processing_datetime = datetime(2020, 9, 7, 19, 30, 5) - cloud_cover = 93.28 - points_model = 125 - points_version = 5 - rmse_model_x = 4.525 - rmse_model_y = 5.917 - software_version = "LPGS_15.3.1c" - uuid = "d9221c40-24c3-5356-ab22-4dcac2bf2d70" - quality_tag = "QA_PIXEL" - else: - processing_datetime = datetime(2017, 4, 5, 11, 17, 36) - cloud_cover = 93.22 - points_model = 66 - points_version = 4 - rmse_model_x = 4.593 - rmse_model_y = 5.817 - software_version = "LPGS_2.7.0" - uuid = "a780754e-a884-58a7-9ac0-df518a67f59d" - quality_tag = "BQA" + processing_datetime = datetime(2017, 4, 5, 11, 17, 36) + cloud_cover = 93.22 + points_model = 66 + points_version = 4 + rmse_model_x = 4.593 + rmse_model_y = 5.817 + software_version = "LPGS_2.7.0" + uuid = "a780754e-a884-58a7-9ac0-df518a67f59d" + quality_tag = "BQA" processing_date = processing_datetime.strftime("%Y%m%d") return { "$schema": "https://schemas.opendatacube.org/dataset", @@ -218,6 +167,7 @@ def expected_l1_ls8_folder( }, "properties": { "datetime": datetime(2016, 1, 21, 23, 50, 23, 54435), + "dea:dataset_maturity": "final", # The minor version comes from the processing date (as used in filenames to distinguish reprocesses). 
"odc:dataset_version": f"{collection}.0.{processing_date}", "odc:file_format": "GeoTIFF", @@ -300,77 +250,41 @@ def expected_l1_ls8_folder( }, "measurements": { "coastal_aerosol": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B1.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B1.TIF" }, "blue": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B2.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B2.TIF" }, "green": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B3.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B3.TIF" }, "red": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B4.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B4.TIF" }, "nir": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B5.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B5.TIF" }, "swir_1": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B6.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B6.TIF" }, "swir_2": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B7.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B7.TIF" }, "panchromatic": { "grid": "panchromatic", - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B8.TIF", - ), + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B8.TIF", }, "cirrus": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B9.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B9.TIF" }, "lwir_1": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B10.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B10.TIF" }, "lwir_2": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B11.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_B11.TIF" }, "quality": { - "path": offset( - l1_ls8_folder, - f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_{quality_tag}.TIF", - ) + "path": f"LC08_L1TP_090084_20160121_{processing_date}_0{l1_collection}_T1_{quality_tag}.TIF" }, }, "accessories": { @@ -384,7 +298,7 @@ def expected_l1_ls8_folder( @pytest.fixture def l1_ls7_tarball_md_expected( - l1_ls7_tarball, offset: Callable[[Path, str], str] = relative_offset + l1_ls7_tarball, ) -> Dict: return { "$schema": "https://schemas.opendatacube.org/dataset", @@ -490,47 +404,17 @@ def l1_ls7_tarball_md_expected( }, }, "measurements": { - "blue": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B1.TIF" - ) - }, - "green": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B2.TIF" - ) - }, - "nir": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B4.TIF" - ) - }, - "quality": { - "path": offset( - l1_ls7_tarball, 
"LE07_L1TP_104078_20130429_20161124_01_T1_BQA.TIF" - ) - }, - "red": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B3.TIF" - ) - }, - "swir_1": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B5.TIF" - ) - }, - "swir_2": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B7.TIF" - ) - }, + "blue": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B1.TIF"}, + "green": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B2.TIF"}, + "nir": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B4.TIF"}, + "quality": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_BQA.TIF"}, + "red": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B3.TIF"}, + "swir_1": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B5.TIF"}, + "swir_2": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B7.TIF"}, "tir_1": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B6_VCID_1.TIF"}, "tir_2": {"path": "LE07_L1TP_104078_20130429_20161124_01_T1_B6_VCID_2.TIF"}, "panchromatic": { - "path": offset( - l1_ls7_tarball, "LE07_L1TP_104078_20130429_20161124_01_T1_B8.TIF" - ), + "path": "LE07_L1TP_104078_20130429_20161124_01_T1_B8.TIF", "grid": "panchromatic", }, }, @@ -545,7 +429,7 @@ def l1_ls7_tarball_md_expected( @pytest.fixture def l1_ls5_tarball_md_expected( - l1_ls5_tarball, offset: Callable[[Path, str], str] = relative_offset + l1_ls5_tarball, # offset: Callable[[Path, str], str] = relative_offset ) -> Dict: return { "$schema": "https://schemas.opendatacube.org/dataset", @@ -623,46 +507,14 @@ def l1_ls5_tarball_md_expected( } }, "measurements": { - "blue": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B1.TIF" - ) - }, - "green": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B2.TIF" - ) - }, - "red": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B3.TIF" - ) - }, - "nir": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B4.TIF" - ) - }, - "swir_1": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B5.TIF" - ) - }, - "swir_2": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B7.TIF" - ) - }, - "tir": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_B6.TIF" - ) - }, - "quality": { - "path": offset( - l1_ls5_tarball, "LT05_L1TP_090085_19970406_20161231_01_T1_BQA.TIF" - ) - }, + "blue": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B1.TIF"}, + "green": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B2.TIF"}, + "red": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B3.TIF"}, + "nir": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B4.TIF"}, + "swir_1": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B5.TIF"}, + "swir_2": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B7.TIF"}, + "tir": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_B6.TIF"}, + "quality": {"path": "LT05_L1TP_090085_19970406_20161231_01_T1_BQA.TIF"}, }, "accessories": { "metadata:landsat_mtl": { @@ -708,7 +560,27 @@ def metadata_type(): ["properties", "dtr:end_datetime"], ["properties", "datetime"], ], - } + }, + "lat": { + "description": "Latitude range", + "type": "double-range", + "min_offset": [ + ["extent", "lat", "begin"], + ], + "max_offset": [ + ["extent", "lat", "end"], + ], + }, + "lon": { + "description": "Longitude range", + "type": "double-range", + "min_offset": [ + ["extent", "lon", "begin"], + ], + 
"max_offset": [ + ["extent", "lon", "end"], + ], + }, }, }, } @@ -752,7 +624,6 @@ def eo3_product(): "metadata_type": "eo3_landsat_l1", "license": "CC-BY-4.0", "metadata": { - # "product": {"name": "usgs_ls8c_level1_1"}, DEPRECATED "properties": { "eo:platform": "landsat-8", "eo:instrument": "OLI_TIRS", diff --git a/tests/integration/test_image.py b/tests/integration/test_image.py deleted file mode 100644 index 9a15d329..00000000 --- a/tests/integration/test_image.py +++ /dev/null @@ -1,160 +0,0 @@ -import numpy as np - -from eo3 import images - - -def test_rescale_intensity(): - # Example was generated via: - # scipy.ndimage.rotate(np.arange(1000, 8000, 100).reshape((7,10)), 45, cval=-99) - - # (Using a variable so the array is more spaced-out & readable) - nada = -999 - original_image = np.array( - [ - [nada, nada, nada, nada, nada, nada, nada, nada, nada, nada, nada, nada], - [nada, nada, nada, nada, nada, nada, 1852, 2730, nada, nada, nada, nada], - [nada, nada, nada, nada, nada, 1711, 2570, 3428, 4169, nada, nada, nada], - [nada, nada, nada, nada, 1568, 2432, 3287, 4009, 4805, 5610, nada, nada], - [nada, nada, nada, 1427, 2291, 3144, 3871, 4663, 5451, 6181, 7049, nada], - [nada, nada, 1284, 2149, 3003, 3729, 4521, 5312, 6040, 6889, 7757, nada], - [nada, 1143, 2011, 2860, 3588, 4379, 5171, 5897, 6751, 7616, nada, nada], - [nada, 1851, 2719, 3449, 4237, 5029, 5756, 6609, 7473, nada, nada, nada], - [nada, nada, 3290, 4095, 4891, 5613, 6468, 7332, nada, nada, nada, nada], - [nada, nada, nada, 4731, 5472, 6330, 7189, nada, nada, nada, nada, nada], - [nada, nada, nada, nada, 6170, 7048, nada, nada, nada, nada, nada, nada], - [nada, nada, nada, nada, nada, nada, nada, nada, nada, nada, nada, nada], - ] - ) - unmodified = original_image.copy() - - assert np.array_equal( - original_image, unmodified - ), "rescale_intensity modified the input image" - - staticly_rescaled = images.rescale_intensity( - original_image, in_range=(4000, 6000), out_range=(100, 255), image_nodata=-999 - ) - print("Statically rescaled result: ") - print(repr(staticly_rescaled)) - - # - Note that the nodata values are not scaled (a previous bug!) - # they're translated to the output nodata value (0). - # - Note how many will be clipped to the min (100) without falling into nodata. - non = 0 - expected_static_rescale = np.array( - [ - [non, non, non, non, non, non, non, non, non, non, non, non], - [non, non, non, non, non, non, 100, 100, non, non, non, non], - [non, non, non, non, non, 100, 100, 100, 113, non, non, non], - [non, non, non, non, 100, 100, 100, 100, 162, 224, non, non], - [non, non, non, 100, 100, 100, 100, 151, 212, 255, 255, non], - [non, non, 100, 100, 100, 100, 140, 201, 255, 255, 255, non], - [non, 100, 100, 100, 100, 129, 190, 247, 255, 255, non, non], - [non, 100, 100, 100, 118, 179, 236, 255, 255, non, non, non], - [non, non, 100, 107, 169, 225, 255, 255, non, non, non, non], - [non, non, non, 156, 214, 255, 255, non, non, non, non, non], - [non, non, non, non, 255, 255, non, non, non, non, non, non], - [non, non, non, non, non, non, non, non, non, non, non, non], - ], - dtype=np.uint8, - ) - assert np.array_equal(staticly_rescaled, expected_static_rescale) - - -def test_calc_range(): - # Test that the correct value range and valid data arrays are calculated. 
- - # Test arrays generated via: - # >>> scipy.ndimage.rotate(np.arange(10, 70, 1).reshape((6, 10)), 55, cval=-11) - # >>> scipy.ndimage.rotate(np.arange(20, 80, 1).reshape((6, 10)), 50, cval=-11) - # >>> scipy.ndimage.rotate(np.arange(30, 90, 1).reshape((6, 10)), 55, cval=-11) - - # They have: - # - slightly different values to test the highest/lowest value range calculation - # (it should be across all bands) - # - And slightly different rotation to test the combined valid_data mask. - - no = -11 - r_array = np.array( - [ - [no, no, no, no, no, no, no, no, no, no, no], - [no, no, no, no, no, no, 25, no, no, no, no], - [no, no, no, no, no, 21, 31, 40, no, no, no], - [no, no, no, no, 17, 27, 36, 45, 53, 64, no], - [no, no, no, 15, 23, 32, 41, 49, 59, 68, no], - [no, no, no, 18, 29, 37, 46, 54, 65, no, no], - [no, no, 14, 25, 33, 42, 50, 61, no, no, no], - [no, 11, 20, 30, 38, 47, 56, 64, no, no, no], - [no, 15, 26, 34, 43, 52, 62, no, no, no, no], - [no, no, no, 39, 48, 58, no, no, no, no, no], - [no, no, no, no, 54, no, no, no, no, no, no], - [no, no, no, no, no, no, no, no, no, no, no], - ] - ) - g_array = np.array( - [ - [no, no, no, no, no, no, no, no, no, no, no], - [no, no, no, no, no, no, 31, no, no, no, no], - [no, no, no, no, no, 28, 38, 47, no, no, no], - [no, no, no, no, 26, 35, 44, 52, 60, 68, no], - [no, no, no, 24, 32, 41, 49, 58, 66, 76, no], - [no, no, no, 29, 39, 47, 55, 63, 73, no, no], - [no, no, 26, 36, 44, 52, 60, 70, no, no, no], - [no, 23, 33, 41, 50, 58, 67, 75, no, no, no], - [no, 31, 39, 47, 55, 64, 73, no, no, no, no], - [no, no, no, 52, 61, 71, no, no, no, no, no], - [no, no, no, no, 68, no, no, no, no, no, no], - [no, no, no, no, no, no, no, no, no, no, no], - ] - ) - b_array = np.array( - [ - [no, no, no, no, no, no, no, no, no, no, no], - [no, no, no, no, no, no, 45, no, no, no, no], - [no, no, no, no, no, 41, 51, 60, no, no, no], - [no, no, no, no, 37, 47, 56, 65, 73, 84, no], - [no, no, no, 35, 43, 52, 61, 69, 79, 88, no], - [no, no, no, 38, 49, 57, 66, 74, 85, no, no], - [no, no, 34, 45, 53, 62, 70, 81, no, no, no], - [no, 31, 40, 50, 58, 67, 76, 84, no, no, no], - [no, 35, 46, 54, 63, 72, 82, no, no, no, no], - [no, no, no, 59, 68, 78, no, no, no, no, no], - [no, no, no, no, 74, no, no, no, no, no, no], - [no, no, no, no, no, no, no, no, no, no, no], - ] - ) - - mask = np.ones(r_array.shape, dtype=np.bool_) - calculated_range = images.read_valid_mask_and_value_range( - mask, - ((r_array, no), (g_array, no), (b_array, no)), - calculate_percentiles=(2, 98), - ) - - expected_combined_mask = np.array( - [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0], - [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0], - [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0], - [0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0], - [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0], - [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], - [0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - ], - dtype=bool, - ) - - assert np.array_equal(expected_combined_mask, mask), ( - f"Combined mask isn't as expected. 
" - f"Diff: {repr(np.logical_xor(expected_combined_mask, mask))}" - ) - - assert calculated_range == ( - 34, - 65, - ), f"Unexpected 2/98 percentile values: {calculated_range}" diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py new file mode 100644 index 00000000..795febc8 --- /dev/null +++ b/tests/integration/test_model.py @@ -0,0 +1,137 @@ +from datetime import datetime +from textwrap import dedent +from typing import Dict + +import pytest +import toolz + +from eo3.fields import Range +from eo3.model import DatasetMetadata +from eo3.utils import default_utc +from eo3.validate import InvalidDatasetError + + +def test_get_and_set(l1_ls8_folder_md_expected: Dict, metadata_type): + """Test that we are able to access and set fields correctly""" + ds = DatasetMetadata( + raw_dict=l1_ls8_folder_md_expected, mdt_definition=metadata_type + ) + # get + with pytest.raises(AttributeError, match="Unknown field 'foobar'"): + ds.foobar + assert ds.id == "a780754e-a884-58a7-9ac0-df518a67f59d" + assert ds.format == "GeoTIFF" + # set + with pytest.raises(AttributeError, match="Unknown field offset"): + ds.foo = "bar" + ds.format = "GeoTIFFF" + assert ds.format == "GeoTIFFF" + # set range + with pytest.raises(TypeError, match="expects a Range value"): + ds.lat = 0.0 + # time can be a range or a single value + dt = datetime(2020, 1, 1, 23, 59, 59) + ds.time = dt + assert ds.time == Range(default_utc(dt), default_utc(dt)) + dt_end = datetime(2020, 1, 2, 23, 59, 59) + ds.time = Range(dt, dt_end) + assert ds.time == Range(default_utc(dt), default_utc(dt_end)) + + +def test_update_metadata_type(l1_ls8_folder_md_expected: Dict, metadata_type): + """ + Test that updating the metadata type definition gives us access to custom fields + included in the new definition + """ + ds = DatasetMetadata( + raw_dict=l1_ls8_folder_md_expected, mdt_definition=metadata_type + ) + with pytest.raises(AttributeError): + ds.instrument + new_metadata_type = toolz.assoc_in( + metadata_type, + ["dataset", "search_fields", "instrument"], + { + "offset": ["properties", "eo:instrument"], + "description": "Instrument name", + }, + ) + ds.metadata_type = new_metadata_type + assert ds.instrument == "OLI_TIRS" + + +def test_additional_metadata_access(l1_ls8_folder_md_expected: Dict, metadata_type): + """Check that we are able to access metadata not defined in the metadata type""" + ds = DatasetMetadata( + raw_dict=l1_ls8_folder_md_expected, mdt_definition=metadata_type + ) + assert ds.crs.epsg == 32655 + assert ds.product.name == "usgs_ls8c_level1_1" + assert "coastal_aerosol" in ds.measurements + assert "metadata:landsat_mtl" in ds.accessories + assert ds.locations is None + + +def test_bad_crs(example_metadata: Dict): + """CRS should be valid, and is preferred in epsg form if possible""" + # Invalid crs + example_metadata["crs"] = "123456" + with pytest.raises(InvalidDatasetError, match="invalid_crs"): + DatasetMetadata(example_metadata) + # Missing crs + del example_metadata["crs"] + with pytest.raises(InvalidDatasetError, match="incomplete_geometry"): + DatasetMetadata(example_metadata) + + # A CRS should be in epsg form if an EPSG exists, not WKT + example_metadata["crs"] = dedent( + """PROJCS["WGS 84 / UTM zone 55N", + GEOGCS["WGS 84", + DATUM["WGS_1984", + SPHEROID["WGS 84",6378137,298.257223563, + AUTHORITY["EPSG","7030"]], + AUTHORITY["EPSG","6326"]], + PRIMEM["Greenwich",0, + AUTHORITY["EPSG","8901"]], + UNIT["degree",0.01745329251994328, + AUTHORITY["EPSG","9122"]], + AUTHORITY["EPSG","4326"]], + 
UNIT["metre",1, + AUTHORITY["EPSG","9001"]], + PROJECTION["Transverse_Mercator"], + PARAMETER["latitude_of_origin",0], + PARAMETER["central_meridian",147], + PARAMETER["scale_factor",0.9996], + PARAMETER["false_easting",500000], + PARAMETER["false_northing",0], + AUTHORITY["EPSG","32655"], + AXIS["Easting",EAST], + AXIS["Northing",NORTH]] + """ + ) + with pytest.warns(UserWarning, match="change CRS to 'epsg:32655'"): + DatasetMetadata(example_metadata) + + +def test_extent(l1_ls8_folder_md_expected: Dict): + # Core TODO: copied from tests.test_eo3 + """Check that extent is properly calculated""" + ds = DatasetMetadata(l1_ls8_folder_md_expected) + assert ds.extent is not None + assert ds.extent.crs.epsg == 32655 + + del l1_ls8_folder_md_expected["geometry"] + doc = dict(**l1_ls8_folder_md_expected, geometry=ds.extent.buffer(-1).json) + + ds2 = DatasetMetadata(doc) + assert ds.extent.contains(ds2.extent) + + +def test_warn_location_deprecated( + l1_ls8_folder_md_expected: Dict, +): + """Warn if dataset includes deprecated 'location' field""" + l1_ls8_folder_md_expected["location"] = "file:///path/to" + ds = DatasetMetadata(l1_ls8_folder_md_expected) + with pytest.warns(UserWarning, match="`location` is deprecated"): + assert ds.locations == ["file:///path/to"] diff --git a/tests/integration/test_product_validate.py b/tests/integration/test_product_validate.py index 51e1fd70..57e594b9 100644 --- a/tests/integration/test_product_validate.py +++ b/tests/integration/test_product_validate.py @@ -1,4 +1,3 @@ -from pathlib import Path from typing import Dict from eo3.product.validate import validate_product @@ -49,7 +48,7 @@ def test_managed_deprecation(product: Dict, metadata_type: Dict): assert "ingested_product" in msgs.warning_text() -def test_warn_bad_product_license(l1_ls8_metadata_path: Path, product: Dict): +def test_warn_bad_product_license(product: Dict): # Missing license is a warning. del product["license"] msgs = MessageCatcher(validate_product(product)) diff --git a/tests/integration/test_serialise.py b/tests/integration/test_serialise.py deleted file mode 100644 index 9c2480f9..00000000 --- a/tests/integration/test_serialise.py +++ /dev/null @@ -1,63 +0,0 @@ -from pathlib import Path -from typing import Dict - -import ciso8601 - -from eo3 import serialise -from eo3.utils import default_utc - -from tests.common import dump_roundtrip - - -def test_stac_to_eo3_serialise(sentinel1_eo3): - assert_unchanged_after_roundstrip(sentinel1_eo3) - - -def test_valid_document_works(example_metadata: Dict): - assert_unchanged_after_roundstrip(example_metadata) - - -def assert_unchanged_after_roundstrip(doc: Dict): - generated_doc = dump_roundtrip(doc) - - # Do a serialisation roundtrip and check that it's still identical. - reserialised_doc = dump_roundtrip( - serialise.to_doc(serialise.from_doc(generated_doc)) - ) - - # One allowed difference: input dates can be many string formats, - # but we normalise them with timezone (UTC default) - _normalise_datetime_props(generated_doc) - - assert serialise.from_doc(generated_doc) == serialise.from_doc(reserialised_doc) - - -def _normalise_datetime_props(generated_doc): - properties = generated_doc.get("properties", {}) - for key in properties: - if "datetime" in key: - # If string value, make it explicitly iso format with timezone. 
- val = properties[key] - if isinstance(val, str): - properties[key] = default_utc(ciso8601.parse_datetime(val)).isoformat() - - -def test_location_serialisation(l1_ls8_folder_md_expected: Dict): - l1_ls8_folder_md_expected["location"] = "s3://test/url/metadata.txt" - assert_unchanged_after_roundstrip(l1_ls8_folder_md_expected) - - -def test_location_single_serialisation(tmp_path: Path, l1_ls8_folder_md_expected: Dict): - # Always serialises a single location as 'location' - location = "https://some/test/path" - - # Given multiple - l1_ls8_folder_md_expected["locations"] = [location] - - reserialised_doc = dump_roundtrip( - serialise.to_doc(serialise.from_doc(l1_ls8_folder_md_expected)) - ) - - # We get singular - assert reserialised_doc["location"] == location - assert "locations" not in reserialised_doc diff --git a/tests/integration/test_thumbnail.py b/tests/integration/test_thumbnail.py deleted file mode 100644 index 3aceb37d..00000000 --- a/tests/integration/test_thumbnail.py +++ /dev/null @@ -1,92 +0,0 @@ -import tempfile -from pathlib import Path - -import rasterio - -from eo3.images import FileWrite, GridSpec - -from . import assert_image - - -def test_thumbnail_bitflag(input_uint8_tif: Path): - writer = FileWrite() - - outfile = Path(tempfile.gettempdir()) / "test-bitflag.jpg" - - water = 128 - - writer.create_thumbnail_singleband(input_uint8_tif, Path(outfile), bit=water) - - assert_image(outfile, bands=3) - - -def test_thumbnail_lookuptable(input_uint8_tif_2: Path): - writer = FileWrite() - - outfile = Path(tempfile.gettempdir()) / "test-lookuptable.jpg" - - wofs_lookup = { - 0: [150, 150, 110], # dry - 1: [255, 255, 255], # nodata, - 16: [119, 104, 87], # terrain - 32: [89, 88, 86], # cloud_shadow - 64: [216, 215, 214], # cloud - 80: [242, 220, 180], # cloudy terrain - 128: [79, 129, 189], # water - 160: [51, 82, 119], # shady water - 192: [186, 211, 242], # cloudy water - } - - writer.create_thumbnail_singleband( - input_uint8_tif_2, Path(outfile), lookup_table=wofs_lookup - ) - - assert_image(outfile, bands=3) - - -def test_thumbnail_from_numpy_bitflag(input_uint8_tif: Path): - writer = FileWrite() - outfile = Path(tempfile.gettempdir()) / "test-bitflag.jpg" - water = 128 - - with rasterio.open(input_uint8_tif) as ds: - input_geobox = GridSpec.from_rio(ds) - data = ds.read(1) - - image_bytes = writer.create_thumbnail_singleband_from_numpy( - input_data=data, input_geobox=input_geobox, bit=water - ) - - with open(outfile, "wb") as jpeg_file: - jpeg_file.write(image_bytes) - - assert_image(outfile, bands=3) - - -def test_thumbnail_from_numpy_lookuptable(input_uint8_tif_2: Path): - writer = FileWrite() - outfile = Path(tempfile.gettempdir()) / "test-lookuptable.jpg" - wofs_lookup = { - 0: [150, 150, 110], # dry - 1: [255, 255, 255], # nodata, - 16: [119, 104, 87], # terrain - 32: [89, 88, 86], # cloud_shadow - 64: [216, 215, 214], # cloud - 80: [242, 220, 180], # cloudy terrain - 128: [79, 129, 189], # water - 160: [51, 82, 119], # shady water - 192: [186, 211, 242], # cloudy water - } - - with rasterio.open(input_uint8_tif_2) as ds: - input_geobox = GridSpec.from_rio(ds) - data = ds.read(1) - - image_bytes = writer.create_thumbnail_singleband_from_numpy( - input_data=data, input_geobox=input_geobox, lookup_table=wofs_lookup - ) - - with open(outfile, "wb") as jpeg_file: - jpeg_file.write(image_bytes) - - assert_image(outfile, bands=3) diff --git a/tests/integration/test_tostac.py b/tests/integration/test_tostac.py index 8ee54bf2..3d8dc0a8 100644 --- 
a/tests/integration/test_tostac.py +++ b/tests/integration/test_tostac.py @@ -7,6 +7,8 @@ from eo3 import serialise from eo3.scripts import tostac +from eo3.utils import read_file +from eo3.validate import InvalidDatasetError from tests.common import assert_same, run_prepare_cli @@ -46,33 +48,6 @@ def test_tostac(odc_dataset_path: Path, expected_stac_doc: Dict): assert_same(expected_stac_doc, output_doc) -def test_tostac_no_grids(odc_dataset_path: Path, expected_stac_doc: Dict): - """ - Converted EO1 datasets don't have grid information. Make sure it still outputs - without falling over. - """ - - # Remove grids from the input.... - dataset = serialise.from_path(odc_dataset_path) - dataset.grids = None - serialise.to_path(odc_dataset_path, dataset) - - run_tostac(odc_dataset_path) - expected_output_path = odc_dataset_path.with_name( - odc_dataset_path.name.replace(".odc-metadata.yaml", ".stac-item.json") - ) - - # No longer expect proj fields (they come from grids). - remove_stac_properties( - expected_stac_doc, ("proj:shape", "proj:transform", "proj:epsg") - ) - # But we do still expect a global CRS. - expected_stac_doc["properties"]["proj:epsg"] = 32656 - - output_doc = json.load(expected_output_path.open()) - assert_same(expected_stac_doc, output_doc) - - def remove_stac_properties(doc: Dict, remove_properties=()): """ Remove the given fields from properties and assets. @@ -92,7 +67,7 @@ def test_add_property(input_doc_folder: Path): input_metadata_path = input_doc_folder.joinpath(ODC_METADATA_FILE) assert input_metadata_path.exists() - input_doc = serialise.load_yaml(input_metadata_path) + input_doc = read_file(input_metadata_path) input_doc["properties"]["test"] = "testvalue" serialise.dump_yaml(input_metadata_path, input_doc) @@ -112,30 +87,28 @@ def test_no_crs(input_doc_folder: Path): input_metadata_path = input_doc_folder.joinpath(ODC_METADATA_FILE) assert input_metadata_path.exists() - input_doc = serialise.load_yaml(input_metadata_path) + input_doc = read_file(input_metadata_path) del input_doc["crs"] serialise.dump_yaml(input_metadata_path, input_doc) assert input_metadata_path.exists() - with pytest.raises(RuntimeError) as exp: + with pytest.raises(InvalidDatasetError, match="incomplete_geometry"): run_tostac(input_metadata_path) - assert "Unexpected input encountered" in str(exp.value) def test_invalid_crs(input_doc_folder: Path): input_metadata_path = input_doc_folder.joinpath(ODC_METADATA_FILE) assert input_metadata_path.exists() - input_doc = serialise.load_yaml(input_metadata_path) + input_doc = read_file(input_metadata_path) input_doc["crs"] = "I-CANT-BELIEVE-ITS-NOT-A-VALID-CRS:4236" serialise.dump_yaml(input_metadata_path, input_doc) assert input_metadata_path.exists() - with pytest.raises(RuntimeError) as exp: + with pytest.raises(InvalidDatasetError, match="invalid_crs"): run_tostac(input_metadata_path) - assert "Invalid projection" in str(exp.value) def run_tostac(input_metadata_path: Path): diff --git a/tests/integration/test_validate.py b/tests/integration/test_validate.py index 2d7e8ada..0b5fd6d4 100644 --- a/tests/integration/test_validate.py +++ b/tests/integration/test_validate.py @@ -1,26 +1,20 @@ -from pathlib import Path -from textwrap import dedent -from typing import Dict, Union -from uuid import uuid4 +from typing import Dict -import numpy as np -import rasterio -from rasterio.io import DatasetWriter +import pytest +import toolz from eo3 import validate +from eo3.model import DatasetMetadata from eo3.validate import ( - DocKind, - ValidationExpectations, 
- filename_doc_kind, - guess_kind_from_contents, - validate_dataset, + InvalidDatasetError, + validate_ds_to_metadata_type, + validate_ds_to_product, + validate_ds_to_schema, ) from eo3.validation_msg import ValidationMessage from tests.common import MessageCatcher -Doc = Union[Dict, Path] - def test_val_msg_str(): msg = ValidationMessage.info( @@ -32,360 +26,166 @@ def test_val_msg_str(): assert "I don't like spam!" in msg_str -def test_dockind_legacy(): - assert not DocKind.dataset.is_legacy - assert DocKind.legacy_dataset.is_legacy - assert DocKind.ingestion_config.is_legacy - - -def test_valid_document_works(example_metadata: Dict): +def test_valid_document_works( + l1_ls8_folder_md_expected: Dict, eo3_product, metadata_type +): """All of our example metadata files should validate""" - msgs = MessageCatcher(validate_dataset(example_metadata)) + dataset = l1_ls8_folder_md_expected + msgs = MessageCatcher(validate_ds_to_schema(dataset)) assert not msgs.errors() + msgs = MessageCatcher(validate_ds_to_metadata_type(dataset, metadata_type)) + assert not msgs.errors() -def test_bad_crs(example_metadata: Dict): - example_metadata["crs"] = 4326 - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "epsg codes should be prefixed" in msgs.error_text() + msgs = MessageCatcher(validate_ds_to_product(dataset, eo3_product)) + assert not msgs.errors() def test_missing_field(example_metadata: Dict): """when a required field (id) is missing, validation should fail""" del example_metadata["id"] - msgs = MessageCatcher(validate_dataset(example_metadata)) + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) assert "'id' is a required property" in msgs.error_text() + with pytest.raises(InvalidDatasetError, match="structure"): + DatasetMetadata(example_metadata) + def test_invalid_eo3_schema(example_metadata: Dict): """When there's no eo3 $schema defined""" del example_metadata["$schema"] - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "no_schema:" in msgs.error_text() - example_metadata["$schema"] = "https://schemas.onepdapatube.org/dataset" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "unknown_doc_type" in msgs.error_text() - + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) + assert "$schema" in msgs.error_text() -def test_allow_optional_geo(example_metadata: Dict): - """A doc can omit all geo fields and be valid if not requiring geometry.""" - del example_metadata["crs"] - del example_metadata["geometry"] + example_metadata["$schema"] = "https://schemas.onepdapatube.org/dataset" + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) + assert "($schema)" in msgs.error_text() - for m in example_metadata["measurements"].values(): - if "grid" in m: - del m["grid"] - example_metadata["grids"] = {} - msgs = MessageCatcher(validate_dataset(example_metadata)) +def test_dataset_maturity(example_metadata: Dict): + """Dataset maturity is an optional but recommended field; schema validation + should warn if it is absent and error if it is incorrect""" + example_metadata["properties"]["dea:dataset_maturity"] = "blah" + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) assert msgs.errors() - expect = ValidationExpectations(require_geometry=False) - msgs = MessageCatcher(validate_dataset(example_metadata, expect=expect)) - assert "No geo information in dataset" in msgs.all_text() - assert not msgs.errors() + assert "dataset_maturity" in msgs.error_text() + example_metadata["properties"]["dea:dataset_maturity"] 
= "INTERIM" + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) + assert msgs.errors() + assert "dataset_maturity" in msgs.error_text() -def test_missing_geo_fields(example_metadata: Dict): - """If you have one gis field, you should have all of them""" - del example_metadata["crs"] - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "incomplete_crs" in msgs.error_text() - expect = ValidationExpectations(require_geometry=False) - msgs = MessageCatcher(validate_dataset(example_metadata, expect=expect)) - assert "incomplete_crs" in msgs.error_text() + del example_metadata["properties"]["dea:dataset_maturity"] + msgs = MessageCatcher(validate_ds_to_schema(example_metadata)) + assert not msgs.errors() + assert "recommended_field" in msgs.warning_text() def test_grid_custom_crs(example_metadata: Dict): - """A Measurement refers to a grid that doesn't exist""" + """Allow a grid to have its own crs, and error if crs is invalid""" example_metadata["grids"]["other_crs"] = { "crs": "epsg:32756", "shape": [2267, 1567], "transform": [50.0, 0.0, 257975.0, 0.0, -50.0, 6290325.0], } - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert not msgs.error_text() - assert not msgs.warning_text() + ds = DatasetMetadata(example_metadata) + grid = ds.grids.get("other_crs") + assert grid.crs == "epsg:32756" + assert ds.crs.epsg != 32756 - -def test_grid_custom_bad_crs(example_metadata: Dict): - """A Measurement refers to a grid that doesn't exist""" - example_metadata["grids"]["other_crs"] = { + example_metadata["grids"]["default"] = { "crs": "splunge:32756", "shape": [2267, 1567], "transform": [50.0, 0.0, 257975.0, 0.0, -50.0, 6290325.0], } - msgs = MessageCatcher(validate_dataset(example_metadata)) - errs = msgs.error_text() - assert "invalid_crs" in errs - assert "other_crs" in errs + with pytest.raises(InvalidDatasetError, match="invalid_crs"): + DatasetMetadata(example_metadata) def test_missing_grid_def(example_metadata: Dict): """A Measurement refers to a grid that doesn't exist""" a_measurement, *_ = list(example_metadata["measurements"]) example_metadata["measurements"][a_measurement]["grid"] = "unknown_grid" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "invalid_grid_ref" in msgs.error_text() + with pytest.raises(InvalidDatasetError, match="invalid_grid_ref"): + DatasetMetadata(example_metadata) def test_absolute_path_in_measurement(example_metadata: Dict): - """A Measurement refers to a grid that doesn't exist""" + """Warn if a measurement path is absolute""" a_measurement, *_ = list(example_metadata["measurements"]) example_metadata["measurements"][a_measurement][ "path" ] = "file:///this/is/an/utter/absolute/path.nc" - msgs = MessageCatcher(validate_dataset(example_metadata)) - warns = msgs.warning_text() - assert "absolute_path" in warns - assert a_measurement in warns + with pytest.warns(UserWarning, match="absolute_path"): + DatasetMetadata(example_metadata) def test_path_with_part_in_measurement(example_metadata: Dict): - """A Measurement refers to a grid that doesn't exist""" + """ + Measurement paths should not include parts; warn if they are present and error if they are invalid + """ a_measurement, *_ = list(example_metadata["measurements"]) example_metadata["measurements"][a_measurement]["path"] += "#part=0" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "uri_part" in msgs.warning_text() + with pytest.warns(UserWarning, match="uri_part"): + DatasetMetadata(example_metadata) 
example_metadata["measurements"][a_measurement]["path"] += "#part=nir" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "uri_part" in msgs.warning_text() - errs = msgs.error_text() - assert "uri_invalid_part" in errs - assert "nir" in errs + with pytest.raises(InvalidDatasetError, match="uri_invalid_part"): + DatasetMetadata(example_metadata) example_metadata["measurements"][a_measurement]["path"] += "#part=-22" - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "uri_part" in msgs.warning_text() - errs = msgs.error_text() - assert "uri_invalid_part" in errs - assert "-22" in errs + with pytest.raises(InvalidDatasetError, match="uri_invalid_part"): + DatasetMetadata(example_metadata) -def test_absolute_path_in_accessory(example_metadata: Dict): - an_accessory, *_ = list(example_metadata["accessories"]) - example_metadata["accessories"][an_accessory][ - "path" - ] = "file:///this/is/an/utter/absolute/path.nc" - msgs = MessageCatcher(validate_dataset(example_metadata)) - warns = msgs.warning_text() - assert "absolute_path" in warns - assert an_accessory in warns - - -def test_invalid_shape(example_metadata: Dict): - """the geometry must be a valid shape""" - - # Points are in an invalid order. - example_metadata["geometry"] = { - "coordinates": ( - ( - (770_115.0, -2_768_985.0), - (525_285.0, -2_981_715.0), - (770_115.0, -2_981_715.0), - (525_285.0, -2_768_985.0), - (770_115.0, -2_768_985.0), - ), - ), - "type": "Polygon", - } - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert "invalid_geometry" in msgs.error_text() - - -def test_crs_as_wkt(example_metadata: Dict): - """A CRS should be in epsg form if an EPSG exists, not WKT""" - example_metadata["crs"] = dedent( - """PROJCS["WGS 84 / UTM zone 55N", - GEOGCS["WGS 84", - DATUM["WGS_1984", - SPHEROID["WGS 84",6378137,298.257223563, - AUTHORITY["EPSG","7030"]], - AUTHORITY["EPSG","6326"]], - PRIMEM["Greenwich",0, - AUTHORITY["EPSG","8901"]], - UNIT["degree",0.01745329251994328, - AUTHORITY["EPSG","9122"]], - AUTHORITY["EPSG","4326"]], - UNIT["metre",1, - AUTHORITY["EPSG","9001"]], - PROJECTION["Transverse_Mercator"], - PARAMETER["latitude_of_origin",0], - PARAMETER["central_meridian",147], - PARAMETER["scale_factor",0.9996], - PARAMETER["false_easting",500000], - PARAMETER["false_northing",0], - AUTHORITY["EPSG","32655"], - AXIS["Easting",EAST], - AXIS["Northing",NORTH]] - """ +def test_product_name_mismatch(l1_ls8_folder_md_expected: Dict, eo3_product): + """Dataset product name doesn't match product name of given product""" + eo3_product["name"] = "wrong_product_name" + msgs = MessageCatcher( + validate_ds_to_product(l1_ls8_folder_md_expected, eo3_product) ) - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert not msgs.errors() - assert "non_epsg" in msgs.warning_text() - assert "change CRS to 'epsg:32655'" in msgs.warning_text() - - -def test_flat_lineage(example_metadata: Dict): - example_metadata["lineage"] = { - "spam": [str(uuid4())], - "bacon": [str(uuid4())], - "eggs": [str(uuid4())], - } - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert not msgs.error_text() - assert not msgs.warning_text() - assert "nonflat_lineage" not in msgs.info_text() - - -def test_nonflat_lineage(example_metadata: Dict): - example_metadata["lineage"] = { - "spam": [str(uuid4()), str(uuid4()), str(uuid4())], - } - msgs = MessageCatcher(validate_dataset(example_metadata)) - assert not msgs.error_text() - assert not msgs.warning_text() - assert "nonflat_lineage" in 
msgs.info_text() + assert "product_mismatch" in msgs.error_text() -def test_non_uuids_in_lineage(example_metadata: Dict): - example_metadata["lineage"] = { - "spam": [str(uuid4())], - "eggs": [str(uuid4()), "scrambled"], - "beans": [str(uuid4()), str(uuid4()), str(uuid4())], - } - msgs = MessageCatcher(validate_dataset(example_metadata)) - errs = msgs.error_text() - assert "invalid_source_id" in errs - assert "scrambled" in errs - assert "eggs" in errs - - -def test_valid_with_product_doc(l1_ls8_folder_md_expected: Dict, product: Dict) -> Path: - """When a product is specified, it will validate that the measurements match the product""" - product["name"] = l1_ls8_folder_md_expected["product"]["name"] - # Document is valid on its own. - msgs = MessageCatcher(validate_dataset(l1_ls8_folder_md_expected)) - assert not msgs.errors() - # It contains all measurements in the product, so will be valid when not thorough. - msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=product) +def test_measurements_match_product(l1_ls8_folder_md_expected: Dict, eo3_product): + """Validate that the dataset measurements match the product""" + measurements = l1_ls8_folder_md_expected["measurements"] + # add extra measurement not defined in product + measurements = toolz.assoc( + measurements, "new_measurement", {"path": "measurement_path"} ) - assert not msgs.errors() + # remove measurement expected by product + measurements = toolz.dissoc(measurements, "blue") + l1_ls8_folder_md_expected["measurements"] = measurements - # Remove some expected measurements from product - should get warnings now - product["default_allowances"]["allow_extra_measurements"] = [ - "cirrus", - "coastal_aerosol", - "red", - "green", - "blue", - "nir", - "swir_1", - "swir_2", - "panchromatic", - ] msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=product) + validate_ds_to_product(l1_ls8_folder_md_expected, eo3_product) ) + assert "missing_measurement" in msgs.error_text() assert "extra_measurements" in msgs.warning_text() - assert "quality" in msgs.warning_text() - assert "lwir_1" in msgs.warning_text() - assert not msgs.errors() - - expect = ValidationExpectations( - allow_extra_measurements=[ - "lwir_1", - "lwir_2", - "quality", - ] - ) - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, product_definition=product, expect=expect - ) - ) - assert not msgs.errors() + assert "new_measurement" in msgs.warning_text() -# @pytest.mark.skip("This check is outside the current callpath.") -def test_complains_about_product_not_matching( +def test_product_metadata_mismatch( l1_ls8_folder_md_expected: Dict, eo3_product, ): """ - Complains when we're given products but they don't match the dataset + Complains when a dataset doesn't contain all metadata properties given by the product """ - # A metadata field that's not in the dataset. 
eo3_product["metadata"]["properties"]["favourite_sandwich"] = "spam" msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=eo3_product) - ) - assert "metadata_mismatch" in msgs.error_text() - - -def test_complains_when_no_product( - l1_ls8_folder_md_expected: Dict, -): - """When a product is specified, it will validate that the measurements match the product""" - # Thorough checking should fail when there's no product provided - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, thorough=True, product_definition=None + validate_ds_to_product( + l1_ls8_folder_md_expected, product_definition=eo3_product ) ) - assert "no_product" in msgs.error_text() - - -def test_is_product(): - """Product documents should be correctly identified as products""" - product = dict( - name="minimal_product", metadata_type="eo3", measurements=[dict(name="blue")] - ) - assert guess_kind_from_contents(product) == DocKind.product - - -def test_is_ingestion(): - """Product documents should be correctly identified as products""" - product = dict( - name="minimal_product", metadata_type="eo3", measurements=[dict(name="blue")] - ) - assert guess_kind_from_contents(product) == DocKind.product - - -def test_is_metadata_type(): - """Product documents should be correctly identified as products""" - mdt = dict(name="minimal_mdt", dataset=dict(search_fields=dict())) - assert guess_kind_from_contents(mdt) == DocKind.metadata_type - - -def test_is_legacy_dataset(): - """Product documents should be correctly identified as products""" - ds = dict(id="spam", lineage=["sources"], platform="boots") - assert guess_kind_from_contents(ds) == DocKind.legacy_dataset - - -def test_is_legacy_ingestion_cfg(): - """Product documents should be correctly identified as products""" - ds = dict(metadata_type="foo", source_type="bar") - assert guess_kind_from_contents(ds) == DocKind.ingestion_config - - -def test_is_stac(): - """Product documents should be correctly identified as products""" - ds = dict(id="spam", properties=dict(datetime="today, right now")) - assert guess_kind_from_contents(ds) == DocKind.stac_item - - -def test_not_a_dockind(): - """Product documents should be correctly identified as products""" - product = dict(spam="spam", bacon="eggs", interruptions="vikings") - assert guess_kind_from_contents(product) is None + assert "metadata_mismatch" in msgs.error_text() def test_has_offset(): + """_has_offset helper function for checking missing offsets""" doc = dict(spam="spam", bacon="eggs", atmosphere=dict(interruptions="vikings")) from eo3.validate import _has_offset @@ -394,19 +194,9 @@ def test_has_offset(): assert not _has_offset(doc, ["eggs"]) -def test_dataset_is_not_a_product(example_metadata: Dict): - """ - Datasets should not be identified as products - - (checks all example metadata files) - """ - assert guess_kind_from_contents(example_metadata) == DocKind.dataset - assert filename_doc_kind(Path("asdf.odc-metadata.yaml")) == DocKind.dataset - - def test_get_field_offsets(metadata_type: Dict): """ - Test the get_field_offsets function. 
+ Test the get_field_offsets function, should return all field offsets defined by the metadata type """ assert list(validate._get_field_offsets(metadata_type)) == [ ("id", [["id"]]), @@ -425,220 +215,46 @@ def test_get_field_offsets(metadata_type: Dict): ["properties", "datetime"], ], ), + ( + "lat", + [ + ["extent", "lat", "begin"], + ["extent", "lat", "end"], + ], + ), + ( + "lon", + [ + ["extent", "lon", "begin"], + ["extent", "lon", "end"], + ], + ), ] -def test_validate_ds_with_metadata_doc( - l1_ls8_metadata_path: str, - metadata_type, - l1_ls8_folder_md_expected: Dict, -): - # When thorough, the dtype and nodata are wrong - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - metadata_type_definition=metadata_type, - readable_location=l1_ls8_metadata_path, - ) - ) - assert not msgs.error_text() - assert not msgs.warning_text() - - -def test_validate_ds_with_metadata_doc_warnings( - l1_ls8_metadata_path: str, +def test_validate_ds_to_metadata_type( metadata_type, l1_ls8_folder_md_expected: Dict, ): + """ + Validator should allow a document that doesn't include all the metadata type fields, + but should warn about these missing fields + """ metadata_type["dataset"]["search_fields"]["foobar"] = { "description": "A required property that is missing", "type": "string", "offset": ["properties", "eo3:foobar"], } msgs = MessageCatcher( - validate_dataset( + validate_ds_to_metadata_type( l1_ls8_folder_md_expected, metadata_type_definition=metadata_type, - readable_location=l1_ls8_metadata_path, ) ) assert not msgs.error_text() warns = msgs.warning_text() assert "missing_field" in warns assert "foobar" in warns - l1_ls8_folder_md_expected["properties"]["eo3:foobar"] = None - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - metadata_type_definition=metadata_type, - readable_location=l1_ls8_metadata_path, - ) - ) - assert not msgs.error_text() - assert not msgs.warning_text() - infos = msgs.info_text() - assert "null_field" in infos - assert "foobar" in infos - - -def test_validate_location_deprec( - l1_ls8_folder_md_expected: Dict, -): - l1_ls8_folder_md_expected["location"] = "file:///path/to" - # When thorough, the dtype and nodata are wrong - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - ) - ) - assert "dataset_location" in msgs.warning_text() - - -def test_dtype_compare_with_product_doc( - l1_ls8_metadata_path: str, - eo3_product, - l1_ls8_folder_md_expected: Dict, -): - """'thorough' validation should check the dtype of measurements against the product""" - - eo3_product["measurements"] = [ - dict(name="blue", dtype="uint8", units="1", nodata=255) - ] - - # When thorough, the dtype and nodata are wrong - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - err_text = msgs.error_text() - assert "different_dtype" in err_text - assert "blue" in err_text - assert "uint8" in err_text - - -def test_nodata_compare_with_product_doc( - l1_ls8_metadata_path: str, - eo3_product, - l1_ls8_folder_md_expected: Dict, -): - """'thorough' validation should check the nodata of measurements against the product""" - - # Remake the tiff with a 'nodata' set. 
- blue_tif = ( - l1_ls8_metadata_path.parent - / l1_ls8_folder_md_expected["measurements"]["blue"]["path"] - ) - _create_dummy_tif( - blue_tif, - dtype="uint16", - nodata=65535, - ) - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - assert not msgs.errors() - assert not msgs.warnings() - assert not msgs.infos() - - # Override blue definition with invalid nodata value. - _measurement(eo3_product, "blue")["nodata"] = 255 - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - assert "different_nodata" in msgs.error_text() - - -def test_measurements_compare_with_nans( - l1_ls8_metadata_path: str, - eo3_product, - l1_ls8_folder_md_expected: Dict, -): - """When dataset and product have NaN nodata values, it should handle them correctly""" - product = eo3_product - blue_tif = ( - l1_ls8_metadata_path.parent - / l1_ls8_folder_md_expected["measurements"]["blue"]["path"] - ) - - # When both are NaN, it should be valid - blue = _measurement(product, "blue") - blue["nodata"] = float("NaN") - blue["dtype"] = "float32" - _create_dummy_tif(blue_tif, nodata=float("NaN")) - - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - assert not msgs.errors() - assert not msgs.warnings() - assert not msgs.infos() - - # ODC can also represent NaNs as strings due to json's lack of NaN - blue["nodata"] = "NaN" - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - assert not msgs.errors() - assert not msgs.warnings() - assert not msgs.infos() - - # When product is set, dataset is NaN, they no longer match. - blue["nodata"] = 0 - msgs = MessageCatcher( - validate_dataset( - l1_ls8_folder_md_expected, - product_definition=eo3_product, - readable_location=l1_ls8_metadata_path, - thorough=True, - ) - ) - errtxt = msgs.error_text() - assert "different_nodata" in errtxt - assert "blue" in errtxt - assert "dataset nan" in errtxt - assert "product 0" in errtxt - - -def test_missing_measurement_from_product( - l1_ls8_folder_md_expected: Dict, - eo3_product, -): - """Validator should notice a missing measurement from the product def""" - product = eo3_product - product["name"] = "test_with_extra_measurement" - product["measurements"] = [ - dict(name="razzmatazz", dtype="int32", units="1", nodata=-999) - ] - msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=eo3_product) - ) - errtxt = msgs.error_text() - assert "missing_measurement" in errtxt - assert "razzmatazz" in errtxt def test_supports_measurementless_products( @@ -654,7 +270,7 @@ def test_supports_measurementless_products( """ eo3_product["measurements"] = [] msgs = MessageCatcher( - validate_dataset(l1_ls8_folder_md_expected, product_definition=eo3_product) + validate_ds_to_product(l1_ls8_folder_md_expected, eo3_product) ) assert not msgs.errors() @@ -670,30 +286,5 @@ def test_product_no_href( Level 1 products. 
""" del l1_ls8_folder_md_expected["product"]["href"] - msgs = MessageCatcher(validate_dataset(l1_ls8_folder_md_expected)) - assert not msgs.errors() - assert "product_href" in msgs.info_text() - - -def _measurement(product: Dict, name: str): - """Get a measurement by name""" - for m in product["measurements"]: - if m["name"] == name: - return m - raise ValueError(f"Measurement {name} not found?") - - -def _create_dummy_tif(blue_tif, nodata=None, dtype="float32", **opts): - with rasterio.open( - blue_tif, - "w", - width=10, - height=10, - count=1, - dtype=dtype, - driver="GTiff", - nodata=nodata, - **opts, - ) as ds: - ds: DatasetWriter - ds.write(np.ones((10, 10), dtype=dtype), 1) + with pytest.warns(UserWarning, match="product->href"): + DatasetMetadata(l1_ls8_folder_md_expected) diff --git a/tests/test_documents.py b/tests/test_documents.py deleted file mode 100644 index de38f684..00000000 --- a/tests/test_documents.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Module -""" - -from eo3.documents import _find_any_metadata_suffix, find_metadata_path - -from tests import write_files - - -def test_find_metadata_path(): - files = write_files( - { - "directory_dataset": { - "file1.txt": "", - "file2.txt": "", - "ga-metadata.yaml.gz": "", - }, - "file_dataset.tif": "", - "file_dataset.agdc-md.yaml": "", - "dataset_metadata.yaml": "", - "no_metadata.tif": "", - # Newer eo3-style names.` - # Sibling - "newer-dataset.tar": "", - "newer-dataset.odc-metadata.yaml": "", - # Directory - "newer_directory_dataset": { - "newer-dataset.txt": "", - "newer-dataset-b2.txt": "", - "newer-dataset.odc-metadata.yaml.gz": "", - }, - } - ) - - # A metadata file can be specified directly. - path = find_metadata_path(files.joinpath("dataset_metadata.yaml")) - assert path.absolute() == files.joinpath("dataset_metadata.yaml").absolute() - - # A older dataset directory will have an internal 'agdc-metadata' file. - path = find_metadata_path(files.joinpath("directory_dataset")) - assert ( - path.absolute() - == files.joinpath("directory_dataset", "ga-metadata.yaml.gz").absolute() - ) - - # Other older files can have a sibling file ending in 'agdc-md.yaml' - path = find_metadata_path(files.joinpath("file_dataset.tif")) - assert path.absolute() == files.joinpath("file_dataset.agdc-md.yaml").absolute() - - # No metadata to find. - assert find_metadata_path(files.joinpath("no_metadata.tif")) is None - - # Dataset itself doesn't exist. 
-    assert find_metadata_path(files.joinpath("missing-dataset.tif")) is None
-
-    # EO3-style dataset metadata
-    path = find_metadata_path(files.joinpath("newer-dataset.tar"))
-    assert (
-        path.absolute() == files.joinpath("newer-dataset.odc-metadata.yaml").absolute()
-    )
-
-    # EO3-style dataset in a directory
-    path = find_metadata_path(files.joinpath("newer_directory_dataset"))
-    assert (
-        path.absolute()
-        == files.joinpath(
-            "newer_directory_dataset", "newer-dataset.odc-metadata.yaml.gz"
-        ).absolute()
-    )
-
-
-def test_find_any_metatadata_suffix():
-    files = write_files(
-        {
-            "directory_dataset": {
-                "file1.txt": "",
-                "file2.txt": "",
-                "agdc-metadata.json.gz": "",
-            },
-            "file_dataset.tif.ga-md.yaml": "",
-            "dataset_metadata.YAML": "",
-            "no_metadata.tif": "",
-        }
-    )
-
-    path = _find_any_metadata_suffix(files.joinpath("dataset_metadata"))
-    assert path.absolute() == files.joinpath("dataset_metadata.YAML").absolute()
-
-    path = _find_any_metadata_suffix(
-        files.joinpath("directory_dataset", "agdc-metadata")
-    )
-    assert (
-        path.absolute()
-        == files.joinpath("directory_dataset", "agdc-metadata.json.gz").absolute()
-    )
-
-    path = _find_any_metadata_suffix(files.joinpath("file_dataset.tif.ga-md"))
-    assert path.absolute() == files.joinpath("file_dataset.tif.ga-md.yaml").absolute()
-
-    # Returns none if none exist
-    path = _find_any_metadata_suffix(files.joinpath("no_metadata"))
-    assert path is None
diff --git a/tests/test_eo3_core.py b/tests/test_eo3_core.py
new file mode 100644
index 00000000..a5b7351a
--- /dev/null
+++ b/tests/test_eo3_core.py
@@ -0,0 +1,290 @@
+"""
+Tests for eo3.eo3_core: EO3Grid handling and EO3 document preparation.
+"""
+import pytest
+from affine import Affine
+from odc.geo.geom import CRS, polygon
+from ruamel.yaml import YAML
+
+from eo3.eo3_core import (
+    EO3Grid,
+    add_eo3_parts,
+    eo3_grid_spatial,
+    is_doc_eo3,
+    is_doc_geo,
+    prep_eo3,
+)
+
+SAMPLE_DOC = """---
+$schema: https://schemas.opendatacube.org/dataset
+id: 7d41a4d0-2ab3-4da1-a010-ef48662ae8ef
+crs: "EPSG:3857"
+product:
+  name: sample_product
+properties:
+  datetime: 2020-05-25 23:35:47.745731Z
+  odc:processing_datetime: 2020-05-25 23:35:47.745731Z
+grids:
+  default:
+    shape: [100, 200]
+    transform: [10, 0, 100000, 0, -10, 200000, 0, 0, 1]
+lineage:
+  src_a: ['7cf53cb3-5da7-483f-9f12-6056e3290b4e']
+  src_b:
+    - 'f5b9f582-d5ff-43c0-a49b-ef175abe429c'
+    - '7f8c6e8e-6f6b-4513-a11c-efe466405509'
+  src_empty: []
+...
+"""
+
+# Crosses lon=180 line in Pacific, taken from one of the Landsat scenes
+# https://landsat-pds.s3.amazonaws.com/c1/L8/074/071/LC08_L1TP_074071_20190622_20190704_01_T1/index.html
+#
+SAMPLE_DOC_180 = """---
+$schema: https://schemas.opendatacube.org/dataset
+id: f884df9b-4458-47fd-a9d2-1a52a2db8a1a
+crs: "EPSG:32660"
+product:
+  name: sample_product
+properties:
+  datetime: 2020-05-25 23:35:47.745731Z
+  odc:processing_datetime: 2020-05-25 23:35:47.745731Z
+grids:
+  default:
+    shape: [7811, 7691]
+    transform: [30, 0, 618285, 0, -30, -1642485, 0, 0, 1]
+  pan:
+    shape: [15621, 15381]
+    transform: [15, 0, 618292.5, 0, -15, -1642492.5, 0, 0, 1]
+lineage: {}
+...
+""" + + +@pytest.fixture +def basic_grid(): + return EO3Grid(dict(shape=(100, 100), transform=Affine(0, 100, 50, 100, 0, 50))) + + +@pytest.fixture +def sample_doc(): + return YAML(typ="safe").load(SAMPLE_DOC) + + +@pytest.fixture +def sample_doc_180(): + return YAML(typ="safe").load(SAMPLE_DOC_180) + + +def test_grid_ref_points(basic_grid): + ref_pts = basic_grid.ref_points() + assert ref_pts["ul"] == {"x": 50, "y": 50} + assert ref_pts["lr"] == {"x": 10050, "y": 10050} + assert ref_pts["ur"] == {"x": 50, "y": 10050} + assert ref_pts["ll"] == {"x": 10050, "y": 50} + + +def test_polygon(basic_grid): + poly = basic_grid.polygon() + assert poly == polygon( + [ + (50, 50), + (50, 10050), + (10050, 10050), + (10050, 50), + (50, 50), + ], + crs=None, + ) + + +def test_grid_crs(basic_grid): + crs = CRS("EPSG:4326") + poly = basic_grid.polygon(crs) + assert poly == polygon( + [ + (50, 50), + (50, 10050), + (10050, 10050), + (10050, 50), + (50, 50), + ], + crs=crs, + ) + basic_grid.crs = crs + poly = basic_grid.polygon() + assert poly == polygon( + [ + (50, 50), + (50, 10050), + (10050, 10050), + (10050, 50), + (50, 50), + ], + crs=crs, + ) + + +def test_grid_points(): + identity = list(Affine.translation(0, 0)) + grid = EO3Grid({"shape": (11, 22), "transform": identity}) + + pts = grid.points() + assert len(pts) == 4 + assert pts == [(0, 0), (22, 0), (22, 11), (0, 11)] + pts_ = grid.points(ring=True) + assert len(pts_) == 5 + assert pts == pts_[:4] + assert pts_[0] == pts_[-1] + + grid = EO3Grid({"shape": (11, 22), "transform": tuple(Affine.translation(100, 0))}) + pts = grid.points() + assert pts == [(100, 0), (122, 0), (122, 11), (100, 11)] + + for bad in [{}, dict(shape=(1, 1)), dict(transform=identity)]: + with pytest.raises(ValueError): + grid = EO3Grid(bad) + + +def test_bad_grids(): + identity = list(Affine.translation(0, 0)) + bad_grids = [ + # No Shape + { + "transform": identity, + }, + # Non 2-d Shape (NB: geospatial dimensions only. Other dimensions are handled elsewhere.) + { + "shape": (1024,), + "transform": identity, + }, + { + "shape": (1024, 564, 256), + "transform": identity, + }, + # No Transform + { + "shape": (1024, 256), + }, + # Formally invalid affine transform (must be 6 or 9 elements) + { + "shape": (1024, 256), + "transform": [343.3], + }, + { + "shape": (1024, 256), + "transform": [343, 23345, 234, 9, -65.3], + }, + { + "shape": (1024, 256), + "transform": [343, 23345, 234, 9, -65.3, 1, 0], + }, + { + "shape": (1024, 256), + "transform": [ + 343, + 23345, + 234, + 9, + -65.3, + 1, + 0, + 7435.24563, + 0.0001234, + 888.888, + 3, + 3, + 2, + ], + }, + # Formally invalid affine transform (all elements must be numbers) + {"shape": (1024, 256), "transform": [343, 23345, 234, 9, -65.3, "six"]}, + # Formally invalid affine transform (in 9 element form, last 3 numbers must be 0,0,1) + { + "shape": (1024, 256), + "transform": [343, 23345, 234, 9, -65.3, 1, 3, 3, 2], + }, + ] + for bad_grid in bad_grids: + with pytest.raises(ValueError): + EO3Grid(bad_grid) + + +def test_eo3_grid_spatial_nogrids(): + with pytest.raises(ValueError, match="grids.foo"): + eo3_grid_spatial( + { + "crs": "EPSG:4326", + "grids": { + "default": { + "shape": (1024, 256), + "transform": [343, 23345, 234, 9, -65.3, 1], + } + }, + }, + grid_name="foo", + ) + + +def test_is_eo3(sample_doc, sample_doc_180): + assert is_doc_eo3(sample_doc) is True + assert is_doc_eo3(sample_doc_180) is True + + # If there's no schema field at all, it's treated as legacy eo. 
+ assert is_doc_eo3({}) is False + assert is_doc_eo3({"crs": "EPSG:4326"}) is False + assert is_doc_eo3({"crs": "EPSG:4326", "grids": {}}) is False + + with pytest.raises(ValueError, match="Unsupported dataset schema.*"): + is_doc_eo3({"$schema": "https://schemas.opendatacube.org/eo4"}) + + +def test_is_geo(sample_doc, sample_doc_180): + assert is_doc_geo(sample_doc) is True + assert is_doc_geo(sample_doc_180) is True + + assert is_doc_geo({}) is False + assert is_doc_geo({"crs": "EPSG:4326"}) is False + assert is_doc_geo({"crs": "EPSG:4326", "extent": "dummy_extent"}) is True + + +def test_add_gs_info(sample_doc, sample_doc_180): + doc = dict(**sample_doc) + doc.pop("crs") + with pytest.raises(ValueError): + add_eo3_parts(doc) + + doc = dict(**sample_doc) + doc.pop("grids") + with pytest.raises(ValueError): + add_eo3_parts(doc) + + doc = add_eo3_parts(sample_doc) + assert doc is not sample_doc + assert doc.get("crs") == "EPSG:3857" + assert doc.get("extent") is not None + assert doc.get("grid_spatial") is not None + assert doc["extent"]["lat"]["begin"] < doc["extent"]["lat"]["end"] + assert doc["extent"]["lon"]["begin"] < doc["extent"]["lon"]["end"] + + assert doc == add_eo3_parts(doc) + + doc = add_eo3_parts(sample_doc_180) + assert doc is not sample_doc_180 + assert doc["extent"]["lon"]["begin"] < 180 < doc["extent"]["lon"]["end"] + + +def test_prep_eo3(sample_doc, sample_doc_180): + doc = prep_eo3(sample_doc) + + assert "src_a" in doc["lineage"]["source_datasets"] + assert "src_b1" in doc["lineage"]["source_datasets"] + assert "src_b2" in doc["lineage"]["source_datasets"] + assert "src_empty" not in doc["lineage"]["source_datasets"] + + doc = prep_eo3(sample_doc_180) + assert doc["lineage"]["source_datasets"] == {} + + assert prep_eo3(None) is None + with pytest.raises(ValueError): + prep_eo3({}) diff --git a/tests/test_model.py b/tests/test_model.py deleted file mode 100644 index 0e8b0cc7..00000000 --- a/tests/test_model.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Module -""" - - -import pytest -from affine import Affine -from odc.geo.geom import CRS, polygon - -from eo3.model import GridDoc - - -@pytest.fixture -def basic_grid(): - return GridDoc(shape=(100, 100), transform=Affine(0, 100, 50, 100, 0, 50)) - - -def test_grid_ref_points(basic_grid): - ref_pts = basic_grid.ref_points() - assert ref_pts["ul"] == {"x": 50, "y": 50} - assert ref_pts["lr"] == {"x": 10050, "y": 10050} - assert ref_pts["ur"] == {"x": 50, "y": 10050} - assert ref_pts["ll"] == {"x": 10050, "y": 50} - - -def test_grid_points(basic_grid): - pts = basic_grid.points(ring=True) - assert pts == [ - (50, 50), - (50, 10050), - (10050, 10050), - (10050, 50), - (50, 50), - ] - - -def test_polygon(basic_grid): - poly = basic_grid.polygon() - assert poly == polygon( - [ - (50, 50), - (50, 10050), - (10050, 10050), - (10050, 50), - (50, 50), - ], - crs=None, - ) - - -def test_grid_crs(basic_grid): - crs = CRS("EPSG:4326") - poly = basic_grid.polygon(crs) - assert poly == polygon( - [ - (50, 50), - (50, 10050), - (10050, 10050), - (10050, 50), - (50, 50), - ], - crs=crs, - ) - basic_grid.crs = crs - poly = basic_grid.polygon() - assert poly == polygon( - [ - (50, 50), - (50, 10050), - (10050, 10050), - (10050, 50), - (50, 50), - ], - crs=crs, - ) diff --git a/tests/test_properties.py b/tests/test_properties.py deleted file mode 100644 index 5308cee5..00000000 --- a/tests/test_properties.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Module -""" - -from enum import Enum - -import pytest - -from eo3.properties import FileFormat, 
degrees_type, of_enum_type, percent_type - - -class LowerEnum(Enum): - spam = 1 - bacon = 2 - eggs = 3 - beans = 4 - - -class UpperEnum(Enum): - SPAM = 1 - BACON = 2 - EGGS = 3 - BEANS = 4 - - -def test_of_enum_type(): - ff = of_enum_type(FileFormat) - assert ff("GeoTIFF") == "GeoTIFF" - assert ff(FileFormat.GeoTIFF) == "GeoTIFF" - with pytest.raises(ValueError): - assert ff("GeoTUFF") == "GeoTIFF" - ff = of_enum_type(FileFormat, strict=False) - assert ff("GeoTUFF") == "GeoTUFF" - - low = of_enum_type(LowerEnum, lower=True) - assert low("spam") == "spam" - assert low("BACON") == "bacon" - - upp = of_enum_type(UpperEnum, upper=True) - assert upp("spam") == "SPAM" - assert upp("BACON") == "BACON" - - -def test_percent_type(): - assert percent_type("2.22") == pytest.approx(2.22) - with pytest.raises(ValueError): - percent_type("-2.2") - with pytest.raises(ValueError): - percent_type("104.6666") - - -def test_degrees_type(): - assert degrees_type("355.3") == pytest.approx(355.3) - with pytest.raises(ValueError): - percent_type("-2.2") - with pytest.raises(ValueError): - percent_type("404.6666") diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..00859744 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,200 @@ +""" +Test utility functions +(tests copied from datacube-core/tests/test_utils_docs.py and test_utils_generic.py) +""" +from collections import OrderedDict +from pathlib import Path +from typing import Iterable, Tuple + +import numpy as np +import pytest + +from eo3.utils import ( + as_url, + jsonify_document, + netcdf_extract_string, + read_documents, + thread_local_cache, +) +from eo3.utils.utils import _open_from_s3, map_with_lookahead, transform_object_tree + + +@pytest.fixture +def sample_document_files(): + files = [ + ("multi_doc.yml", 3), + ("multi_doc.yml.gz", 3), + ("multi_doc.nc", 3), + ("single_doc.yaml", 1), + ("sample.json", 1), + ] + + files = [ + (str(Path(__file__).parent / "data" / f), num_docs) for f, num_docs in files + ] + + return files + + +def test_read_docs_from_local_path(sample_document_files): + _test_read_docs_impl(sample_document_files) + + +def test_read_docs_from_file_uris(sample_document_files): + uris = [("file://" + doc, ndocs) for doc, ndocs in sample_document_files] + _test_read_docs_impl(uris) + + +def test_read_docs_from_s3(sample_document_files, monkeypatch): + """ + Use a mocked S3 bucket to test reading documents from S3 + """ + boto3 = pytest.importorskip("boto3") + moto = pytest.importorskip("moto") + + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "fake") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "fake") + + with moto.mock_s3(): + s3 = boto3.resource("s3", region_name="us-east-1") + bucket = s3.create_bucket(Bucket="mybucket") + + mocked_s3_objs = [] + for abs_fname, ndocs in sample_document_files: + if abs_fname.endswith("gz") or abs_fname.endswith("nc"): + continue + + fname = Path(abs_fname).name + bucket.upload_file(abs_fname, fname) + + mocked_s3_objs.append(("s3://mybucket/" + fname, ndocs)) + + _test_read_docs_impl(mocked_s3_objs) + + with pytest.raises(RuntimeError): + with _open_from_s3("https://not-s3.ga/file.txt"): + pass + + +def test_read_docs_from_http(sample_document_files, httpserver): + http_docs = [] + for abs_fname, ndocs in sample_document_files: + if abs_fname.endswith("gz") or abs_fname.endswith("nc"): + continue + path = "/" + Path(abs_fname).name + + httpserver.expect_request(path).respond_with_data(open(abs_fname).read()) + http_docs.append((httpserver.url_for(path), ndocs)) + + 
_test_read_docs_impl(http_docs) + + +def _test_read_docs_impl(sample_documents: Iterable[Tuple[str, int]]): + # Test case for returning URIs pointing to documents + for doc_url, num_docs in sample_documents: + all_docs = list(read_documents(doc_url, uri=True)) + assert len(all_docs) == num_docs + + for uri, doc in all_docs: + assert isinstance(doc, dict) + assert isinstance(uri, str) + + url = as_url(doc_url) + if num_docs > 1: + expect_uris = [as_url(url) + f"#part={i}" for i in range(num_docs)] + else: + expect_uris = [as_url(url)] + + assert [f for f, _ in all_docs] == expect_uris + + +def test_netcdf_strings(): + assert netcdf_extract_string(np.asarray([b"a", b"b"])) == "ab" + txt = "some string" + assert netcdf_extract_string(txt) is txt + + +def test_jsonify(): + from datetime import datetime + from decimal import Decimal + from uuid import UUID + + assert sorted( + jsonify_document( + { + "a": (1.0, 2.0, 3.0), + "b": float("inf"), + "c": datetime(2016, 3, 11), + "d": np.dtype("int16"), + } + ).items() + ) == [ + ("a", (1.0, 2.0, 3.0)), + ("b", "Infinity"), + ("c", "2016-03-11T00:00:00"), + ("d", "int16"), + ] + + # Converts keys to strings: + assert sorted(jsonify_document({1: "a", "2": Decimal("2")}).items()) == [ + ("1", "a"), + ("2", "2"), + ] + + assert jsonify_document({"k": UUID("1f231570-e777-11e6-820f-185e0f80a5c0")}) == { + "k": "1f231570-e777-11e6-820f-185e0f80a5c0" + } + + +def test_transform_object_tree(): + def add_one(a): + return a + 1 + + assert transform_object_tree(add_one, [1, 2, 3]) == [2, 3, 4] + assert transform_object_tree(add_one, {"a": 1, "b": 2, "c": 3}) == { + "a": 2, + "b": 3, + "c": 4, + } + assert transform_object_tree(add_one, {"a": 1, "b": (2, 3), "c": [4, 5]}) == { + "a": 2, + "b": (3, 4), + "c": [5, 6], + } + assert transform_object_tree( + add_one, {1: 1, "2": 2, 3.0: 3}, key_transform=float + ) == {1.0: 2, 2.0: 3, 3.0: 4} + # Order must be maintained + assert transform_object_tree( + add_one, OrderedDict([("z", 1), ("w", 2), ("y", 3), ("s", 7)]) + ) == OrderedDict([("z", 2), ("w", 3), ("y", 4), ("s", 8)]) + + +def test_map_with_lookahead(): + def if_one(x): + return "one" + str(x) + + def if_many(x): + return "many" + str(x) + + assert list(map_with_lookahead(iter([]), if_one, if_many)) == [] + assert list(map_with_lookahead(iter([1]), if_one, if_many)) == [if_one(1)] + assert list(map_with_lookahead(range(5), if_one, if_many)) == list( + map(if_many, range(5)) + ) + assert list(map_with_lookahead(range(10), if_one=if_one)) == list(range(10)) + assert list(map_with_lookahead(iter([1]), if_many=if_many)) == [1] + + +def test_thread_local_cache(): + name = "test_0123394" + v = {} + + assert thread_local_cache(name, v) is v + assert thread_local_cache(name) is v + assert thread_local_cache(name, purge=True) is v + assert thread_local_cache(name, 33) == 33 + assert thread_local_cache(name, purge=True) == 33 + + assert thread_local_cache("no_such_key", purge=True) is None + assert thread_local_cache("no_such_key", 111, purge=True) == 111 diff --git a/tests/test_utils_aws.py b/tests/test_utils_aws.py new file mode 100644 index 00000000..c73461cc --- /dev/null +++ b/tests/test_utils_aws.py @@ -0,0 +1,157 @@ +# This file is part of the Open Data Cube, see https://opendatacube.org for more information +# +# Copyright (c) 2015-2023 ODC Contributors +# SPDX-License-Identifier: Apache-2.0 +import json +from unittest import mock + +import botocore +import pytest +from botocore.credentials import ReadOnlyCredentials + +from eo3.utils.aws import ( + _fetch_text, 
+    _s3_cache_key,
+    auto_find_region,
+    ec2_current_region,
+    s3_client,
+    s3_fmt_range,
+    s3_url_parse,
+)
+
+AWS_ENV_VARS = (
+    "AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN "
+    "AWS_DEFAULT_REGION AWS_DEFAULT_OUTPUT AWS_PROFILE "
+    "AWS_ROLE_SESSION_NAME AWS_CA_BUNDLE "
+    "AWS_SHARED_CREDENTIALS_FILE AWS_CONFIG_FILE"
+).split(" ")
+
+
+@pytest.fixture
+def without_aws_env(monkeypatch):
+    for e in AWS_ENV_VARS:
+        monkeypatch.delenv(e, raising=False)
+
+
+def _json(**kw):
+    return json.dumps(kw)
+
+
+def mock_urlopen(text, code=200):
+    m = mock.MagicMock()
+    m.getcode.return_value = code
+    m.read.return_value = text.encode("utf8")
+    m.__enter__.return_value = m
+    return m
+
+
+def test_ec2_current_region():
+    tests = [
+        (None, None),
+        (_json(region="TT"), "TT"),
+        (_json(x=3), None),
+        ("not valid json", None),
+    ]
+
+    for rv, expect in tests:
+        with mock.patch("eo3.utils.aws._fetch_text", return_value=rv):
+            assert ec2_current_region() == expect
+
+
+@mock.patch("eo3.utils.aws.botocore_default_region", return_value=None)
+def test_auto_find_region(*mocks):
+    with mock.patch("eo3.utils.aws._fetch_text", return_value=None):
+        with pytest.raises(ValueError):
+            auto_find_region()
+
+    with mock.patch("eo3.utils.aws._fetch_text", return_value=_json(region="TT")):
+        assert auto_find_region() == "TT"
+
+
+@mock.patch("eo3.utils.aws.botocore_default_region", return_value="tt-from-botocore")
+def test_auto_find_region_2(*mocks):
+    assert auto_find_region() == "tt-from-botocore"
+
+
+def test_fetch_text():
+    with mock.patch("eo3.utils.aws.urlopen", return_value=mock_urlopen("", 505)):
+        assert _fetch_text("http://localhost:8817") is None
+
+    with mock.patch("eo3.utils.aws.urlopen", return_value=mock_urlopen("text", 200)):
+        assert _fetch_text("http://localhost:8817") == "text"
+
+    def fake_urlopen(*args, **kw):
+        raise OSError("Always broken")
+
+    with mock.patch("eo3.utils.aws.urlopen", fake_urlopen):
+        assert _fetch_text("http://localhost:8817") is None
+
+
+def test_s3_basics(without_aws_env):
+    from botocore.credentials import ReadOnlyCredentials
+    from numpy import s_
+
+    assert s3_url_parse("s3://bucket/key") == ("bucket", "key")
+    assert s3_url_parse("s3://bucket/key/") == ("bucket", "key/")
+    assert s3_url_parse("s3://bucket/k/k/key") == ("bucket", "k/k/key")
+
+    with pytest.raises(ValueError):
+        s3_url_parse("file://some/path")
+
+    assert s3_fmt_range((0, 3)) == "bytes=0-2"
+    assert s3_fmt_range(s_[4:10]) == "bytes=4-9"
+    assert s3_fmt_range(s_[:10]) == "bytes=0-9"
+    assert s3_fmt_range(None) is None
+
+    for bad in (s_[10:], s_[-2:3], s_[:-3], (-1, 3), (3, -1), s_[1:100:3]):
+        with pytest.raises(ValueError):
+            s3_fmt_range(bad)
+
+    creds = ReadOnlyCredentials("fake-key", "fake-secret", None)
+
+    assert (
+        str(s3_client(region_name="kk")._endpoint) == "s3(https://s3.kk.amazonaws.com)"
+    )
+    assert (
+        str(s3_client(region_name="kk", use_ssl=False)._endpoint)
+        == "s3(http://s3.kk.amazonaws.com)"
+    )
+
+    s3 = s3_client(region_name="us-west-2", creds=creds)
+    assert s3 is not None
+
+
+def test_s3_unsigned(monkeypatch, without_aws_env):
+    s3 = s3_client(aws_unsigned=True)
+    assert s3._request_signer.signature_version == botocore.UNSIGNED
+
+    monkeypatch.setenv("AWS_UNSIGNED", "yes")
+    s3 = s3_client()
+    assert s3._request_signer.signature_version == botocore.UNSIGNED
+
+
+@mock.patch("eo3.utils.aws.ec2_current_region", return_value="us-west-2")
+def test_s3_client_cache(monkeypatch, without_aws_env):
+    monkeypatch.setenv("AWS_ACCESS_KEY_ID", "fake-key-id")
+
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "fake-secret") + + s3 = s3_client(cache=True) + assert s3 is s3_client(cache=True) + assert s3 is s3_client(cache="purge") + assert s3_client(cache="purge") is None + assert s3 is not s3_client(cache=True) + + opts = ( + dict(), + dict(region_name="foo"), + dict(region_name="bar"), + dict(profile="foo"), + dict(profile="foo", region_name="xxx"), + dict(profile="bar"), + dict(creds=ReadOnlyCredentials("fake1", "...", None)), + dict(creds=ReadOnlyCredentials("fake1", "...", None), region_name="custom"), + dict(creds=ReadOnlyCredentials("fake2", "...", None)), + ) + + keys = {_s3_cache_key(**o) for o in opts} + assert len(keys) == len(opts) diff --git a/tests/test_utils_uris.py b/tests/test_utils_uris.py new file mode 100644 index 00000000..56b0e7ca --- /dev/null +++ b/tests/test_utils_uris.py @@ -0,0 +1,165 @@ +""" +Test utility uri functions +(tests copied from datacube-core/tests/test_utils_other.py) +""" +import os +from pathlib import Path + +import pytest + +from eo3.utils import ( + as_url, + get_part_from_uri, + is_url, + is_vsipath, + mk_part_uri, + normalise_path, + uri_resolve, + uri_to_local_path, +) +from eo3.utils.uris import default_base_dir + + +def test_uri_to_local_path(): + if os.name == "nt": + assert "C:\\tmp\\test.tmp" == str(uri_to_local_path("file:///C:/tmp/test.tmp")) + assert "\\\\remote\\path\\file.txt" == str( + uri_to_local_path("file://remote/path/file.txt") + ) + + else: + assert "/tmp/something.txt" == str( + uri_to_local_path("file:///tmp/something.txt") + ) + + with pytest.raises(ValueError): + uri_to_local_path("file://remote/path/file.txt") + + assert uri_to_local_path(None) is None + + with pytest.raises(ValueError): + uri_to_local_path("ftp://example.com/tmp/something.txt") + + +def test_part_uri(): + base = "file:///foo.txt" + + for i in range(10): + assert get_part_from_uri(mk_part_uri(base, i)) == i + + assert get_part_from_uri("file:///f.txt") is None + assert get_part_from_uri("file:///f.txt#something_else") is None + assert get_part_from_uri("file:///f.txt#part=aa") == "aa" + assert get_part_from_uri("file:///f.txt#part=111") == 111 + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ("/foo/bar/file.txt", False), + ("file:///foo/bar/file.txt", True), + ("test.bar", False), + ("s3://mybucket/objname.tiff", True), + ("gs://mybucket/objname.tiff", True), + ("wasb://mybucket/objname.tiff", True), + ("wasbs://mybucket/objname.tiff", True), + ("ftp://host.name/filename.txt", True), + ("https://host.name.com/path/file.txt", True), + ("http://host.name.com/path/file.txt", True), + ("sftp://user:pass@host.name.com/path/file.txt", True), + ("file+gzip://host.name.com/path/file.txt", True), + ("bongo:host.name.com/path/file.txt", False), + ], +) +def test_is_url(test_input, expected): + assert is_url(test_input) == expected + if expected: + assert as_url(test_input) is test_input + + +@pytest.mark.parametrize( + "base", + [ + "s3://foo", + "gs://foo", + "wasb://foo", + "wasbs://foo", + "/vsizip//vsicurl/https://host.tld/some/path", + ], +) +def test_uri_resolve(base): + abs_path = "/abs/path/to/something" + some_uri = "http://example.com/file.txt" + + assert uri_resolve(base, abs_path) == "file://" + abs_path + assert uri_resolve(base, some_uri) is some_uri + assert uri_resolve(base, None) is base + assert uri_resolve(base, "") is base + assert uri_resolve(base, "relative/path") == base + "/relative/path" + assert uri_resolve(base + "/", "relative/path") == base + "/relative/path" + assert ( + 
uri_resolve(base + "/some/dir/", "relative/path") + == base + "/some/dir/relative/path" + ) + + if not is_vsipath(base): + assert ( + uri_resolve(base + "/some/dir/file.txt", "relative/path") + == base + "/some/dir/relative/path" + ) + + +def test_normalise_path(): + cwd = Path(".").resolve() + assert normalise_path(".").resolve() == cwd + + p = Path("/a/b/c/d.txt") + assert normalise_path(p) == Path(p) + assert normalise_path(str(p)) == Path(p) + + base = Path("/a/b/") + p = Path("c/d.txt") + assert normalise_path(p, base) == (base / p) + assert normalise_path(str(p), str(base)) == (base / p) + assert normalise_path(p) == (cwd / p) + + with pytest.raises(ValueError): + normalise_path(p, "not/absolute/path") + + +def test_default_base_dir(monkeypatch): + def set_pwd(p): + if p is None: + monkeypatch.delenv("PWD") + else: + monkeypatch.setenv("PWD", str(p)) + + cwd = Path(".").resolve() + + # Default base dir (once resolved) will never be different from cwd + assert default_base_dir().resolve() == cwd + + # should work when PWD is not set + set_pwd(None) + assert "PWD" not in os.environ + assert default_base_dir() == cwd + + # should work when PWD is not absolute path + set_pwd("this/is/not/a/valid/path") + assert default_base_dir() == cwd + + # should be cwd when PWD points to some other dir + set_pwd(cwd / "deeper") + assert default_base_dir() == cwd + + set_pwd(cwd.parent) + assert default_base_dir() == cwd + + # PWD == cwd + set_pwd(cwd) + assert default_base_dir() == cwd + + # TODO: + # - create symlink to current directory in temp + # - set PWD to that link + # - make sure that returned path is the same as symlink and different from cwd diff --git a/tests/test_verify.py b/tests/test_verify.py deleted file mode 100644 index 59b05ff4..00000000 --- a/tests/test_verify.py +++ /dev/null @@ -1,88 +0,0 @@ -import hashlib -import unittest -from textwrap import dedent - -from eo3 import verify - -from tests import write_files - - -class VerifyTests(unittest.TestCase): - def test_checksum(self): # noqa: T003 - d = write_files({"test1.txt": "test"}) - - test_file = d.joinpath("test1.txt") - - sha1_hash = verify.calculate_file_hash(test_file) - assert sha1_hash == "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3" - - md5_hash = verify.calculate_file_hash(test_file, hash_fn=hashlib.md5) - assert md5_hash == "098f6bcd4621d373cade4e832627b4f6" - - crc32_checksum = verify.calculate_file_crc32(test_file) - assert crc32_checksum == "d87f7e0c" - - def test_package_checksum(self): - d = write_files( - { - "test1.txt": "test", - "package": {"test2.txt": "test2", "test3.txt": "test3"}, - } - ) - - c = verify.PackageChecksum() - - c.add_file(d.joinpath("test1.txt")) - c.add_file(d.joinpath("package", "test3.txt")) - c.add_file(d.joinpath("package", "test2.txt").absolute()) - - checksums_file = d.joinpath("package.sha1") - c.write(checksums_file) - - with checksums_file.open("r") as f: - doc = f.read() - - # One (hash, file) per line separated by a tab. - # - File paths must be relative to the checksum file. - # - Output in filename alphabetical order. - assert ( - dedent( - """\ - 109f4b3c50d7b0df729d299bc6f8e9ef9066971f\tpackage/test2.txt - 3ebfa301dc59196f18593c45e519287a23297589\tpackage/test3.txt - a94a8fe5ccb19ba61c4c0873d391e987982fbbd3\ttest1.txt - """ - ) - == doc - ) - - # After dumping to a file, read()'ing from the file should give us identical values. 
- c2 = verify.PackageChecksum() - c2.read(checksums_file) - original_items = set(c.items()) - loaded_items = set(c2.items()) - assert original_items == loaded_items - assert c == c2 - # ... and a sanity check of our equals method: - assert c != verify.PackageChecksum() - - # Verification should succeed: - verification_results = set(c2.iteratively_verify()) - expected_verification = { - (d.joinpath("test1.txt").absolute(), True), - (d.joinpath("package", "test3.txt").absolute(), True), - (d.joinpath("package", "test2.txt").absolute(), True), - } - assert expected_verification == verification_results - - # Corrupt a file, and expect it to fail verification. - with d.joinpath("package", "test3.txt").open("w") as f: - f.write("Deliberate corruption!") - - expected_verification = { - (d.joinpath("test1.txt").absolute(), True), - (d.joinpath("package", "test3.txt").absolute(), False), - (d.joinpath("package", "test2.txt").absolute(), True), - } - verification_results = set(c2.iteratively_verify()) - assert expected_verification == verification_results