Skip to content

Commit

Permalink
Merge pull request #314 from ckan/dcat-us-3
Browse files Browse the repository at this point in the history
New profile for DCAT US v3
  • Loading branch information
amercader authored Oct 31, 2024
2 parents ac1c34b + 5f79509 commit bf5cf51
Show file tree
Hide file tree
Showing 27 changed files with 6,019 additions and 30 deletions.
4 changes: 2 additions & 2 deletions ckanext/dcat/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,9 @@ def before_dataset_index(self, dataset_dict):
# Index a flattened version
new_key = f'extras_{field["field_name"]}__{key}'
if not dataset_dict.get(new_key):
dataset_dict[new_key] = value
dataset_dict[new_key] = str(value)
else:
dataset_dict[new_key] += ' ' + value
dataset_dict[new_key] += ' ' + str(value)

subfields = dataset_dict.pop(field['field_name'], None)
if field['field_name'] == 'spatial_coverage':
Expand Down
3 changes: 3 additions & 0 deletions ckanext/dcat/profiles/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from .base import RDFProfile, CleanedURIRef
from .base import (
CNT,
RDF,
XSD,
SKOS,
RDFS,
DCAT,
DCATAP,
DCATUS,
DCT,
ADMS,
VCARD,
Expand All @@ -21,5 +23,6 @@
from .euro_dcat_ap import EuropeanDCATAPProfile
from .euro_dcat_ap_2 import EuropeanDCATAP2Profile
from .euro_dcat_ap_3 import EuropeanDCATAP3Profile
from .dcat_us_3 import DCATUS3Profile
from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile
from .schemaorg import SchemaOrgProfile
10 changes: 9 additions & 1 deletion ckanext/dcat/profiles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from dateutil.parser import parse as parse_date
from rdflib import term, URIRef, BNode, Literal
from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS
from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS, ORG
from geomet import wkt, InvalidGeoJSONException

from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound
Expand All @@ -13,9 +13,11 @@
from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
from ckanext.dcat.validators import is_year, is_year_month, is_date

CNT = Namespace("http://www.w3.org/2011/content#")
DCT = Namespace("http://purl.org/dc/terms/")
DCAT = Namespace("http://www.w3.org/ns/dcat#")
DCATAP = Namespace("http://data.europa.eu/r5r/")
DCATUS = Namespace("http://resources.data.gov/ontology/dcat-us#")
ADMS = Namespace("http://www.w3.org/ns/adms#")
VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")
Expand All @@ -27,9 +29,11 @@
SPDX = Namespace("http://spdx.org/rdf/terms#")

namespaces = {
"cnt": CNT,
"dct": DCT,
"dcat": DCAT,
"dcatap": DCATAP,
"dcatus": DCATUS,
"adms": ADMS,
"vcard": VCARD,
"foaf": FOAF,
Expand All @@ -39,6 +43,7 @@
"locn": LOCN,
"gsp": GSP,
"owl": OWL,
"org": ORG,
"spdx": SPDX,
}

Expand Down Expand Up @@ -805,6 +810,9 @@ def _read_list_value(self, value):
items = value.split(",")
else:
items = [value] # Normal text value
elif isinstance(value, ((int, float, complex))):
items = [value] # number

return items

def _add_spatial_value_to_graph(self, spatial_ref, predicate, value):
Expand Down
311 changes: 311 additions & 0 deletions ckanext/dcat/profiles/dcat_us_3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,311 @@
import json
from decimal import DecimalException

from rdflib import Literal, BNode, URIRef

from ckanext.dcat.profiles import (
CNT,
DCAT,
DCATUS,
DCT,
FOAF,
RDF,
RDFS,
SKOS,
XSD,
)
from ckanext.dcat.utils import resource_uri

from .base import URIRefOrLiteral, CleanedURIRef, ORG
from .euro_dcat_ap_3 import EuropeanDCATAP3Profile


class DCATUS3Profile(EuropeanDCATAP3Profile):
    """
    An RDF profile based on the DCAT-US 3 for data portals in the US

    It extends the European DCAT-AP 3 profile: the public entry points below
    chain the inherited ``_parse_dataset_*`` / ``_graph_from_dataset_*`` helpers
    and then apply the DCAT-US specific handling implemented in
    ``_parse_dataset_v3_us`` and ``_graph_from_dataset_v3_us``.
    """

    def parse_dataset(self, dataset_dict, dataset_ref):
        """
        Populate ``dataset_dict`` from the triples describing ``dataset_ref``.

        Returns the dict produced by the inherited parsing helpers, updated in
        place with the DCAT-US 3 specific keys.
        """

        # Call base method for common properties
        dataset_dict = self._parse_dataset_base(dataset_dict, dataset_ref)

        # DCAT AP v2 properties also applied to higher versions
        dataset_dict = self._parse_dataset_v2(dataset_dict, dataset_ref)

        # DCAT AP v2 scheming fields
        dataset_dict = self._parse_dataset_v2_scheming(dataset_dict, dataset_ref)

        # DCAT US v3 properties (this helper mutates dataset_dict in place)
        self._parse_dataset_v3_us(dataset_dict, dataset_ref)

        return dataset_dict

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        """
        Add the triples describing ``dataset_dict`` to the graph, using
        ``dataset_ref`` as the dataset node.

        The DCAT-US step runs last because it adjusts triples (document and
        publisher types) created by the earlier steps.
        """

        # Call base method for common properties
        self._graph_from_dataset_base(dataset_dict, dataset_ref)

        # DCAT AP v2 properties also applied to higher versions
        self._graph_from_dataset_v2(dataset_dict, dataset_ref)

        # DCAT AP v2 scheming fields
        self._graph_from_dataset_v2_scheming(dataset_dict, dataset_ref)

        # DCAT AP v3 properties also applied to higher versions
        self._graph_from_dataset_v3(dataset_dict, dataset_ref)

        # DCAT US v3 properties also applied to higher versions
        self._graph_from_dataset_v3_us(dataset_dict, dataset_ref)

    def graph_from_catalog(self, catalog_dict, catalog_ref):
        """
        Add the catalog triples to the graph (base catalog properties only).
        """

        self._graph_from_catalog_base(catalog_dict, catalog_ref)

    def _data_dictionary_parse(self, data_dict, subject):
        """
        Read the ``dcat-us:describedBy`` objects of ``subject`` (a dataset or a
        distribution) into ``data_dict["data_dictionary"]``.

        A literal object is stored as a plain string. Any other node is read
        as a dict with the optional keys ``url``, ``format`` and ``license``
        and appended to a list (nodes exposing none of them are ignored).

        Returns the same ``data_dict``, updated in place.
        """

        g = self.g

        for data_dictionary_ref in g.objects(subject, DCATUS.describedBy):
            if isinstance(data_dictionary_ref, Literal):
                data_dict["data_dictionary"] = str(data_dictionary_ref)
                continue

            # Make sure there is a list to append to (this replaces a string
            # stored by a previous literal value)
            if not isinstance(data_dict.get("data_dictionary"), list):
                data_dict["data_dictionary"] = []

            data_dictionary_dict = {}
            for predicate, key in (
                (DCAT.accessURL, "url"),
                (DCT["format"], "format"),
                (DCT.license, "license"),
            ):
                value = self._object_value(data_dictionary_ref, predicate)
                if value:
                    data_dictionary_dict[key] = value
            if data_dictionary_dict:
                data_dict["data_dictionary"].append(data_dictionary_dict)

        return data_dict

    def _data_dictionary_graph(self, data_dict, subject):
        """
        Adds triples related to the data dictionary property of a Dataset
        or a Distribution

        A string value is added as a literal. For a list of dicts only the
        first item is used: if it has a ``url`` it is added as a blank node
        typed ``dcat:Distribution`` with its access URL and, when present,
        its format and license.

        TODO: Link somehow to the DataStore data dictionary if that exists
        and is public
        """

        g = self.g

        data_dictionary = self._get_dict_value(data_dict, "data_dictionary")
        if isinstance(data_dictionary, str):
            g.add((subject, DCATUS.describedBy, Literal(data_dictionary)))
            return

        if not (
            isinstance(data_dictionary, list)
            and data_dictionary
            and isinstance(data_dictionary[0], dict)
        ):
            return

        # Only the first data dictionary of the list is serialized
        data_dictionary = data_dictionary[0]
        if not data_dictionary.get("url"):
            return

        data_dictionary_ref = BNode()
        g.add((data_dictionary_ref, RDF.type, DCAT.Distribution))
        self._add_triple_from_dict(
            data_dictionary,
            data_dictionary_ref,
            DCAT.accessURL,
            "url",
            _type=URIRef,
            _class=RDFS.Resource,
        )
        if data_dictionary.get("format"):
            self._add_triple_from_dict(
                data_dictionary,
                data_dictionary_ref,
                DCT["format"],
                "format",
                _type=URIRefOrLiteral,
                _class=DCT.MediaTypeOrExtent,
            )
        # TODO: fallback to dataset / distribution one
        if data_dictionary.get("license"):
            self._add_triple_from_dict(
                data_dictionary,
                data_dictionary_ref,
                DCT.license,
                "license",
                _type=URIRefOrLiteral,
                _class=DCT.LicenseDocument,
            )
        g.add((subject, DCATUS.describedBy, data_dictionary_ref))

    def _parse_dataset_v3_us(self, dataset_dict, dataset_ref):
        """
        Read the DCAT-US 3 specific properties of ``dataset_ref`` and of its
        distributions into ``dataset_dict`` (updated in place).

        Dataset keys set when values are found: ``bbox``, ``data_dictionary``,
        ``liability``, ``contributor``, ``purpose`` and ``usage``. Resource
        keys: ``identifier``, ``temporal_resolution``, ``character_encoding``
        and ``data_dictionary``.
        """

        g = self.g

        # Bounding box
        for bbox_ref in g.objects(dataset_ref, DCATUS.geographicBoundingBox):
            if not dataset_dict.get("bbox"):
                dataset_dict["bbox"] = []
            dataset_dict["bbox"].append(
                {
                    "west": self._object_value(bbox_ref, DCATUS.westBoundingLongitude),
                    "east": self._object_value(bbox_ref, DCATUS.eastBoundingLongitude),
                    "north": self._object_value(bbox_ref, DCATUS.northBoundingLatitude),
                    "south": self._object_value(bbox_ref, DCATUS.southBoundingLatitude),
                }
            )

        # Data dictionary
        self._data_dictionary_parse(dataset_dict, dataset_ref)

        # Liability statement
        value = self._object_value(dataset_ref, DCATUS.liabilityStatement)
        if value:
            dataset_dict["liability"] = value

        # Contributors
        contributors = self._agents_details(dataset_ref, DCT.contributor)
        if contributors:
            dataset_dict["contributor"] = list(contributors)

        # List fields
        for key, predicate in (
            ("purpose", DCATUS.purpose),
            ("usage", SKOS.scopeNote),
        ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                dataset_dict[key] = values

        for distribution_ref in self._distributions(dataset_ref):

            # Resources are matched to their distribution node via the
            # "distribution_ref" key, expected to be present on each resource
            for resource_dict in dataset_dict.get("resources", []):
                if resource_dict["distribution_ref"] != str(distribution_ref):
                    continue

                for key, predicate in (
                    # Distribution identifier
                    ("identifier", DCT.identifier),
                    # Temporal resolution
                    ("temporal_resolution", DCAT.temporalResolution),
                    # Character encoding
                    ("character_encoding", CNT.characterEncoding),
                ):
                    value = self._object_value(distribution_ref, predicate)
                    if value:
                        resource_dict[key] = value

                # Data dictionary
                self._data_dictionary_parse(resource_dict, distribution_ref)

    def _graph_from_dataset_v3_us(self, dataset_dict, dataset_ref):
        """
        Add the DCAT-US 3 specific triples for ``dataset_dict`` and its
        resources, and adjust some triples already present in the graph
        (types of untitled documents, type and label of publishers).
        """

        g = self.g

        # Remove foaf:Document class from landing page and documentation if there
        # is no title defined for them
        # See Usage note in https://doi-do.github.io/dcat-us/#properties-for-document
        for predicate in (DCAT.landingPage, FOAF.page):
            # Materialize the objects first, as the graph is modified below
            for doc_ref in list(g.objects(dataset_ref, predicate)):
                # Each document is checked for its *own* title
                if (doc_ref, DCT.title, None) not in g:
                    g.remove((doc_ref, RDF.type, None))

        for publisher_ref in list(g.objects(dataset_ref, DCT.publisher)):

            # Use org:Organization instead of foaf:Agent
            g.remove((publisher_ref, RDF.type, None))
            g.add((publisher_ref, RDF.type, ORG.Organization))

            # Add skos:prefLabel
            name = self._object_value(publisher_ref, FOAF.name)
            if name:
                g.add((publisher_ref, SKOS.prefLabel, Literal(name)))

        # Bounding box
        # TODO: we could fall back to spatial or spatial_coverage's bbox/geom
        bboxes = self._get_dataset_value(dataset_dict, "bbox")
        for bbox in bboxes or []:
            bbox_ref = BNode()
            g.add((dataset_ref, DCATUS.geographicBoundingBox, bbox_ref))
            # The node is typed with the class (capitalized), not with the
            # geographicBoundingBox property that links it to the dataset
            g.add((bbox_ref, RDF.type, DCATUS.GeographicBoundingBox))

            for predicate, value in (
                (DCATUS.westBoundingLongitude, bbox["west"]),
                (DCATUS.eastBoundingLongitude, bbox["east"]),
                (DCATUS.northBoundingLatitude, bbox["north"]),
                (DCATUS.southBoundingLatitude, bbox["south"]),
            ):
                try:
                    g.add((bbox_ref, predicate, Literal(value, datatype=XSD.decimal)))
                except (ValueError, TypeError, DecimalException):
                    # Fall back to an untyped literal if the value can not
                    # be handled as a decimal
                    g.add((bbox_ref, predicate, Literal(value)))

        # Data dictionary
        self._data_dictionary_graph(dataset_dict, dataset_ref)

        # Liability statement
        self._add_statement_to_graph(
            dataset_dict,
            "liability",
            dataset_ref,
            DCATUS.liabilityStatement,
            DCATUS.LiabilityStatement,
        )

        # Contributor
        self._add_agents(dataset_ref, dataset_dict, "contributor", DCT.contributor)

        # Lists
        items = [
            ("purpose", DCATUS.purpose, None, Literal),
            ("usage", SKOS.scopeNote, None, Literal),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        for resource_dict in dataset_dict.get("resources", []):

            distribution_ref = CleanedURIRef(resource_uri(resource_dict))

            # Distribution identifier
            self._add_triple_from_dict(
                resource_dict,
                distribution_ref,
                DCT.identifier,
                "identifier",
                fallbacks=["guid", "id"],
                _type=URIRefOrLiteral,
            )

            # Data dictionary
            self._data_dictionary_graph(resource_dict, distribution_ref)

            # Character encoding
            self._add_triple_from_dict(
                resource_dict,
                distribution_ref,
                CNT.characterEncoding,
                "character_encoding",
                _type=Literal,
            )
Loading

0 comments on commit bf5cf51

Please sign in to comment.