Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide a new plugin for indexing repeating subfields #414

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,13 @@ When using a plain string translations will be provided with gettext:
This field is the parent of a group of repeating subfields. The value is
a list of fields entered the same way as normal fields.

> **_NOTE:_** CKAN needs an IPackageController plugin with `before_index` to
> convert repeating subfields to formats that can be indexed by solr. For
> testing you may use the included `scheming_nerf_index` plugin to encode
> all repeating fields as JSON strings to prevent solr errors.
> [!NOTE]
> CKAN needs an IPackageController plugin with `before_dataset_index`
> (`before_index` on CKAN versions before 2.10) to convert repeating
> subfields to formats that can be indexed by Solr. The included
> `scheming_subfields_index` plugin groups the values of each subfield into
> a single `extras_{field_name}__{key}` text field so that the values can be
> found with free-text search. If you require more precise handling of a
> particular subfield, you will need to customize the Solr schema to add
> the necessary fields.

`repeating_label` may be used to provide a singular version of the label
for each group.
Expand Down
47 changes: 47 additions & 0 deletions ckanext/scheming/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,53 @@ def before_index(self, data_dict):
return data_dict


class SchemingSubfieldsIndexPlugin(p.SingletonPlugin):
    """
    Index repeating dataset subfields as flat text so that indexing does not
    fail on an unmodified Solr schema.

    The values of each subfield are grouped, across all repetitions of the
    parent field, into a single ``extras_{field_name}__{key}`` entry: a
    space-separated text value that allows free-text search on the subfield.
    The original nested field is removed from the indexed dict.

    Nested dict values are not indexed, list values are joined with spaces,
    and ``None`` or empty values are skipped. If you require more precise
    handling of a particular subfield you will need to customize the Solr
    schema and index the relevant subfields yourself.
    """
    p.implements(p.IPackageController, inherit=True)

    def before_dataset_index(self, data_dict):
        """Delegate to :meth:`before_index` (same hook under another name)."""
        return self.before_index(data_dict)

    def before_index(self, data_dict):
        """
        Flatten the repeating subfields found in ``data_dict``.

        :param data_dict: the dataset dict about to be sent to the search
            index; it is modified in place
        :returns: the same ``data_dict``, with every repeating-subfields
            field replaced by ``extras_{field_name}__{key}`` text entries.
            Dicts whose ``type`` has no scheming schema are returned
            unchanged.
        """
        schemas = SchemingDatasetsPlugin.instance._expanded_schemas
        dataset_type = data_dict.get('type')
        if dataset_type not in schemas:
            return data_dict

        schema = schemas[dataset_type]

        for field in schema['dataset_fields']:
            field_name = field['field_name']
            if field_name not in data_dict or 'repeating_subfields' not in field:
                continue

            for item in data_dict[field_name] or []:
                # Only dict-shaped repetitions carry subfields to flatten
                if not isinstance(item, dict):
                    continue
                for key, value in item.items():
                    text = self._subfield_text(value)
                    if not text:
                        continue
                    # Index a flattened version
                    new_key = 'extras_{field_name}__{key}'.format(
                        field_name=field_name, key=key
                    )
                    existing = data_dict.get(new_key)
                    if existing:
                        data_dict[new_key] = u'{} {}'.format(existing, text)
                    else:
                        data_dict[new_key] = text

            # The nested value itself is not sent to the index
            data_dict.pop(field_name, None)

        return data_dict

    @staticmethod
    def _subfield_text(value):
        """
        Return the text to index for a single subfield value.

        Returns an empty string for values that are not indexed (``None``
        and nested dicts). List values are joined with spaces, ignoring
        ``None`` and nested containers. Any other value (numbers, booleans,
        strings) is converted to text so it can be concatenated safely.
        """
        if value is None or isinstance(value, dict):
            return u''
        if isinstance(value, (list, tuple)):
            return u' '.join(
                u'{}'.format(v) for v in value
                if v is not None and not isinstance(v, (dict, list, tuple))
            )
        # format() rather than str() so unicode text also survives on Python 2
        return u'{}'.format(value)


def _load_schemas(schemas, type_field):
out = {}
for n in schemas:
Expand Down
45 changes: 45 additions & 0 deletions ckanext/scheming/tests/test_subfields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
try:
from unittest import mock
except ImportError:
import mock

import pytest
import ckantoolkit

from ckantoolkit.tests.factories import Dataset
from ckantoolkit.tests.helpers import call_action


# Dataset payload shared by the tests below: a dataset of type
# "test-subfields" with two repetitions of the "contact_address" field.
dataset_dict = dict(
    name="test-dataset",
    type="test-subfields",
    # Repeating subfields
    contact_address=[
        {
            "address": "Maple Street 123",
            "city": "New Paris",
            "country": "Maplonia",
        },
        {
            "address": "Rose Avenue 452",
            "city": "Old York",
            "country": "Rosestan",
        },
    ],
)


@pytest.mark.usefixtures("with_plugins", "clean_db")
@pytest.mark.ckan_config("ckan.plugins", "scheming_datasets scheming_subfields_index")
def test_repeating_subfields_index():
    """Values of each subfield are grouped into one flattened index field."""
    expected_fields = [
        ("extras_contact_address__city", "New Paris Old York"),
        ("extras_contact_address__country", "Maplonia Rosestan"),
    ]

    # Patch the Solr connection factory so the indexed document can be
    # inspected on the mock.
    with mock.patch("ckan.lib.search.index.make_connection") as connection_mock:
        call_action("package_create", **dataset_dict)

        # Dict sent to Solr: first document of the second recorded call
        indexed_doc = connection_mock.mock_calls[1].kwargs["docs"][0]
        for index_key, index_value in expected_fields:
            assert indexed_doc[index_key] == index_value


@pytest.mark.usefixtures("with_plugins", "clean_db")
@pytest.mark.ckan_config("ckan.plugins", "scheming_datasets scheming_subfields_index")
def test_repeating_subfields_search():
    """A free-text query on a subfield value returns the created dataset."""
    created = call_action("package_create", **dataset_dict)

    # "Old York" only appears as the city of the second contact address
    matches = call_action("package_search", q="Old York")["results"]

    assert matches[0]["id"] == created["id"]
18 changes: 16 additions & 2 deletions ckanext/scheming/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,22 @@
not_empty = get_validator("not_empty")


# Module-wide markers: every test in this module uses the "with_plugins"
# fixture and runs with the plugin list below as "ckan.plugins".
pytestmark = [
    pytest.mark.usefixtures("with_plugins"),
    pytest.mark.ckan_config(
        "ckan.plugins",
        " ".join((
            "scheming_datasets",
            "scheming_groups",
            "scheming_organizations",
            "scheming_test_plugin",
            "scheming_subfields_index",
            "scheming_test_validation",
        )),
    ),
]


class TestGetValidatorOrConverter(object):
def test_missing(self):
with pytest.raises(SchemingException):
Expand Down Expand Up @@ -941,8 +957,6 @@ def test_invalid_choice(self):
raise AssertionError("ValidationError not raised")


@pytest.mark.ckan_config("ckan.plugins", "scheming_test_validation")
@pytest.mark.usefixtures("with_plugins")
class TestValidatorsFromString:
def test_empty(self):
assert validators_from_string("", {}, {}) == []
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
scheming_groups=ckanext.scheming.plugins:SchemingGroupsPlugin
scheming_organizations=ckanext.scheming.plugins:SchemingOrganizationsPlugin
scheming_nerf_index=ckanext.scheming.plugins:SchemingNerfIndexPlugin
scheming_subfields_index=ckanext.scheming.plugins:SchemingSubfieldsIndexPlugin
scheming_test_subclass=ckanext.scheming.tests.plugins:SchemingTestSubclass
scheming_test_plugin=ckanext.scheming.tests.plugins:SchemingTestSchemaPlugin
scheming_test_validation=ckanext.scheming.tests.plugins:SchemingTestValidationPlugin
Expand Down
2 changes: 1 addition & 1 deletion test.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ port = 5000
use = config:../../src/ckan/test-core.ini

ckan.plugins = scheming_datasets scheming_groups scheming_organizations
scheming_test_plugin scheming_nerf_index
scheming_test_plugin scheming_subfields_index
scheming.dataset_schemas = ckanext.scheming:ckan_dataset.yaml
ckanext.scheming.tests:test_schema.json
ckanext.scheming.tests:test_subfields.yaml
Expand Down
Loading