Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-5090 Lucene 9 version of the Lucene SAIL #5091

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions core/sail/lucene-v9/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-sail</artifactId>
<version>5.0.2-SNAPSHOT</version>
</parent>
<artifactId>rdf4j-sail-lucene-v9</artifactId>
<name>RDF4J: Lucene Sail Index</name>
<description>StackableSail implementation offering full-text search on literals, based on Apache Lucene.</description>
<properties>
<lucene.version>9.11.1</lucene.version>
</properties>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-sail-lucene-api</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-spatial-extras</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-backward-codecs</artifactId>
<version>${lucene.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-queryalgebra-geosparql</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-sail-memory</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.locationtech.jts</groupId>
<artifactId>jts-core</artifactId>
</dependency>
<dependency>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/*******************************************************************************
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.sail.lucene.impl;

import java.io.IOException;
import java.text.ParseException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LatLonPoint;
import org.apache.lucene.document.LatLonShape;
import org.apache.lucene.geo.Line;
import org.apache.lucene.geo.Polygon;
import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.geo.SimpleWKTShapeParser;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.sandbox.document.LatLonBoundingBox;
import org.apache.lucene.spatial.SpatialStrategy;
import org.eclipse.rdf4j.sail.lucene.LuceneSail;
import org.eclipse.rdf4j.sail.lucene.SearchDocument;
import org.eclipse.rdf4j.sail.lucene.SearchFields;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;

public class LuceneDocument implements SearchDocument {

private final Document doc;
private final Logger logger = LoggerFactory.getLogger(getClass());

private static final String POINT_FIELD_PREFIX = "_pt_";
private static final String GEO_FIELD_PREFIX = "_geo_";

private final Function<? super String, ? extends SpatialStrategy> geoStrategyMapper;

public LuceneDocument(Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) {
this(new Document(), geoStrategyMapper);
}

public LuceneDocument(Document doc, Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) {
this.doc = doc;
this.geoStrategyMapper = geoStrategyMapper;
}

public LuceneDocument(String id, String resourceId, String context,
Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) {
this(geoStrategyMapper);
setId(id);
setResource(resourceId);
setContext(context);
}

private void setId(String id) {
LuceneIndex.addIDField(id, doc);
}

private void setContext(String context) {
LuceneIndex.addContextField(context, doc);
}

private void setResource(String resourceId) {
LuceneIndex.addResourceField(resourceId, doc);
}

public Document getDocument() {
return doc;
}

@Override
public String getId() {
return doc.get(SearchFields.ID_FIELD_NAME);
}

@Override
public String getResource() {
return doc.get(SearchFields.URI_FIELD_NAME);
}

@Override
public String getContext() {
return doc.get(SearchFields.CONTEXT_FIELD_NAME);
}

@Override
public Set<String> getPropertyNames() {
List<IndexableField> fields = doc.getFields();
Set<String> names = new HashSet<>();
for (IndexableField field : fields) {
String name = field.name();
if (SearchFields.isPropertyField(name)) {
names.add(name);
}
}
return names;
}

@Override
public void addProperty(String name) {
// don't need to do anything
}

/**
* Stores and indexes a property in a Document. We don't have to recalculate the concatenated text: just add another
* TEXT field and Lucene will take care of this. Additional advantage: Lucene may be able to handle the invididual
* strings in a way that may affect e.g. phrase and proximity searches (concatenation basically means loss of
* information). NOTE: The TEXT_FIELD_NAME has to be stored, see in LuceneSail
*
* @see LuceneSail
*/
@Override
public void addProperty(String name, String text) {
LuceneIndex.addPredicateField(name, text, doc);
LuceneIndex.addTextField(text, doc);
}

/**
* Checks whether a field occurs with a specified value in a Document.
*/
@Override
public boolean hasProperty(String fieldName, String value) {
String[] fields = doc.getValues(fieldName);
if (fields != null) {
for (String field : fields) {
if (value.equals(field)) {
return true;
}
}
}

return false;
}

@Override
public List<String> getProperty(String name) {
return Arrays.asList(doc.getValues(name));
}

private void indexShape(Object shape, String field) {

if (shape instanceof Object[]) { // case of GEOMETRYCOLLECTION
Object[] geometries = (Object[]) shape;

for (int i = 0; i < geometries.length; i++) {
indexShape(geometries[i], field);
}
} else {
if (shape instanceof Polygon) { // WKT:POLYGON
for (Field f : LatLonShape.createIndexableFields(GEO_FIELD_PREFIX + field, (Polygon) shape)) {
doc.add(f);
}
} else if (shape instanceof Line) { // WKT:LINESTRING
for (Field f : LatLonShape.createIndexableFields(GEO_FIELD_PREFIX + field, (Line) shape)) {
doc.add(f);
}
} else if (shape instanceof double[]) { // WKT:POINT
double[] point = (double[]) shape;

for (Field f : LatLonShape.createIndexableFields(GEO_FIELD_PREFIX + field, point[1],
point[0])) {
doc.add(f);
}
doc.add(new LatLonPoint(POINT_FIELD_PREFIX + field, point[1], point[0]));
} else if (shape instanceof Rectangle) { // WKT:ENVELOPE / RECTANGLE
Rectangle box = (Rectangle) shape;
doc.add(new LatLonBoundingBox(GEO_FIELD_PREFIX + field, box.minLat, box.minLon, box.maxLat,
box.maxLon));
} else {
throw new IllegalArgumentException("Geometry for shape " + shape.toString() + " is not supported");
}
}
}

@Override
public void addGeoProperty(String field, String value) {
LuceneIndex.addStoredOnlyPredicateField(field, value, doc);
try {
String wkt = value;
Object shape = SimpleWKTShapeParser.parse(wkt);
indexShape(shape, field);
} catch (ParseException e) {
logger.warn("error while processing geo property", e);
} catch (IOException e) {
logger.warn("error while parsing wkt geometry", e);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*******************************************************************************
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.sail.lucene.impl;

import java.text.ParseException;
import java.util.List;
import java.util.Set;

import org.apache.lucene.search.ScoreDoc;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.sail.lucene.DocumentDistance;
import org.eclipse.rdf4j.sail.lucene.SearchFields;
import org.eclipse.rdf4j.sail.lucene.util.GeoUnits;
import org.locationtech.spatial4j.shape.Point;
import org.locationtech.spatial4j.shape.Shape;

import com.google.common.collect.Sets;

public class LuceneDocumentDistance extends LuceneDocumentResult implements DocumentDistance {

private final String geoProperty;

private final IRI units;

private final Point origin;

private static Set<String> requiredFields(String geoProperty, boolean includeContext) {
Set<String> fields = Sets.newHashSet(SearchFields.URI_FIELD_NAME, geoProperty);
if (includeContext) {
fields.add(SearchFields.CONTEXT_FIELD_NAME);
}
return fields;
}

public LuceneDocumentDistance(ScoreDoc doc, String geoProperty, IRI units, Point origin, boolean includeContext,
LuceneIndex index) {
super(doc, index, requiredFields(geoProperty, includeContext));
this.geoProperty = geoProperty;
this.units = units;
this.origin = origin;
}

@Override
public double getDistance() {
List<String> wkts = getDocument().getProperty(geoProperty);
double min = Double.POSITIVE_INFINITY;
for (String wkt : wkts) {
Shape shape;
try {
shape = index.getSpatialContext(geoProperty).readShapeFromWkt(wkt);
double dist = index.getSpatialContext(geoProperty).calcDistance(shape.getCenter(), origin);
min = Math.min(dist, min);
} catch (ParseException e) {
// ignore
}
}
return GeoUnits.fromDegrees(min, units);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*******************************************************************************
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.sail.lucene.impl;

import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.search.ScoreDoc;
import org.eclipse.rdf4j.sail.lucene.DocumentResult;
import org.eclipse.rdf4j.sail.lucene.SearchDocument;

public class LuceneDocumentResult implements DocumentResult {

protected final ScoreDoc scoreDoc;

protected final LuceneIndex index;

private final Set<String> fields;

private LuceneDocument fullDoc;

public LuceneDocumentResult(ScoreDoc doc, LuceneIndex index, Set<String> fields) {
this.scoreDoc = doc;
this.index = index;
this.fields = fields;
}

@Override
public SearchDocument getDocument() {
if (fullDoc == null) {
Document doc = index.getDocument(scoreDoc.doc, fields);
fullDoc = new LuceneDocument(doc, index.getSpatialStrategyMapper());
}
return fullDoc;
}
}
Loading
Loading