Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid deserializing entire parquet geometry just to determine type #898

Merged
merged 1 commit into from
May 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

import com.fasterxml.jackson.annotation.JsonProperty;
import com.onthegomap.planetiler.expression.Expression;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Locale;
import java.util.regex.Pattern;
import org.locationtech.jts.geom.Geometry;
import org.locationtech.jts.geom.Lineal;
import org.locationtech.jts.geom.Polygonal;
Expand Down Expand Up @@ -41,6 +45,39 @@ public static GeometryType valueOf(VectorTileProto.Tile.GeomType geomType) {
};
}

/**
 * Returns the type of a WKB-encoded geometry without needing to deserialize the whole thing.
 * <p>
 * Only the 5-byte header is inspected: one byte-order byte ({@code 1} means little-endian, anything else is read as
 * big-endian) followed by a 32-bit geometry type word. ISO WKB encodes Z/M/ZM variants by adding 1000/2000/3000 to the
 * base type, and EWKB sets flag bits in the upper half of the type word, so both are stripped before the base type is
 * looked at.
 *
 * @param wkb the WKB-encoded geometry
 * @return {@link GeometryType#POINT} for (multi)points, {@link GeometryType#LINE} for (multi)linestrings,
 *         {@link GeometryType#POLYGON} for (multi)polygons, or {@link GeometryType#UNKNOWN} for any other type code or
 *         when {@code wkb} is {@code null} or too short to contain a header
 */
public static GeometryType fromWKB(byte[] wkb) {
  // 1 byte-order byte + 4-byte type word; anything shorter cannot be sniffed
  if (wkb == null || wkb.length < 5) {
    return GeometryType.UNKNOWN;
  }
  var bb = ByteBuffer.wrap(wkb);
  byte byteOrder = bb.get();
  int typeWord = bb.order(byteOrder == 1 ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN).getInt();
  // mask out EWKB flag bits (Z/M/SRID live in the high bits), then drop the ISO dimension offset (x1000)
  int geomType = (typeWord & 0xffff) % 1000;
  return switch (geomType) {
    case 1, 4 -> GeometryType.POINT;
    case 2, 5 -> GeometryType.LINE;
    case 3, 6 -> GeometryType.POLYGON;
    default -> GeometryType.UNKNOWN;
  };
}

/** Matches the leading keyword of a WKT string: optional whitespace, optional "multi", then the base type name. */
private static final Pattern TYPE_PATTERN =
  Pattern.compile("^\\s*(multi)?(point|line|polygon)", Pattern.CASE_INSENSITIVE);

/**
 * Returns the type of a WKT-encoded geometry by looking only at its leading keyword, without parsing coordinates.
 *
 * @param wkt the WKT-encoded geometry
 * @return the geometry type named by the leading keyword, or {@link GeometryType#UNKNOWN} when the text does not start
 *         with a (multi) point, line or polygon keyword
 */
public static GeometryType fromWKT(String wkt) {
  var keyword = TYPE_PATTERN.matcher(wkt);
  if (!keyword.find()) {
    return GeometryType.UNKNOWN;
  }
  // group 2 is the base type with any "multi" prefix already split off into group 1
  String baseType = keyword.group(2);
  if (baseType == null) {
    return GeometryType.UNKNOWN;
  }
  String normalized = baseType.toLowerCase(Locale.ROOT);
  if ("point".equals(normalized)) {
    return GeometryType.POINT;
  }
  if ("line".equals(normalized)) {
    return GeometryType.LINE;
  }
  if ("polygon".equals(normalized)) {
    return GeometryType.POLYGON;
  }
  return GeometryType.UNKNOWN;
}

/**
 * Looks up the geometry type for a numeric vector tile geometry type code.
 *
 * @param val number passed to {@code VectorTileProto.Tile.GeomType.forNumber}
 * @return the result of the {@code valueOf} overload that accepts the resolved {@code GeomType}
 */
public static GeometryType valueOf(byte val) {
  VectorTileProto.Tile.GeomType protoType = VectorTileProto.Tile.GeomType.forNumber(val);
  return valueOf(protoType);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,59 @@

import com.onthegomap.planetiler.geo.GeoUtils;
import com.onthegomap.planetiler.geo.GeometryException;
import com.onthegomap.planetiler.geo.GeometryType;
import com.onthegomap.planetiler.reader.WithTags;
import com.onthegomap.planetiler.util.FunctionThatThrows;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import org.locationtech.jts.geom.Geometry;

/**
* Decodes geometries from a parquet record based on the {@link GeoParquetMetadata} provided.
*/
class GeometryReader {

private final Map<String, FunctionThatThrows<Object, Geometry>> converters = new HashMap<>();
private final Map<String, FormatHandler> converters = new HashMap<>();
final String geometryColumn;

/**
 * The pair of functions registered for one geometry column encoding.
 *
 * @param parse     turns a raw column value into a {@link Geometry}; used when the full geometry is needed
 * @param sniffType returns the {@link GeometryType} for a raw column value; used when only the type is needed
 */
private record FormatHandler(
FunctionThatThrows<Object, Geometry> parse,
Function<Object, GeometryType> sniffType
) {}

/**
 * Builds the handler for an encoding whose values arrive as lists and whose geometry type is fixed up front.
 *
 * @param type   the geometry type reported for every value, regardless of its contents
 * @param parser converts a list value into a {@link Geometry}
 * @return a handler whose parse function returns {@code null} for any value that is not a {@link List}
 */
private static <L extends List<?>> FormatHandler arrowHandler(GeometryType type,
  FunctionThatThrows<L, Geometry> parser) {
  FunctionThatThrows<Object, Geometry> parse = value -> {
    if (value instanceof List<?> items) {
      return parser.apply((L) items);
    }
    return null;
  };
  // the type is known from the encoding alone, so the value itself is never inspected
  Function<Object, GeometryType> sniff = ignored -> type;
  return new FormatHandler(parse, sniff);
}

GeometryReader(GeoParquetMetadata geoparquet) {
this.geometryColumn = geoparquet.primaryColumn();
for (var entry : geoparquet.columns().entrySet()) {
String column = entry.getKey();
GeoParquetMetadata.ColumnMetadata columnInfo = entry.getValue();
FunctionThatThrows<Object, Geometry> converter = switch (columnInfo.encoding()) {
case "WKB" -> obj -> obj instanceof byte[] bytes ? GeoUtils.wkbReader().read(bytes) : null;
case "WKT" -> obj -> obj instanceof String string ? GeoUtils.wktReader().read(string) : null;
FormatHandler converter = switch (columnInfo.encoding()) {
case "WKB" -> new FormatHandler(
obj -> obj instanceof byte[] bytes ? GeoUtils.wkbReader().read(bytes) : null,
obj -> obj instanceof byte[] bytes ? GeometryType.fromWKB(bytes) : GeometryType.UNKNOWN
);
case "WKT" -> new FormatHandler(
obj -> obj instanceof String string ? GeoUtils.wktReader().read(string) : null,
obj -> obj instanceof String string ? GeometryType.fromWKT(string) : GeometryType.UNKNOWN
);
case "multipolygon", "geoarrow.multipolygon" ->
obj -> obj instanceof List<?> list ? GeoArrow.multipolygon((List<List<List<Object>>>) list) : null;
arrowHandler(GeometryType.POLYGON, GeoArrow::multipolygon);
case "polygon", "geoarrow.polygon" ->
obj -> obj instanceof List<?> list ? GeoArrow.polygon((List<List<Object>>) list) : null;
arrowHandler(GeometryType.POLYGON, GeoArrow::polygon);
case "multilinestring", "geoarrow.multilinestring" ->
obj -> obj instanceof List<?> list ? GeoArrow.multilinestring((List<List<Object>>) list) : null;
arrowHandler(GeometryType.LINE, GeoArrow::multilinestring);
case "linestring", "geoarrow.linestring" ->
obj -> obj instanceof List<?> list ? GeoArrow.linestring((List<Object>) list) : null;
arrowHandler(GeometryType.LINE, GeoArrow::linestring);
case "multipoint", "geoarrow.multipoint" ->
obj -> obj instanceof List<?> list ? GeoArrow.multipoint((List<Object>) list) : null;
case "point", "geoarrow.point" -> GeoArrow::point;
arrowHandler(GeometryType.POINT, GeoArrow::multipoint);
case "point", "geoarrow.point" ->
arrowHandler(GeometryType.POINT, GeoArrow::point);
default -> throw new IllegalArgumentException("Unhandled type: " + columnInfo.encoding());
};
converters.put(column, converter);
Expand All @@ -58,9 +77,17 @@ Geometry parseGeometry(Object value, String column) throws GeometryException {
throw new GeometryException("no_converter", "No geometry converter for " + column);
}
try {
return converter.apply(value);
return converter.parse.apply(value);
} catch (Exception e) {
throw new GeometryException("error_reading", "Error reading " + column, e);
}
}

/**
 * Determines the geometry type of a raw column value using the handler registered for {@code column}.
 *
 * @param value  the raw geometry value, may be {@code null}
 * @param column name of the geometry column the value came from
 * @return the sniffed type, or {@link GeometryType#UNKNOWN} when {@code value} is {@code null} or no handler is
 *         registered for {@code column}
 */
GeometryType sniffGeometryType(Object value, String column) {
  FormatHandler handler = converters.get(column);
  if (handler == null || value == null) {
    return GeometryType.UNKNOWN;
  }
  return handler.sniffType().apply(value);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.onthegomap.planetiler.geo.GeoUtils;
import com.onthegomap.planetiler.geo.GeometryException;
import com.onthegomap.planetiler.geo.GeometryType;
import com.onthegomap.planetiler.reader.SourceFeature;
import com.onthegomap.planetiler.reader.Struct;
import java.util.List;
Expand All @@ -21,6 +22,7 @@ public class ParquetFeature extends SourceFeature {
private Geometry latLon;
private Geometry world;
private Struct struct = null;
private GeometryType geometryType = null;

ParquetFeature(String source, String sourceLayer, long id, GeometryReader geometryParser,
Map<String, Object> tags) {
Expand All @@ -40,31 +42,39 @@ public Geometry worldGeometry() throws GeometryException {
(world = GeoUtils.sortPolygonsByAreaDescending(GeoUtils.latLonToWorldCoords(latLonGeometry())));
}

/**
 * Returns the geometry type of this feature, computing it on first use and caching it in {@code geometryType}.
 * <p>
 * The type is first sniffed from the raw value through the geometry parser. Only when that yields
 * {@link GeometryType#UNKNOWN} is the full geometry obtained from {@code latLonGeometry()} and classified by its JTS
 * interface.
 *
 * @return the resolved type; {@link GeometryType#UNKNOWN} when neither step can classify the geometry
 * @throws IllegalStateException wrapping the {@link GeometryException} if reading the full geometry fails
 */
private GeometryType geometryType() {
  if (geometryType != null) {
    return geometryType;
  }
  geometryType = geometryParser.sniffGeometryType(rawGeometry, geometryParser.geometryColumn);
  if (geometryType == GeometryType.UNKNOWN) {
    try {
      geometryType = switch (latLonGeometry()) {
        case Puntal ignored -> GeometryType.POINT;
        case Lineal ignored -> GeometryType.LINE;
        case Polygonal ignored -> GeometryType.POLYGON;
        // A pattern switch without "case null" throws NullPointerException on a null selector. Treat a missing
        // geometry as UNKNOWN instead, matching a plain instanceof check which is simply false for null.
        // NOTE(review): assumes latLonGeometry() can return null (the parse handlers do) - confirm.
        case null, default -> GeometryType.UNKNOWN;
      };
    } catch (GeometryException e) {
      throw new IllegalStateException(e);
    }
  }
  return geometryType;
}

@Override
public boolean isPoint() {
try {
return latLonGeometry() instanceof Puntal;
} catch (GeometryException e) {
throw new IllegalStateException(e);
}
return geometryType() == GeometryType.POINT;
}

@Override
public boolean canBePolygon() {
try {
return latLonGeometry() instanceof Polygonal;
} catch (GeometryException e) {
throw new IllegalStateException(e);
}
return geometryType() == GeometryType.POLYGON;
}

@Override
public boolean canBeLine() {
try {
return latLonGeometry() instanceof Lineal;
} catch (GeometryException e) {
throw new IllegalStateException(e);
}
return geometryType() == GeometryType.LINE;
}

private Struct cachedStruct() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
package com.onthegomap.planetiler.geo;

import static java.util.Collections.emptyList;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import com.onthegomap.planetiler.TestUtils;
import com.onthegomap.planetiler.reader.SimpleFeature;
import java.util.Map;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.locationtech.jts.io.ParseException;
import org.locationtech.jts.io.WKBWriter;
import org.locationtech.jts.io.WKTReader;

class GeometryTypeTest {

Expand All @@ -22,20 +30,38 @@ void testGeometryFactory() {
SimpleFeature.createFakeOsmFeature(TestUtils.newPolygon(0, 0, 1, 0, 1, 1, 0, 0), tags, "osm", null, 1,
emptyList());

Assertions.assertTrue(GeometryType.LINE.featureTest().evaluate(line));
Assertions.assertFalse(GeometryType.LINE.featureTest().evaluate(point));
Assertions.assertFalse(GeometryType.LINE.featureTest().evaluate(poly));
assertTrue(GeometryType.LINE.featureTest().evaluate(line));
assertFalse(GeometryType.LINE.featureTest().evaluate(point));
assertFalse(GeometryType.LINE.featureTest().evaluate(poly));

Assertions.assertFalse(GeometryType.POINT.featureTest().evaluate(line));
Assertions.assertTrue(GeometryType.POINT.featureTest().evaluate(point));
Assertions.assertFalse(GeometryType.POINT.featureTest().evaluate(poly));
assertFalse(GeometryType.POINT.featureTest().evaluate(line));
assertTrue(GeometryType.POINT.featureTest().evaluate(point));
assertFalse(GeometryType.POINT.featureTest().evaluate(poly));

Assertions.assertFalse(GeometryType.POLYGON.featureTest().evaluate(line));
Assertions.assertFalse(GeometryType.POLYGON.featureTest().evaluate(point));
Assertions.assertTrue(GeometryType.POLYGON.featureTest().evaluate(poly));
assertFalse(GeometryType.POLYGON.featureTest().evaluate(line));
assertFalse(GeometryType.POLYGON.featureTest().evaluate(point));
assertTrue(GeometryType.POLYGON.featureTest().evaluate(poly));

Assertions.assertThrows(Exception.class, () -> GeometryType.UNKNOWN.featureTest().evaluate(point));
Assertions.assertThrows(Exception.class, () -> GeometryType.UNKNOWN.featureTest().evaluate(line));
Assertions.assertThrows(Exception.class, () -> GeometryType.UNKNOWN.featureTest().evaluate(poly));
assertThrows(Exception.class, () -> GeometryType.UNKNOWN.featureTest().evaluate(point));
assertThrows(Exception.class, () -> GeometryType.UNKNOWN.featureTest().evaluate(line));
assertThrows(Exception.class, () -> GeometryType.UNKNOWN.featureTest().evaluate(poly));
}

// Each row is "expected type; WKT". The WKT is sniffed directly, then re-encoded as WKB and sniffed again.
@ParameterizedTest
@CsvSource(value = {
  "POINT; POINT EMPTY",
  "POINT; POINT(1 1)",
  "POINT; MULTIPOINT(1 1, 2 2)",
  "LINE; lineString(1 1, 2 2)",
  "LINE; LINESTRING ZM(1 1 2 3, 2 2 4 5)",
  "LINE; multiLineString((1 1, 2 2))",
  "POLYGON; POLYGON((0 0, 0 1, 1 0, 0 0))",
  "POLYGON; MULTIPOLYGON(((0 0, 0 1, 1 0, 0 0)))",
  "UNKNOWN; GEOMETRYCOLLECTION EMPTY",
}, delimiter = ';')
void testSniffTypes(GeometryType expected, String wkt) throws ParseException {
  GeometryType sniffedFromText = GeometryType.fromWKT(wkt);
  assertEquals(expected, sniffedFromText);

  var parsed = new WKTReader().read(wkt);
  byte[] encoded = new WKBWriter().write(parsed);
  GeometryType sniffedFromBinary = GeometryType.fromWKB(encoded);
  assertEquals(expected, sniffedFromBinary);
}
}
Loading