Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add compact DB mode (--compact-db) to de-duplicate mbtiles output #219

Merged
merged 10 commits into from
May 24, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import com.onthegomap.planetiler.VectorTile;
import com.onthegomap.planetiler.geo.TileCoord;
import com.onthegomap.planetiler.mbtiles.Mbtiles;
import com.onthegomap.planetiler.mbtiles.TileEncodingResult;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -51,7 +53,7 @@ void testStilInvalidWithOneTile() throws IOException {
VectorTile.encodeGeometry(point(0, 0)),
Map.of()
)));
writer.write(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()));
writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalInt.empty()));
}
assertInvalid(mbtiles);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package com.onthegomap.planetiler.benchmarks;

import com.google.common.base.Stopwatch;
import com.onthegomap.planetiler.config.Arguments;
import com.onthegomap.planetiler.geo.TileCoord;
import com.onthegomap.planetiler.mbtiles.Mbtiles;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BenchmarkMbtilesRead {
msbarry marked this conversation as resolved.
Show resolved Hide resolved

// Logger used to report per-database read statistics and the final pairwise comparison.
private static final Logger LOGGER = LoggerFactory.getLogger(BenchmarkMbtilesRead.class);

// Selects tile coordinates from "tiles" in random order; the single bind parameter is the maximum number of rows.
// The coordinates come back exactly as stored (tile_column, tile_row, zoom_level) - callers convert them as needed.
private static final String SELECT_RANDOM_COORDS =
"select tile_column, tile_row, zoom_level from tiles order by random() limit ?";

/**
 * Benchmarks random tile reads against one or more mbtiles files and logs how their read throughput compares.
 * <p>
 * Recognized arguments:
 * <ul>
 * <li>{@code bench_repetitions} - number of measured runs per file (default 10)</li>
 * <li>{@code bench_nr_tile_reads} - number of tiles read per run (default 500,000)</li>
 * <li>{@code bench_pre_warms} - number of unmeasured warm-up runs per file (default 3)</li>
 * <li>{@code bench_mbtiles} - list of mbtiles files to benchmark; the tile coordinates to read are sampled from the
 * first one and reused for every file</li>
 * </ul>
 *
 * @param args command-line arguments, parsed with {@link Arguments#fromArgs}
 * @throws IllegalArgumentException if no path was given or a path is not a regular file
 * @throws IllegalStateException    if the first file yields no tiles to sample, or a sampled tile is missing from
 *                                  one of the files
 * @throws Exception                if reading from a database fails
 */
public static void main(String[] args) throws Exception {

  Arguments arguments = Arguments.fromArgs(args);
  int repetitions = arguments.getInteger("bench_repetitions", "number of repetitions", 10);
  int nrTileReads = arguments.getInteger("bench_nr_tile_reads", "number of tiles to read", 500_000);
  int preWarms = arguments.getInteger("bench_pre_warms", "number of pre warm runs", 3);

  List<Path> mbtilesPaths = arguments.getList("bench_mbtiles", "the mbtiles file to read from", List.of()).stream()
    .map(Paths::get).toList();

  if (mbtilesPaths.isEmpty()) {
    throw new IllegalArgumentException("pass one or many paths to the same mbtiles file");
  }

  for (Path p : mbtilesPaths) {
    // isRegularFile is already false for missing paths, so a separate exists() check is not needed
    if (!Files.isRegularFile(p)) {
      throw new IllegalArgumentException("%s does not exist or is not a regular file".formatted(p));
    }
  }

  List<TileCoord> randomCoordsToFetchPerRepetition = new LinkedList<>();

  // Sample the coordinates once from the first file. The query is repeated until enough coordinates were collected,
  // since a file holding fewer tiles than requested returns fewer rows per query than asked for.
  Path sampleSource = mbtilesPaths.get(0);
  try (
    var db = Mbtiles.newReadOnlyDatabase(sampleSource);
    var statement = db.connection().prepareStatement(SELECT_RANDOM_COORDS)
  ) {
    do {
      int sizeBefore = randomCoordsToFetchPerRepetition.size();
      statement.setInt(1, nrTileReads - sizeBefore);
      try (var rs = statement.executeQuery()) {
        while (rs.next()) {
          int x = rs.getInt("tile_column");
          int y = rs.getInt("tile_row");
          int z = rs.getInt("zoom_level");
          // flip the row: the MBTiles spec stores tile_row in TMS order, TileCoord.ofXYZ expects XYZ order
          randomCoordsToFetchPerRepetition.add(TileCoord.ofXYZ(x, (1 << z) - 1 - y, z));
        }
      }
      // without this guard an empty tiles table would keep the loop spinning forever
      if (randomCoordsToFetchPerRepetition.size() == sizeBefore && sizeBefore < nrTileReads) {
        throw new IllegalStateException("no tiles found in %s".formatted(sampleSource));
      }
    } while (randomCoordsToFetchPerRepetition.size() < nrTileReads);
  }

  Map<Path, Double> avgReadOperationsPerSecondPerDb = new HashMap<>();
  for (Path dbPath : mbtilesPaths) {
    List<ReadResult> results = new LinkedList<>();

    LOGGER.info("working on {}", dbPath);

    // warm-up runs are executed but their results are discarded
    for (int preWarm = 0; preWarm < preWarms; preWarm++) {
      readEachTile(randomCoordsToFetchPerRepetition, dbPath);
    }

    for (int rep = 0; rep < repetitions; rep++) {
      results.add(readEachTile(randomCoordsToFetchPerRepetition, dbPath));
    }
    var readOperationsPerSecondStats =
      results.stream().mapToDouble(ReadResult::readOperationsPerSecond).summaryStatistics();
    LOGGER.info("readOperationsPerSecondStats: {}", readOperationsPerSecondStats);

    avgReadOperationsPerSecondPerDb.put(dbPath, readOperationsPerSecondStats.getAverage());
  }

  // slowest file first, so every logged pair compares a slower file (db0) to a faster or equal one (db1)
  List<Path> keysSorted = avgReadOperationsPerSecondPerDb.entrySet().stream()
    .sorted(Map.Entry.comparingByValue())
    .map(Map.Entry::getKey)
    .toList();

  LOGGER.info("diffs");
  for (int i = 0; i < keysSorted.size() - 1; i++) {
    for (int j = i + 1; j < keysSorted.size(); j++) {
      Path db0 = keysSorted.get(i);
      double avg0 = avgReadOperationsPerSecondPerDb.get(db0);
      Path db1 = keysSorted.get(j);
      double avg1 = avgReadOperationsPerSecondPerDb.get(db1);

      // relative change of db1 over db0 in percent
      double diff = avg1 * 100 / avg0 - 100;

      LOGGER.info("\"{}\" to \"{}\": avg read operations per second improved by {}%", db0, db1, diff);
    }
  }
}

private static ReadResult readEachTile(List<TileCoord> coordsToFetch, Path dbPath) throws IOException {
try (var db = Mbtiles.newReadOnlyDatabase(dbPath)) {
db.getTile(0, 0, 0); // trigger prepared statement creation
var totalSw = Stopwatch.createStarted();
for (var coordToFetch : coordsToFetch) {
msbarry marked this conversation as resolved.
Show resolved Hide resolved
if (db.getTile(coordToFetch) == null) {
throw new IllegalStateException("%s should exist in %s".formatted(coordToFetch, dbPath));
}
}
totalSw.stop();
return new ReadResult(totalSw.elapsed(), coordsToFetch.size());
}
}

/**
 * Outcome of one read pass.
 *
 * @param duration           wall-clock time the pass took
 * @param coordsFetchedCount number of tiles fetched during the pass
 */
private record ReadResult(Duration duration, int coordsFetchedCount) {

  /** Returns the throughput of the pass in tile reads per second. */
  double readOperationsPerSecond() {
    final double nanosPerSecond = 1E9;
    return coordsFetchedCount / (duration.toNanos() / nanosPerSecond);
  }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
package com.onthegomap.planetiler.benchmarks;

import com.google.common.base.Stopwatch;
import com.onthegomap.planetiler.config.Arguments;
import com.onthegomap.planetiler.config.PlanetilerConfig;
import com.onthegomap.planetiler.geo.TileCoord;
import com.onthegomap.planetiler.mbtiles.Mbtiles;
import com.onthegomap.planetiler.mbtiles.Mbtiles.BatchedTileWriter;
import com.onthegomap.planetiler.mbtiles.TileEncodingResult;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.DoubleSummaryStatistics;
import java.util.OptionalInt;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BenchmarkMbtilesWriter {
msbarry marked this conversation as resolved.
Show resolved Hide resolved

private static final Logger LOGGER = LoggerFactory.getLogger(BenchmarkMbtilesWriter.class);

/**
 * Benchmarks batched tile writes: for each repetition a fresh temporary mbtiles file is created, a synthetic mix of
 * distinct and duplicated tiles is written to it, and the achieved tile writes per second are recorded. The
 * summary statistics over all repetitions are logged at the end.
 * <p>
 * The defaults of the {@code bench_*} arguments are loosely based on figures the original author measured on an
 * Australia extract; the queries used for that are quoted next to each argument.
 *
 * @param args command-line arguments, parsed with {@link Arguments#fromArgs}
 * @throws IOException if the database cannot be closed or the temporary file cannot be deleted
 */
public static void main(String[] args) throws IOException {

  Arguments arguments = Arguments.fromArgs(args);

  int tilesToWrite = arguments.getInteger("bench_tiles_to_write", "number of tiles to write", 1_000_000);
  int repetitions = arguments.getInteger("bench_repetitions", "number of repetitions", 10);

  // Share of tiles with unique content, ~8% for Australia:
  //   select count(distinct(tile_data_id)) * 100.0 / count(*) from tiles_shallow
  int distinctTilesInPercent = arguments.getInteger("bench_distinct_tiles", "distinct tiles in percent", 10);

  // Average size of tile data that is used exactly once, ~785 bytes for Australia:
  //   select avg(length(tile_data))
  //   from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) = 1) as x
  //   join tiles_data using(tile_data_id)
  int distinctTileDataSize =
    arguments.getInteger("bench_distinct_tile_data_size", "distinct tile data size in bytes", 800);

  // Average size of tile data that is shared by several tiles, ~93 bytes for Australia:
  //   select avg(length(tile_data))
  //   from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) > 1) as x
  //   join tiles_shallow using(tile_data_id)
  //   join tiles_data using(tile_data_id)
  int dupeTileDataSize = arguments.getInteger("bench_dupe_tile_data_size", "dupe tile data size in bytes", 100);

  // Number of shared tile data entries relative to the tiles using them, ~0.17% for Australia:
  //   select count(*) * 100.0 / sum(usage_count)
  //   from (select tile_data_id, count(*) as usage_count from tiles_shallow group by tile_data_id having count(*) > 1)
  int dupeSpreadInPercent = arguments.getInteger("bench_dupe_spread", "dupe spread in percent", 10);

  byte[] distinctTileData = createFilledByteArray(distinctTileDataSize);
  byte[] dupeTileData = createFilledByteArray(dupeTileDataSize);

  PlanetilerConfig config = PlanetilerConfig.from(arguments);

  DoubleSummaryStatistics tileWritesPerSecondsStats = new DoubleSummaryStatistics();

  for (int run = 0; run < repetitions; run++) {

    // every run writes into its own fresh file, removed again once the run is over
    Path tempDb = getTempOutputPath();
    try (var mbtiles = Mbtiles.newWriteToFileDatabase(tempDb, config.compactDb())) {

      mbtiles.createTables();
      if (!config.deferIndexCreation()) {
        mbtiles.addTileIndex();
      }

      try (var writer = mbtiles.newBatchedTileWriter()) {
        // only the write calls are timed; the stopwatch is stopped before the writer is closed
        Stopwatch timer = Stopwatch.createStarted();
        writeTiles(writer, tilesToWrite, distinctTilesInPercent, distinctTileData, dupeTileData, dupeSpreadInPercent);
        timer.stop();

        double elapsedSeconds = timer.elapsed(TimeUnit.NANOSECONDS) / 1E9;
        tileWritesPerSecondsStats.accept(tilesToWrite / elapsedSeconds);
      }

    } finally {
      Files.delete(tempDb);
    }
  }

  LOGGER.info("tileWritesPerSecondsStats: {}", tileWritesPerSecondsStats);
}


/**
 * Writes up to {@code tilesToWrite} synthetic tiles to {@code writer}, walking the tile pyramid from zoom 0 through
 * zoom 14 (x outer, y inner) and stopping as soon as the requested number of tiles has been written.
 * <p>
 * Within every run of 100 tiles the first {@code distinctTilesInPercent} are written as distinct tiles (no content
 * hash), the rest as duplicates carrying a content hash. Duplicate hashes cycle through {@code dupeHashMod} values,
 * where {@code dupeHashMod} is {@code dupeSpreadInPercent} percent of the expected number of duplicates; when that
 * rounds to zero all duplicates share hash 0.
 * <p>
 * Zoom levels 0 to 14 hold 357,913,941 tiles in total, so fewer than {@code tilesToWrite} tiles are written if a
 * larger number is requested.
 *
 * @param writer                 destination for the generated tiles
 * @param tilesToWrite           number of tiles to write; nothing is written if this is not positive
 * @param distinctTilesInPercent share of tiles (0-100) written as distinct tiles
 * @param distinctTileData       payload used for every distinct tile
 * @param dupeTileData           payload used for every duplicate tile
 * @param dupeSpreadInPercent    number of different duplicate hashes, in percent of the number of duplicates
 */
private static void writeTiles(BatchedTileWriter writer, int tilesToWrite, int distinctTilesInPercent,
  byte[] distinctTileData, byte[] dupeTileData, int dupeSpreadInPercent) {

  // the bound is only checked after a write below, so guard here to avoid writing one tile for a count of zero
  if (tilesToWrite <= 0) {
    return;
  }

  // widen to long before multiplying: the int products overflow for tile counts in the tens of millions
  int dupesToWrite = (int) Math.round((long) tilesToWrite * (100 - distinctTilesInPercent) / 100.0);
  int dupeHashMod = (int) Math.round((long) dupesToWrite * dupeSpreadInPercent / 100.0);
  int tilesWritten = 0;
  int dupeCounter = 0;
  for (int z = 0; z <= 14; z++) {
    int maxCoord = 1 << z;
    for (int x = 0; x < maxCoord; x++) {
      for (int y = 0; y < maxCoord; y++) {

        TileCoord coord = TileCoord.ofXYZ(x, y, z);
        TileEncodingResult toWrite;
        if (tilesWritten % 100 < distinctTilesInPercent) {
          toWrite = new TileEncodingResult(coord, distinctTileData, OptionalInt.empty());
        } else {
          ++dupeCounter;
          // a modulus of zero would throw, fall back to a single shared hash instead
          int hash = dupeHashMod == 0 ? 0 : dupeCounter % dupeHashMod;
          toWrite = new TileEncodingResult(coord, dupeTileData, OptionalInt.of(hash));
        }

        writer.write(toWrite);

        if (++tilesWritten >= tilesToWrite) {
          return;
        }
      }
    }
  }
}

/**
 * Creates an empty temporary file named {@code planetiler*.mbtiles} and registers it for deletion on JVM exit.
 *
 * @return the path of the newly created file
 * @throws IllegalStateException if the file cannot be created
 */
private static Path getTempOutputPath() {
  try {
    File tempFile = File.createTempFile("planetiler", ".mbtiles");
    // safety net in case the caller does not get to delete the file itself
    tempFile.deleteOnExit();
    return tempFile.toPath();
  } catch (IOException e) {
    throw new IllegalStateException(e);
  }
}

/**
 * Returns a new array of {@code len} pseudo-random bytes. The generator is always seeded with 0, so equal lengths
 * yield equal contents.
 *
 * @param len number of bytes to generate
 * @return the filled array
 */
private static byte[] createFilledByteArray(int len) {
  Random seeded = new Random(0);
  byte[] filled = new byte[len];
  seeded.nextBytes(filled);
  return filled;
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.onthegomap.planetiler.collection;

import com.carrotsearch.hppc.LongLongHashMap;
import com.google.common.primitives.Longs;
import com.onthegomap.planetiler.Profile;
import com.onthegomap.planetiler.VectorTile;
import com.onthegomap.planetiler.config.PlanetilerConfig;
Expand All @@ -12,6 +13,7 @@
import com.onthegomap.planetiler.util.CloseableConusmer;
import com.onthegomap.planetiler.util.CommonStringEncoder;
import com.onthegomap.planetiler.util.DiskBacked;
import com.onthegomap.planetiler.util.Hashing;
import com.onthegomap.planetiler.util.LayerStats;
import java.io.Closeable;
import java.io.IOException;
Expand Down Expand Up @@ -341,6 +343,22 @@ public TileCoord tileCoord() {
return tileCoord;
}

/**
* Generates a hash over the feature's relevant data: layer, geometry, and attributes. The coordinates are
* <b>not</b> part of the hash.
* <p>
* Used as an optimization to avoid writing the same (ocean) tiles over and over again.
*/
public int generateContentHash() {
int hash = Hashing.FNV1_32_INIT;
for (var feature : entries) {
long layerId = extractLayerIdFromKey(feature.key());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

layerId starts as a byte - can we just keep it a byte and avoid Longs.toByteArray ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure! sorry, got fooled by #hasSameContents in which the result of #extractLayerIdFromKey is assigned to a variable of type long

hash = Hashing.fnv32(hash, Longs.toByteArray(layerId));
hash = Hashing.fnv32(hash, feature.value());
}
return hash;
}

/**
* Returns true if {@code other} contains features with identical layer, geometry, and attributes, as this tile -
* even if the tiles have separate coordinates.
Expand All @@ -363,6 +381,7 @@ public boolean hasSameContents(TileFeatures other) {
return true;
}


private VectorTile.Feature decodeVectorTileFeature(SortableFeature entry) {
try (MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(entry.value())) {
long group;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ public record PlanetilerConfig(
double minFeatureSizeBelowMaxZoom,
double simplifyToleranceAtMaxZoom,
double simplifyToleranceBelowMaxZoom,
boolean osmLazyReads
boolean osmLazyReads,
boolean compactDb
) {

public static final int MIN_MINZOOM = 0;
Expand Down Expand Up @@ -135,6 +136,9 @@ public static PlanetilerConfig from(Arguments arguments) {
0.1d),
arguments.getBoolean("osm_lazy_reads",
"Read OSM blocks from disk in worker threads",
false),
arguments.getBoolean("compact_db",
"Reduce the DB size by separating and deduping the tile data",
false)
);
}
Expand Down
Loading