Skip to content

Commit

Permalink
Fix string decoding, and double precision alignment.
Browse files Browse the repository at this point in the history
  • Loading branch information
twitchax committed Aug 15, 2024
1 parent a35e52a commit 9542412
Show file tree
Hide file tree
Showing 15 changed files with 66 additions and 48 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ This implementation trades binary size for performance by employing an in-binary

![Bench](static/bench.png)

On average, for random cities, the OSM dataset lookup time is around `1.5 μs`, and the NED dataset lookup time is around `400 ns`.

![Bench](static/bench_cities_osm.png)

### Free Server

Below is the sample performance to resolve a time zone from a `(lng,lat)` pair to one of the data centers using a concurrency of 1,000, achieving 8,000 RPS.
Expand Down
Binary file modified assets/ned_time_zone_lookup.bincode
Binary file not shown.
Binary file modified assets/osm_admin_lookup.bincode
Binary file not shown.
Binary file modified assets/osm_time_zone_lookup.bincode
Binary file not shown.
1 change: 1 addition & 0 deletions rtz-build/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ default = []
full = ["tz-ned", "tz-osm", "admin-osm", "self-contained", "rtz-core/full"]

force-rebuild = []
double-precision = ["rtz-core/double-precision"]
self-contained = ["rtz-core/self-contained"]

tz-ned = ["rtz-core/tz-ned"]
Expand Down
12 changes: 3 additions & 9 deletions rtz-build/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ fn generate_self_contained_bincodes() {

#[cfg(all(feature = "tz-ned", feature = "self-contained"))]
fn generate_ned_tz_bincodes() {
use std::path::Path;

use rtz_core::geo::{
shared::generate_bincodes,
tz::ned::{get_geojson_features_from_source, NedTimezone, LOOKUP_BINCODE_DESTINATION_NAME, TIMEZONE_BINCODE_DESTINATION_NAME},
Expand All @@ -34,7 +32,7 @@ fn generate_ned_tz_bincodes() {
let lookup_bincode_destination = &format!("../assets/{}", LOOKUP_BINCODE_DESTINATION_NAME);

#[cfg(not(feature = "force-rebuild"))]
if Path::new(timezone_bincode_destination).exists() && Path::new(lookup_bincode_destination).exists() {
if std::path::Path::new(timezone_bincode_destination).exists() && std::path::Path::new(lookup_bincode_destination).exists() {
return;
}

Expand All @@ -46,8 +44,6 @@ fn generate_ned_tz_bincodes() {

#[cfg(all(feature = "tz-osm", feature = "self-contained"))]
fn generate_osm_tz_bincodes() {
use std::path::Path;

use rtz_core::geo::{
shared::generate_bincodes,
tz::osm::{get_geojson_features_from_source, OsmTimezone, LOOKUP_BINCODE_DESTINATION_NAME, TIMEZONE_BINCODE_DESTINATION_NAME},
Expand All @@ -57,7 +53,7 @@ fn generate_osm_tz_bincodes() {
let lookup_bincode_destination = &format!("../assets/{}", LOOKUP_BINCODE_DESTINATION_NAME);

#[cfg(not(feature = "force-rebuild"))]
if Path::new(timezone_bincode_destination).exists() && Path::new(lookup_bincode_destination).exists() {
if std::path::Path::new(timezone_bincode_destination).exists() && std::path::Path::new(lookup_bincode_destination).exists() {
return;
}

Expand All @@ -69,8 +65,6 @@ fn generate_osm_tz_bincodes() {

#[cfg(all(feature = "admin-osm", feature = "self-contained"))]
fn generate_osm_admin_bincodes() {
use std::path::Path;

use rtz_core::geo::{
admin::osm::{get_geojson_features_from_source, OsmAdmin, ADMIN_BINCODE_DESTINATION_NAME, LOOKUP_BINCODE_DESTINATION_NAME},
shared::generate_bincodes,
Expand All @@ -80,7 +74,7 @@ fn generate_osm_admin_bincodes() {
let lookup_bincode_destination = &format!("../assets/{}", LOOKUP_BINCODE_DESTINATION_NAME);

#[cfg(not(feature = "force-rebuild"))]
if Path::new(admin_bincode_destination).exists() && Path::new(lookup_bincode_destination).exists() {
if std::path::Path::new(admin_bincode_destination).exists() && std::path::Path::new(lookup_bincode_destination).exists() {
return;
}

Expand Down
8 changes: 3 additions & 5 deletions rtz-core/src/geo/admin/osm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use bincode::{

use crate::{
base::types::Float,
geo::shared::{get_geojson_feature_from_string, simplify_geometry, CanGetGeoJsonFeaturesFromSource, EncodableGeometry, EncodableString, HasGeometry, HasProperties},
geo::shared::{get_geojson_feature_from_string, simplify_geometry, CanGetGeoJsonFeaturesFromSource, EncodableGeometry, EncodableString, HasGeometry, HasProperties, IdFeaturePair},
};

use super::shared::IsAdmin;
Expand Down Expand Up @@ -58,7 +58,6 @@ pub fn get_geojson_features_from_source() -> geojson::FeatureCollection {
})
.map(|f| {
let json = std::fs::read_to_string(f.path()).unwrap();

get_geojson_feature_from_string(&json)
})
.collect::<Vec<_>>();
Expand All @@ -74,7 +73,6 @@ pub fn get_geojson_features_from_source() -> geojson::FeatureCollection {
///
/// Hard-coded to directories on the local machine for now; a repository will be created at some point.
pub static ADDRESS: &str = "D://LargeData//admin_data//admin2;D://LargeData//admin_data//admin3;D://LargeData//admin_data//admin4;D://LargeData//admin_data//admin5;D://LargeData//admin_data//admin6;D://LargeData//admin_data//admin7;D://LargeData//admin_data//admin8";
//pub static ADDRESS: &str = "D://LargeData//admin_data//admin8_small";
/// The name of the admin bincode file.
pub static ADMIN_BINCODE_DESTINATION_NAME: &str = "osm_admins.bincode";
/// The name of the cache bincode file.
Expand Down Expand Up @@ -141,8 +139,8 @@ impl PartialEq for OsmAdmin {
}
}

impl From<(usize, geojson::Feature)> for OsmAdmin {
fn from(value: (usize, geojson::Feature)) -> OsmAdmin {
impl From<IdFeaturePair> for OsmAdmin {
fn from(value: IdFeaturePair) -> OsmAdmin {
let id = value.0;
let properties = value.1.properties.as_ref().unwrap();
let geometry = value.1.geometry.as_ref().unwrap();
Expand Down
63 changes: 41 additions & 22 deletions rtz-core/src/geo/shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// it is not included in the coverage report.
#![cfg(not(tarpaulin_include))]

use core::str;
use std::{
borrow::Cow,
collections::HashMap,
Expand All @@ -14,7 +15,7 @@ use std::{
use chashmap::CHashMap;
use geo::{Coord, Geometry, Intersects, LineString, MultiPolygon, Polygon, Rect, SimplifyVw};
use geojson::{Feature, FeatureCollection, GeoJson};
use rayon::prelude::{IntoParallelIterator, ParallelIterator};
use rayon::prelude::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
use serde_json::{Map, Value};
use std::path::Path;

Expand Down Expand Up @@ -59,10 +60,10 @@ impl<T> Deref for ConcreteVec<T> {

impl<T> From<geojson::FeatureCollection> for ConcreteVec<T>
where
T: From<IdFeaturePair>,
T: From<IdFeaturePair> + Send,
{
fn from(value: geojson::FeatureCollection) -> ConcreteVec<T> {
let values = value.features.into_iter().enumerate().map(T::from).collect::<Vec<T>>();
let values = value.features.into_par_iter().enumerate().map(T::from).collect::<Vec<T>>();

ConcreteVec(values)
}
Expand Down Expand Up @@ -131,9 +132,10 @@ impl Decode for EncodableString {
where
D: Decoder,
{
let cow = Cow::<'static, str>::decode(decoder)?;
let data = Vec::decode(decoder)?;

Ok(EncodableString(cow))
// Now, we can limit the slice to trim the null padding.
unpad_string_alignment(&data).map(ToString::to_string).map(Cow::Owned).map(EncodableString)
}
}

Expand All @@ -143,9 +145,14 @@ impl<'de> BorrowDecode<'de> for EncodableString {
where
D: BorrowDecoder<'de>,
{
let cow = Cow::<'static, str>::decode(decoder)?;
let length = usize::decode(decoder)?;
let slice = decoder.borrow_reader().take_bytes(length * std::mem::size_of::<u8>())?;

Ok(EncodableString(cow))
// SAFETY: We know this slice is built into the binary, and it has a static lifetime.
let slice = unsafe { std::mem::transmute::<&'_ [u8], &'static [u8]>(slice) };

// Now, we can limit the slice to trim the null padding.
unpad_string_alignment(slice).map(Cow::Borrowed).map(EncodableString)
}
}

Expand Down Expand Up @@ -207,17 +214,17 @@ impl Decode for EncodableOptionString {
{
let variant = usize::decode(decoder)?;

let cow = match variant {
0 => None,
let result = match variant {
0 => EncodableOptionString(None),
1 => {
let cow = Cow::<'static, str>::decode(decoder)?;
let es = EncodableString::decode(decoder)?;

Some(cow)
EncodableOptionString(Some(es.0))
}
_ => panic!("Unsupported variant."),
};

Ok(EncodableOptionString(cow))
Ok(result)
}
}

Expand All @@ -229,17 +236,17 @@ impl<'de> BorrowDecode<'de> for EncodableOptionString {
{
let variant = usize::decode(decoder)?;

let cow = match variant {
0 => None,
let result = match variant {
0 => EncodableOptionString(None),
1 => {
let cow = Cow::<'static, str>::decode(decoder)?;
let es = EncodableString::borrow_decode(decoder)?;

Some(cow)
EncodableOptionString(Some(es.0))
}
_ => panic!("Unsupported variant."),
};

Ok(EncodableOptionString(cow))
Ok(result)
}
}

Expand Down Expand Up @@ -333,6 +340,19 @@ pub fn pad_string_alignment(string: impl AsRef<str>) -> Vec<u8> {
string.as_ref().as_bytes().iter().chain(std::iter::repeat(&0u8).take(padding)).copied().collect::<Vec<u8>>()
}

/// Strips the trailing null padding from an alignment-padded, encoded string.
///
/// The input is cut at the first `0` byte (or kept whole when it contains none), and the
/// remaining prefix is validated as UTF-8 and returned as a borrowed `&str`.
///
/// # Errors
///
/// Returns [`DecodeError::Utf8`] when the bytes before the first null byte are not valid UTF-8.
#[cfg(feature = "self-contained")]
pub fn unpad_string_alignment(data: &[u8]) -> Result<&str, DecodeError> {
    // Everything from the first null byte onward is treated as padding.
    let unpadded = match data.iter().position(|&byte| byte == 0) {
        Some(end) => &data[..end],
        None => data,
    };

    str::from_utf8(unpadded).map_err(|inner| DecodeError::Utf8 { inner })
}

/// Simplifies a [`Geometry`] using the [Visvalingam-Whyatt algorithm](https://bost.ocks.org/mike/simplify/).
///
/// For geometries that cannot be simplified, the original geometry is returned.
Expand Down Expand Up @@ -409,13 +429,13 @@ where

let cache = get_lookup_from_geometries(&timezones);

std::fs::write(bincode_destination, bincode::encode_to_vec(cache, get_global_bincode_config()).unwrap()).unwrap();
bincode::encode_into_std_write(cache, &mut std::fs::File::create(bincode_destination).unwrap(), get_global_bincode_config()).unwrap();
}

/// Get the concrete timezones from features.
pub fn get_items_from_features<T>(features: FeatureCollection) -> ConcreteVec<T>
where
T: HasGeometry + From<IdFeaturePair>,
T: HasGeometry + From<IdFeaturePair> + Send,
{
ConcreteVec::from(features)
}
Expand All @@ -424,11 +444,10 @@ where
#[cfg(feature = "self-contained")]
fn generate_item_bincode<T>(geojson_features: FeatureCollection, bincode_destination: impl AsRef<Path>)
where
T: HasGeometry + Encode + From<IdFeaturePair> + 'static,
T: HasGeometry + Encode + From<IdFeaturePair> + Send + 'static,
{
let items: ConcreteVec<T> = get_items_from_features(geojson_features);

std::fs::write(bincode_destination, bincode::encode_to_vec(items, get_global_bincode_config()).unwrap()).unwrap();
bincode::encode_into_std_write(items, &mut std::fs::File::create(bincode_destination).unwrap(), get_global_bincode_config()).unwrap();
}

/// Get the GeoJSON features from the binary assets.
Expand Down
6 changes: 3 additions & 3 deletions rtz-core/src/geo/tz/ned.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use bincode::{

use crate::{
base::types::Float,
geo::shared::{get_geojson_features_from_string, simplify_geometry, CanGetGeoJsonFeaturesFromSource, EncodableGeometry, EncodableOptionString, EncodableString, HasGeometry, HasProperties},
geo::shared::{get_geojson_features_from_string, simplify_geometry, CanGetGeoJsonFeaturesFromSource, EncodableGeometry, EncodableOptionString, EncodableString, HasGeometry, HasProperties, IdFeaturePair},
};

use super::shared::IsTimezone;
Expand Down Expand Up @@ -149,8 +149,8 @@ impl PartialEq for NedTimezone {
}
}

impl From<(usize, geojson::Feature)> for NedTimezone {
fn from(value: (usize, geojson::Feature)) -> NedTimezone {
impl From<IdFeaturePair> for NedTimezone {
fn from(value: IdFeaturePair) -> NedTimezone {
let id = value.0;
let properties = value.1.properties.as_ref().unwrap();
let geometry = value.1.geometry.as_ref().unwrap();
Expand Down
6 changes: 3 additions & 3 deletions rtz-core/src/geo/tz/osm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use bincode::{

use crate::{
base::types::Float,
geo::shared::{get_geojson_features_from_string, simplify_geometry, CanGetGeoJsonFeaturesFromSource, EncodableGeometry, EncodableString, HasGeometry, HasProperties},
geo::shared::{get_geojson_features_from_string, simplify_geometry, CanGetGeoJsonFeaturesFromSource, EncodableGeometry, EncodableString, HasGeometry, HasProperties, IdFeaturePair},
};

use super::shared::IsTimezone;
Expand Down Expand Up @@ -109,8 +109,8 @@ impl PartialEq for OsmTimezone {
}
}

impl From<(usize, geojson::Feature)> for OsmTimezone {
fn from(value: (usize, geojson::Feature)) -> OsmTimezone {
impl From<IdFeaturePair> for OsmTimezone {
fn from(value: IdFeaturePair) -> OsmTimezone {
let id = value.0;
let properties = value.1.properties.as_ref().unwrap();
let geometry = value.1.geometry.as_ref().unwrap();
Expand Down
2 changes: 1 addition & 1 deletion rtz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ required-features = ["cli"]
default = ["cli"]
full = ["tz-ned", "tz-osm", "admin-osm", "self-contained", "rtz-core/full", "rtz-build/full"]

double-precision = ["rtz-core/double-precision"]
double-precision = ["rtz-core/double-precision", "rtz-build/double-precision"]
unsimplified = ["rtz-core/unsimplified"]
extrasimplified = ["rtz-core/extrasimplified"]
self-contained = ["rtz-core/self-contained", "rtz-build/self-contained", "include_bytes_aligned"]
Expand Down
8 changes: 4 additions & 4 deletions rtz/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ fn main() {

// Set special host configs.
if cfg!(windows) {
println!("cargo:rustc-cfg=host_family_windows");
println!("cargo::rustc-cfg=host_family_windows");
}
if cfg!(unix) {
println!("cargo:rustc-cfg=host_family_unix");
println!("cargo::rustc-cfg=host_family_unix");
}
if cfg!(wasm) {
println!("cargo:rustc-cfg=host_family_wasm");
println!("cargo:rustc-cfg=wasm");
println!("cargo::rustc-cfg=host_family_wasm");
println!("cargo::rustc-cfg=wasm");
}

// Do not run the build script if the target is wasm.
Expand Down
1 change: 1 addition & 0 deletions rtz/src/geo/tz/osm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ mod tests {
#[test]
fn can_verify_lookup_assisted_accuracy() {
let x = rand::random::<Float>() * 360.0 - 180.0;

(0..100).into_par_iter().for_each(|_| {
let y = rand::random::<Float>() * 180.0 - 90.0;
let full = OsmTimezone::lookup_slow(x, y);
Expand Down
3 changes: 2 additions & 1 deletion rtz/src/web/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,9 @@ mod tests {
assert_eq!(response.status(), StatusCode::OK);

let body = response.into_body().collect().await.unwrap_or_default().to_bytes();
let expected = r#"[{"id":216,"name":"مصر","level":2},{"id":2982,"name":"مطروح","level":4}]"#;
let expected = r#"[{"id":217,"name":"مصر","level":2},{"id":3007,"name":"مطروح","level":4}]"#;

assert_eq!(body, expected);
}
}

Binary file added static/bench_cities_osm.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 9542412

Please sign in to comment.