json path writer (#2224)
* refactor logic to JsonPathWriter

* use in encode_column_name

* add inlines

* move unsafe block
PSeitz authored Oct 24, 2023
1 parent 0d45892 commit 07bf66a
Showing 4 changed files with 137 additions and 45 deletions.
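In short: the commit extracts the JSON-path flattening logic that was previously duplicated inline into a reusable `JsonPathWriter` in the `common` crate. A minimal usage sketch of the new type, modeled on the unit test included below (`\u{1}` is the `JSON_PATH_SEGMENT_SEP` byte; within the workspace the crate is imported as `common`):

```rust
use common::JsonPathWriter;

fn main() {
    let mut path = JsonPathWriter::default();
    path.push("root");
    path.set_expand_dots(true);
    path.push("k8s.node.id");
    // Dots in the pushed segment are replaced by the separator byte.
    assert_eq!(path.as_str(), "root\u{1}k8s\u{1}node\u{1}id");
    path.pop(); // back to "root"
    let flattened: String = path.into(); // consume the writer
    assert_eq!(flattened, "root");
}
```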
112 changes: 112 additions & 0 deletions common/src/json_path_writer.rs
@@ -0,0 +1,112 @@
use crate::replace_in_place;

/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };

/// Creates flattened JSON paths for tantivy.
#[derive(Clone, Debug, Default)]
pub struct JsonPathWriter {
path: String,
indices: Vec<usize>,
expand_dots: bool,
}

impl JsonPathWriter {
pub fn new() -> Self {
JsonPathWriter {
path: String::new(),
indices: Vec::new(),
expand_dots: false,
}
}

/// When expand_dots is enabled, a JSON object like
/// `{"k8s.node.id": 5}` is processed as if it were
/// `{"k8s": {"node": {"id": 5}}}`.
/// This option has the merit of allowing users to
/// write queries like `k8s.node.id:5`.
/// On the other hand, enabling this feature can lead to
/// ambiguity.
#[inline]
pub fn set_expand_dots(&mut self, expand_dots: bool) {
self.expand_dots = expand_dots;
}

/// Push a new segment to the path.
#[inline]
pub fn push(&mut self, segment: &str) {
let len_path = self.path.len();
self.indices.push(len_path);
if !self.path.is_empty() {
self.path.push_str(JSON_PATH_SEGMENT_SEP_STR);
}
self.path.push_str(segment);
if self.expand_dots {
// This might include the separation byte, which is ok because it is not a dot.
let appended_segment = &mut self.path[len_path..];
// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single-byte UTF-8 strings.
// By UTF-8 design, they cannot be part of another codepoint.
unsafe {
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment.as_bytes_mut())
};
}
}

/// Remove the last segment. Does nothing if the path is empty.
#[inline]
pub fn pop(&mut self) {
if let Some(last_idx) = self.indices.pop() {
self.path.truncate(last_idx);
}
}

/// Clear the path.
#[inline]
pub fn clear(&mut self) {
self.path.clear();
self.indices.clear();
}

/// Get the current path.
#[inline]
pub fn as_str(&self) -> &str {
&self.path
}
}

impl From<JsonPathWriter> for String {
#[inline]
fn from(value: JsonPathWriter) -> Self {
value.path
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn json_path_writer_test() {
let mut writer = JsonPathWriter::new();

writer.push("root");
assert_eq!(writer.as_str(), "root");

writer.push("child");
assert_eq!(writer.as_str(), "root\u{1}child");

writer.pop();
assert_eq!(writer.as_str(), "root");

writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s.node.id");

writer.set_expand_dots(true);
writer.pop();
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s\u{1}node\u{1}id");
}
}
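For context, `replace_in_place` is imported from the same crate but is not part of this diff. A plausible minimal sketch of the contract the code above relies on (the real implementation in `common` may differ):

```rust
/// Replaces every occurrence of `needle` with `replacement` in `bytes`.
///
/// When called on the bytes of a `str` (obtained via the unsafe
/// `str::as_bytes_mut`), both `needle` and `replacement` must be
/// single-byte UTF-8 values (< 0x80) so the buffer stays valid UTF-8.
/// That is why the caller above restricts it to b'.' and
/// JSON_PATH_SEGMENT_SEP (1u8).
fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
    for byte in bytes.iter_mut() {
        if *byte == needle {
            *byte = replacement;
        }
    }
}
```

Note that `replace_in_place` itself can be a safe function; the `unsafe` in `push` above is only needed for `as_bytes_mut`, which is presumably what the commit message's "move unsafe block" refers to.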
2 changes: 2 additions & 0 deletions common/src/lib.rs
@@ -9,6 +9,7 @@ mod byte_count
mod datetime;
pub mod file_slice;
mod group_by;
+mod json_path_writer;
mod serialize;
mod vint;
mod writer;
@@ -18,6 +19,7 @@ pub use byte_count::ByteCount;
pub use datetime::DatePrecision;
pub use datetime::{DateTime, DateTimePrecision};
pub use group_by::GroupByIteratorExtended;
+pub use json_path_writer::JsonPathWriter;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{
20 changes: 8 additions & 12 deletions src/core/json_utils.rs
@@ -1,12 +1,12 @@
use columnar::MonotonicallyMappableToU64;
-use common::replace_in_place;
+use common::{replace_in_place, JsonPathWriter};
use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap;

use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
-use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
+use crate::schema::term::JSON_PATH_SEGMENT_SEP;
use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
@@ -315,17 +315,13 @@ pub(crate) fn encode_column_name(
json_path: &str,
expand_dots_enabled: bool,
) -> String {
-let mut column_key: String = String::with_capacity(field_name.len() + json_path.len() + 1);
-column_key.push_str(field_name);
-for mut segment in split_json_path(json_path) {
-column_key.push_str(JSON_PATH_SEGMENT_SEP_STR);
-if expand_dots_enabled {
-// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
-unsafe { replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, segment.as_bytes_mut()) };
-}
-column_key.push_str(&segment);
+let mut path = JsonPathWriter::default();
+path.push(field_name);
+path.set_expand_dots(expand_dots_enabled);
+for segment in split_json_path(json_path) {
+path.push(&segment);
}
-column_key
+path.into()
}

impl<'a> JsonTermWriter<'a> {
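To illustrate the refactored `encode_column_name` above: the field name is pushed before `set_expand_dots` is switched on, so dots inside the field name itself are never expanded; only the JSON path segments are. A hedged sketch of the resulting column key (it assumes `split_json_path`, which is not shown in this diff, splits the path on unescaped dots; `\u{1}` is `JSON_PATH_SEGMENT_SEP`):

```rust
// Inside the tantivy crate (encode_column_name is pub(crate)):
let key = encode_column_name("attributes", "k8s.node.id", true);
assert_eq!(key, "attributes\u{1}k8s\u{1}node\u{1}id");
```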
48 changes: 15 additions & 33 deletions src/fastfield/writer.rs
@@ -1,12 +1,11 @@
use std::io;

use columnar::{ColumnarWriter, NumericalValue};
-use common::replace_in_place;
+use common::JsonPathWriter;
use tokenizer_api::Token;

use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
-use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTimePrecision, DocId, TantivyError};
@@ -24,7 +23,7 @@ pub struct FastFieldsWriter {
expand_dots: Vec<bool>,
num_docs: DocId,
// Buffer that we recycle to avoid allocation.
-json_path_buffer: String,
+json_path_buffer: JsonPathWriter,
}

impl FastFieldsWriter {
@@ -98,7 +97,7 @@ impl FastFieldsWriter {
num_docs: 0u32,
date_precisions,
expand_dots,
-json_path_buffer: String::new(),
+json_path_buffer: JsonPathWriter::default(),
})
}

@@ -212,14 +211,16 @@ impl FastFieldsWriter {
ReferenceValue::Object(val) => {
let expand_dots = self.expand_dots[field.field_id() as usize];
self.json_path_buffer.clear();
-self.json_path_buffer.push_str(field_name);
+// First field should not be expanded.
+self.json_path_buffer.set_expand_dots(false);
+self.json_path_buffer.push(field_name);
+self.json_path_buffer.set_expand_dots(expand_dots);

let text_analyzer = &mut self.per_field_tokenizer[field.field_id() as usize];

record_json_obj_to_columnar_writer::<V>(
doc_id,
val,
-expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
@@ -250,48 +251,30 @@
fn record_json_obj_to_columnar_writer<'a, V: Value<'a>>(
doc: DocId,
json_visitor: V::ObjectIter,
-expand_dots: bool,
remaining_depth_limit: usize,
-json_path_buffer: &mut String,
+json_path_buffer: &mut JsonPathWriter,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_visitor {
-let len_path = json_path_buffer.len();
-if !json_path_buffer.is_empty() {
-json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR);
-}
-json_path_buffer.push_str(key);
-if expand_dots {
-// This might include the separation byte, which is ok because it is not a dot.
-let appended_segment = &mut json_path_buffer[len_path..];
-// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
-// valid single byte ut8 strings.
-// By utf-8 design, they cannot be part of another codepoint.
-replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, unsafe {
-appended_segment.as_bytes_mut()
-});
-}
+json_path_buffer.push(key);
record_json_value_to_columnar_writer(
doc,
child,
-expand_dots,
remaining_depth_limit,
json_path_buffer,
columnar_writer,
tokenizer,
);
// popping our sub path.
-json_path_buffer.truncate(len_path);
+json_path_buffer.pop();
}
}

fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
doc: DocId,
json_val: V,
-expand_dots: bool,
mut remaining_depth_limit: usize,
-json_path_writer: &mut String,
+json_path_writer: &mut JsonPathWriter,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
) {
@@ -335,7 +318,7 @@
);
}
ReferenceValueLeaf::Bool(val) => {
-columnar_writer.record_bool(doc, json_path_writer, val);
+columnar_writer.record_bool(doc, json_path_writer.as_str(), val);
}
ReferenceValueLeaf::Date(val) => {
columnar_writer.record_datetime(doc, json_path_writer.as_str(), val);
@@ -362,7 +345,6 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
record_json_value_to_columnar_writer(
doc,
el,
-expand_dots,
remaining_depth_limit,
json_path_writer,
columnar_writer,
@@ -374,7 +356,6 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
record_json_obj_to_columnar_writer::<V>(
doc,
object,
-expand_dots,
remaining_depth_limit,
json_path_writer,
columnar_writer,
@@ -387,6 +368,7 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
#[cfg(test)]
mod tests {
use columnar::{Column, ColumnarReader, ColumnarWriter, StrColumn};
+use common::JsonPathWriter;

use super::record_json_value_to_columnar_writer;
use crate::fastfield::writer::JSON_DEPTH_LIMIT;
@@ -397,12 +379,12 @@ mod tests {
expand_dots: bool,
) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default();
-let mut json_path = String::new();
+let mut json_path = JsonPathWriter::default();
+json_path.set_expand_dots(expand_dots);
for (doc, json_doc) in json_docs.iter().enumerate() {
record_json_value_to_columnar_writer(
doc as u32,
json_doc,
-expand_dots,
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,
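The traversal in `record_json_obj_to_columnar_writer` now follows a push/recurse/pop discipline instead of manually truncating the string buffer, and `expand_dots` no longer has to be threaded through every call because the writer carries that flag itself. A standalone sketch of the pattern, using `serde_json` purely for illustration (the actual code walks tantivy's `Value` trait instead):

```rust
use common::JsonPathWriter;
use serde_json::{json, Value};

// Walk a JSON value depth-first, printing each flattened leaf path.
// Each object key is pushed before recursing and popped afterwards,
// so a single recycled JsonPathWriter serves the whole document.
fn walk(value: &Value, path: &mut JsonPathWriter) {
    match value {
        Value::Object(map) => {
            for (key, child) in map {
                path.push(key);
                walk(child, path);
                path.pop();
            }
        }
        // Arrays contribute no path segment, matching the code above.
        Value::Array(items) => items.iter().for_each(|el| walk(el, path)),
        leaf => println!("{} -> {}", path.as_str(), leaf),
    }
}

fn main() {
    let mut path = JsonPathWriter::default();
    walk(&json!({"k8s": {"node": {"id": 5}}}), &mut path);
    // Prints: k8s\u{1}node\u{1}id -> 5
}
```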
