Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for str fast field range query #2453

Merged
merged 4 commits into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/query/range_query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => true,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
Type::Facet | Type::Bytes | Type::Json => false,
}
}

Expand Down
172 changes: 134 additions & 38 deletions src/query/range_query/range_query_u64_fastfield.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};

use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn};
use common::BinarySerializable;

use super::fast_field_range_doc_set::RangeDocSet;
Expand Down Expand Up @@ -51,16 +51,22 @@ impl Weight for FastFieldRangeWeight {
}
let field_name = reader.schema().get_field_name(self.field);
let field_type = reader.schema().get_field_entry(self.field).field_type();

let term = inner_bound(&self.lower_bound)
.or(inner_bound(&self.upper_bound))
.expect("At least one bound must be set");
assert_eq!(
term.typ(),
field_type.value_type(),
"Field is of type {:?}, but got term of type {:?}",
field_type,
term.typ()
);
if field_type.is_ip_addr() {
let parse_ip_from_bytes = |term: &Term| {
let ip_u128_bytes: [u8; 16] =
term.serialized_value_bytes().try_into().map_err(|_| {
crate::TantivyError::InvalidArgument(
"Expected 8 bytes for ip address".to_string(),
)
})?;
let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
term.value().as_ip_addr().ok_or_else(|| {
crate::TantivyError::InvalidArgument("Expected ip address".to_string())
})
};
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
Expand All @@ -79,33 +85,42 @@ impl Weight for FastFieldRangeWeight {
let docset = RangeDocSet::new(value_range, ip_addr_column);
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
assert!(
maps_to_u64_fastfield(field_type.value_type()),
"{:?}",
field_type
);

let term = inner_bound(&self.lower_bound)
.or(inner_bound(&self.upper_bound))
.expect("At least one bound must be set");
assert_eq!(
term.typ(),
field_type.value_type(),
"Field is of type {:?}, but got term of type {:?}",
field_type,
term.typ()
);
let (lower_bound, upper_bound) = if field_type.is_str() {
let Some(str_dict_column): Option<StrColumn> =
reader.fast_fields().str(field_name)?
else {
return Ok(Box::new(EmptyScorer));
};
let dict = str_dict_column.dictionary();

let lower_bound = map_bound(&self.lower_bound, |term| {
term.serialized_value_bytes().to_vec()
PSeitz marked this conversation as resolved.
Show resolved Hide resolved
});
let upper_bound = map_bound(&self.upper_bound, |term| {
term.serialized_value_bytes().to_vec()
PSeitz marked this conversation as resolved.
Show resolved Hide resolved
});
// Get term ids for terms
let (lower_bound, upper_bound) =
dict.term_bounds_to_ord(lower_bound, upper_bound)?;
(lower_bound, upper_bound)
} else {
assert!(
maps_to_u64_fastfield(field_type.value_type()),
"{:?}",
field_type
);
let parse_from_bytes = |term: &Term| {
u64::from_be(
BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..])
.unwrap(),
)
};

let parse_from_bytes = |term: &Term| {
u64::from_be(
BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..])
.unwrap(),
)
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
(lower_bound, upper_bound)
};

let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);

let fast_field_reader = reader.fast_fields();
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(None, field_name)?
else {
Expand Down Expand Up @@ -202,12 +217,73 @@ pub mod tests {
use rand::seq::SliceRandom;
use rand::SeedableRng;

use crate::collector::Count;
use crate::collector::{Count, TopDocs};
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::query::{QueryParser, Weight};
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
use crate::schema::{
NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::{Index, IndexWriter, Term, TERMINATED};

#[test]
fn test_text_field_ff_range_query() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_for_tests()?;
let title = schema.get_field("title").unwrap();
index_writer.add_document(doc!(
title => "bbb"
))?;
index_writer.add_document(doc!(
title => "ddd"
))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title]);

let test_query = |query, num_hits| {
let query = query_parser.parse_query(query).unwrap();
let top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap();
assert_eq!(top_docs.len(), num_hits);
};

test_query("title:[aaa TO ccc]", 1);
test_query("title:[aaa TO bbb]", 1);
test_query("title:[bbb TO bbb]", 1);
test_query("title:[bbb TO ddd]", 2);
test_query("title:[bbb TO eee]", 2);
test_query("title:[bb TO eee]", 2);
test_query("title:[ccc TO ccc]", 0);
test_query("title:[ccc TO ddd]", 1);
test_query("title:[ccc TO eee]", 1);
PSeitz marked this conversation as resolved.
Show resolved Hide resolved

test_query("title:[aaa TO *}", 2);
test_query("title:[bbb TO *]", 2);
test_query("title:[bb TO *]", 2);
test_query("title:[ccc TO *]", 1);
test_query("title:[ddd TO *]", 1);
test_query("title:[dddd TO *]", 0);

test_query("title:{aaa TO *}", 2);
test_query("title:{bbb TO *]", 1);
test_query("title:{bb TO *]", 2);
test_query("title:{ccc TO *]", 1);
test_query("title:{ddd TO *]", 0);
test_query("title:{dddd TO *]", 0);

test_query("title:[* TO bb]", 0);
test_query("title:[* TO bbb]", 1);
test_query("title:[* TO ccc]", 1);
test_query("title:[* TO ddd]", 2);
test_query("title:[* TO ddd}", 1);
test_query("title:[* TO eee]", 2);

Ok(())
}

#[derive(Clone, Debug)]
pub struct Doc {
pub id_name: String,
Expand All @@ -224,14 +300,14 @@ pub mod tests {
fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
id_name: id.to_string(),
id_name: format!("id_name{:010}", id),
id,
}
}
fn doc_from_id_2(id: u64) -> Doc {
let id = id * 1000;
Doc {
id_name: (id - 1).to_string(),
id_name: format!("id_name{:010}", id - 1),
id,
}
}
Expand Down Expand Up @@ -319,7 +395,8 @@ pub mod tests {
NumericOptions::default().set_fast().set_indexed(),
);

let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
let text_field = schema_builder.add_text_field("id_name", STRING | STORED | FAST);
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

Expand All @@ -338,6 +415,7 @@ pub mod tests {
id_f64_field => doc.id as f64,
id_i64_field => doc.id as i64,
text_field => doc.id_name.to_string(),
text_field2 => doc.id_name.to_string(),
))
.unwrap();
}
Expand Down Expand Up @@ -382,6 +460,24 @@ pub mod tests {
let query = gen_query_inclusive("ids", ids[0]..=ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);

// Text query
{
let test_text_query = |field_name: &str| {
let mut id_names: Vec<&str> =
sample_docs.iter().map(|doc| doc.id_name.as_str()).collect();
id_names.sort();
let expected_num_hits = docs
.iter()
.filter(|doc| (id_names[0]..=id_names[1]).contains(&doc.id_name.as_str()))
.count();
let query = format!("{}:[{} TO {}]", field_name, id_names[0], id_names[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};

test_text_query("id_name");
test_text_query("id_name_fast");
}

// Exclusive range
let expected_num_hits = docs
.iter()
Expand Down
5 changes: 5 additions & 0 deletions src/schema/field_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,11 @@ impl FieldType {
matches!(self, FieldType::IpAddr(_))
}

/// returns true if this is an str field
pub fn is_str(&self) -> bool {
matches!(self, FieldType::Str(_))
}

/// returns true if this is an date field
pub fn is_date(&self) -> bool {
matches!(self, FieldType::Date(_))
Expand Down
Loading
Loading