Change TokenFilter trait to simplify (a bit) the boxing of filters #2101

Closed (wants to merge 8 commits)
Changes from 7 commits
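The `TokenFilter` trait itself lives in the `tokenizer-api` crate and its definition is not part of the hunks shown below. For orientation, the following is only a sketch of the before/after shape implied by the filter implementations in this diff; supertraits and exact bounds are omitted because they are not visible here, and the import path assumes the published tantivy crate.

```rust
use tantivy::tokenizer::{TokenStream, Tokenizer};

// Before this PR: a filter wraps an entire Tokenizer, so every filter needs an
// extra wrapper type that itself implements Tokenizer.
pub trait TokenFilterBefore {
    type Tokenizer<T: Tokenizer>: Tokenizer;
    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
}

// After this PR: a filter only wraps a TokenStream. The wrapper types go away
// and a boxed, dynamically composed filter pipeline becomes easier to build.
pub trait TokenFilterAfter {
    type OutputTokenStream<T: TokenStream>: TokenStream;
    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T>;
}
```

The changes in src/tokenizer/*.rs below are mostly mechanical consequences of this shape change: each `FooFilterWrapper` tokenizer type disappears and the filter builds its token stream directly from the incoming stream.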
1 change: 1 addition & 0 deletions Cargo.toml
@@ -19,6 +19,7 @@ oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
dyn-clone = "1.0.11"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
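The new dyn-clone dependency is not explained by the hunks shown here; a plausible reading (an assumption, not confirmed by this diff) is that the boxed filter trait object must stay cloneable, which is the pattern dyn-clone exists for. A minimal sketch of that pattern, with `BoxableTokenFilter` as a placeholder name rather than a type from this PR:

```rust
use dyn_clone::DynClone;

// An object-safe trait whose boxed trait objects are cloneable.
// `clone_trait_object!` generates `impl Clone for Box<dyn BoxableTokenFilter>`.
pub trait BoxableTokenFilter: DynClone + Send + Sync {
    // boxed-stream filtering methods would go here
}

dyn_clone::clone_trait_object!(BoxableTokenFilter);
```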
25 changes: 23 additions & 2 deletions benches/analyzer.rs
@@ -1,5 +1,7 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
use tantivy::tokenizer::{
BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};

const ALICE_TXT: &str = include_str!("alice.txt");

@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
assert_eq!(word_count, 30_731);
})
});
let token_filters = vec![
BoxTokenFilter::from(RemoveLongFilter::limit(40)),
BoxTokenFilter::from(LowerCaser),
];
let mut dynamic_analyzer = TextAnalyzer::new(SimpleTokenizer::default(), token_filters);
c.bench_function("default-dynamic-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
while token_stream.advance() {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}

criterion_group!(benches, criterion_benchmark);
criterion_group! {
name = benches;
config = Criterion::default().sample_size(200);
targets = criterion_benchmark
}
criterion_main!(benches);
2 changes: 1 addition & 1 deletion src/indexer/segment_writer.rs
@@ -209,7 +209,7 @@ impl SegmentWriter {
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
PreTokenizedStream::from(tok_str.clone()).into()
Box::new(PreTokenizedStream::from(tok_str.clone()))
}
Value::Str(ref text) => {
let text_analyzer =
7 changes: 2 additions & 5 deletions src/query/more_like_this/more_like_this.rs
@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::{DocAddress, Result, Searcher, TantivyError};

#[derive(Debug, PartialEq)]
@@ -206,8 +204,7 @@ impl MoreLikeThis {
for value in values {
match value {
Value::PreTokStr(tok_str) => {
let mut token_stream: BoxTokenStream =
PreTokenizedStream::from(tok_str.clone()).into();
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
21 changes: 4 additions & 17 deletions src/tokenizer/alphanum_only.rs
@@ -21,7 +21,7 @@
//! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.
@@ -39,23 +39,10 @@ impl<T> AlphaNumOnlyFilterStream<T> {
}

impl TokenFilter for AlphaNumOnlyFilter {
type Tokenizer<T: Tokenizer> = AlphaNumOnlyFilterWrapper<T>;
type OutputTokenStream<T: TokenStream> = AlphaNumOnlyFilterStream<T>;

fn transform<T: Tokenizer>(self, tokenizer: T) -> AlphaNumOnlyFilterWrapper<T> {
AlphaNumOnlyFilterWrapper(tokenizer)
}
}

#[derive(Clone)]
pub struct AlphaNumOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
AlphaNumOnlyFilterStream {
tail: self.0.token_stream(text),
}
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
AlphaNumOnlyFilterStream { tail: token_stream }
}
}

36 changes: 9 additions & 27 deletions src/tokenizer/ascii_folding_filter.rs
@@ -1,6 +1,6 @@
use std::mem;

use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
@@ -9,48 +9,30 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
pub struct AsciiFoldingFilter;

impl TokenFilter for AsciiFoldingFilter {
type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
type OutputTokenStream<T: TokenStream> = AsciiFoldingFilterTokenStream<T>;

fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
AsciiFoldingFilterWrapper {
tokenizer,
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
AsciiFoldingFilterTokenStream {
buffer: String::new(),
tail: token_stream,
}
}
}

#[derive(Clone)]
pub struct AsciiFoldingFilterWrapper<T> {
tokenizer: T,
pub struct AsciiFoldingFilterTokenStream<T> {
buffer: String,
}

impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.buffer.clear();
AsciiFoldingFilterTokenStream {
buffer: &mut self.buffer,
tail: self.tokenizer.token_stream(text),
}
}
}

pub struct AsciiFoldingFilterTokenStream<'a, T> {
buffer: &'a mut String,
tail: T,
}

impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
Collaborator: clippy...
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
if !self.token_mut().text.is_ascii() {
// ignore its already ascii
to_ascii(&self.tail.token().text, self.buffer);
mem::swap(&mut self.tail.token_mut().text, self.buffer);
to_ascii(&self.tail.token().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
}
36 changes: 9 additions & 27 deletions src/tokenizer/lower_caser.rs
@@ -1,42 +1,24 @@
use std::mem;

use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// Token filter that lowercase terms.
#[derive(Clone)]
pub struct LowerCaser;

impl TokenFilter for LowerCaser {
type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
type OutputTokenStream<T: TokenStream> = LowerCaserTokenStream<T>;

fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
LowerCaserFilter {
tokenizer,
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
LowerCaserTokenStream {
tail: token_stream,
buffer: String::new(),
Collaborator: you are reverting here the work done by Pascal to reduce the number of allocations we have. Is this necessary?

Collaborator: @PSeitz can you confirm?

Contributor (author): Yes indeed :/ On each tokenizer.token_stream, it will call filter and thus will do a String::new(). That was not the case before.

Contributor: I'd prefer if we can have an instance with a buffer in the pipeline like before.

Contributor (author, @fmassot, Jun 29, 2023): Yes, of course. I should have fixed it by adding the allocation at the TokenFilter level.

(A sketch of that buffer-owning alternative follows this file's diff.)
}
}
}

#[derive(Clone)]
pub struct LowerCaserFilter<T> {
tokenizer: T,
pub struct LowerCaserTokenStream<T> {
buffer: String,
}

impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.buffer.clear();
LowerCaserTokenStream {
tail: self.tokenizer.token_stream(text),
buffer: &mut self.buffer,
}
}
}

pub struct LowerCaserTokenStream<'a, T> {
buffer: &'a mut String,
tail: T,
}

@@ -51,7 +33,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
}
}

impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
@@ -60,8 +42,8 @@ impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&self.tail.token().text, self.buffer);
mem::swap(&mut self.tail.token_mut().text, self.buffer);
to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
}
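As a follow-up to the allocation discussion above, here is a minimal sketch of the buffer-owning alternative the author describes: the filter keeps the String and lends it to each stream it builds. It assumes a hypothetical `filter(&mut self, ...)` signature (the PR's `TokenFilter::filter` takes `&self`), and the `*Sketch` names are placeholders rather than code from this PR. Note that it reintroduces the lifetime on the token stream that this diff removes, which is exactly the trade-off under discussion.

```rust
use tantivy::tokenizer::{Token, TokenStream};

// The filter owns the buffer, so the allocation happens once per filter
// instance instead of once per token_stream() call.
pub struct LowerCaserSketch {
    buffer: String,
}

pub struct LowerCaserTokenStreamSketch<'a, T> {
    buffer: &'a mut String,
    tail: T,
}

impl LowerCaserSketch {
    // Hypothetical signature: needs `&mut self` so the buffer can be lent out.
    fn filter<'a, T: TokenStream>(
        &'a mut self,
        token_stream: T,
    ) -> LowerCaserTokenStreamSketch<'a, T> {
        self.buffer.clear(); // reuse the same allocation across calls
        LowerCaserTokenStreamSketch {
            buffer: &mut self.buffer,
            tail: token_stream,
        }
    }
}

impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStreamSketch<'a, T> {
    fn advance(&mut self) -> bool {
        if !self.tail.advance() {
            return false;
        }
        if self.tail.token().text.is_ascii() {
            // fast path: lowercase in place, no buffer needed
            self.tail.token_mut().text.make_ascii_lowercase();
        } else {
            // slow path: lowercase into the shared buffer, then swap it in
            self.buffer.clear();
            for c in self.tail.token().text.chars() {
                self.buffer.extend(c.to_lowercase());
            }
            std::mem::swap(&mut self.tail.token_mut().text, self.buffer);
        }
        true
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
```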
4 changes: 2 additions & 2 deletions src/tokenizer/mod.rs
@@ -139,7 +139,7 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;

pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
pub use tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};

pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;

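`BoxTokenFilter`, re-exported from `self::tokenizer` in the hunk above, is also not defined in the hunks shown. The snippet below is only an illustration of the general type-erasure technique that the `filter(&self, token_stream)` shape enables; every type in it is a stand-in defined locally, not tantivy's actual code, and the real implementation also has to handle streams that borrow the input text as well as cloning (see the dyn-clone note near the Cargo.toml hunk).

```rust
// Stand-in types: minimal versions of Token / TokenStream / TokenFilter so the
// sketch compiles on its own. They mirror, but are not, the tantivy traits.
pub struct Token {
    pub text: String,
}

pub trait TokenStream {
    fn advance(&mut self) -> bool;
    fn token(&self) -> &Token;
    fn token_mut(&mut self) -> &mut Token;
}

// A boxed stream forwards the API, so `Box<dyn TokenStream>` is itself a stream.
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
    fn advance(&mut self) -> bool {
        (**self).advance()
    }
    fn token(&self) -> &Token {
        (**self).token()
    }
    fn token_mut(&mut self) -> &mut Token {
        (**self).token_mut()
    }
}

pub trait TokenFilter {
    type OutputTokenStream<T: TokenStream>: TokenStream;
    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T>;
}

// Object-safe view of a filter: boxed stream in, boxed stream out.
pub trait DynTokenFilter {
    fn box_filter(&self, stream: Box<dyn TokenStream>) -> Box<dyn TokenStream>;
}

// Every statically typed filter gets the dynamic form for free, which is what
// makes a Vec of boxed filters (a dynamic pipeline) straightforward to drive.
impl<F: TokenFilter> DynTokenFilter for F
where
    F::OutputTokenStream<Box<dyn TokenStream>>: 'static,
{
    fn box_filter(&self, stream: Box<dyn TokenStream>) -> Box<dyn TokenStream> {
        Box::new(self.filter(stream))
    }
}
```

A dynamic analyzer can then hold a list of such boxed filters and fold each tokenizer stream through them in order, which matches how the `default-dynamic-tokenize-alice` benchmark above builds its analyzer.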
25 changes: 4 additions & 21 deletions src/tokenizer/remove_long.rs
@@ -12,7 +12,7 @@
//! assert_eq!(stream.next().unwrap().text, "nice");
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
@@ -38,29 +38,12 @@ impl<T> RemoveLongFilterStream<T> {
}

impl TokenFilter for RemoveLongFilter {
type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
type OutputTokenStream<T: TokenStream> = RemoveLongFilterStream<T>;

fn transform<T: Tokenizer>(self, tokenizer: T) -> RemoveLongFilterWrapper<T> {
RemoveLongFilterWrapper {
length_limit: self.length_limit,
inner: tokenizer,
}
}
}

#[derive(Clone)]
pub struct RemoveLongFilterWrapper<T: Tokenizer> {
length_limit: usize,
inner: T,
}

impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
RemoveLongFilterStream {
token_length_limit: self.length_limit,
tail: self.inner.token_stream(text),
tail: token_stream,
}
}
}
25 changes: 4 additions & 21 deletions src/tokenizer/split_compound_words.rs
@@ -1,6 +1,6 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};

use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
@@ -80,29 +80,12 @@ impl SplitCompoundWords {
}

impl TokenFilter for SplitCompoundWords {
type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
type OutputTokenStream<T: TokenStream> = SplitCompoundWordsTokenStream<T>;

fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
}
}
}

#[derive(Clone)]
pub struct SplitCompoundWordsFilter<T> {
dict: AhoCorasick,
inner: T,
}

impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
tail: token_stream,
cuts: Vec::new(),
parts: Vec::new(),
}
25 changes: 4 additions & 21 deletions src/tokenizer/stemmer.rs
@@ -4,7 +4,7 @@ use std::mem;
use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};

use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// Available stemmer languages.
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -81,29 +81,12 @@ impl Default for Stemmer {
}

impl TokenFilter for Stemmer {
type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
type OutputTokenStream<T: TokenStream> = StemmerTokenStream<T>;

fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
StemmerFilter {
stemmer_algorithm: self.stemmer_algorithm,
inner: tokenizer,
}
}
}

#[derive(Clone)]
pub struct StemmerFilter<T> {
stemmer_algorithm: Algorithm,
inner: T,
}

impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
StemmerTokenStream {
tail: self.inner.token_stream(text),
tail: token_stream,
stemmer,
buffer: String::new(),
}