Skip to content

Commit

Permalink
improve docs, rework exports (#2220)
Browse files Browse the repository at this point in the history
* rework exports

move snippet and advice
make indexer pub, remove indexer reexports

* add deprecation warning

* add architecture overview
  • Loading branch information
PSeitz authored Oct 18, 2023
1 parent 7e1980b commit c2b0469
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 34 deletions.
3 changes: 2 additions & 1 deletion examples/snippet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, Snippet, SnippetGenerator};
use tantivy::snippet::{Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter};
use tempfile::TempDir;

fn main() -> tantivy::Result<()> {
Expand Down
2 changes: 1 addition & 1 deletion src/core/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_L
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::indexer::IndexWriter;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;

fn load_metas(
directory: &dyn Directory,
Expand Down
4 changes: 2 additions & 2 deletions src/directory/mmap_directory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ use std::sync::{Arc, RwLock, Weak};

use common::StableDeref;
use fs4::FileExt;
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use tempfile::TempDir;
Expand All @@ -21,8 +23,6 @@ use crate::directory::{
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
WatchCallback, WatchHandle, WritePtr,
};
#[cfg(unix)]
use crate::Advice;

pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
Expand Down
3 changes: 3 additions & 0 deletions src/indexer/merge_operation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,13 @@ impl MergeOperation {
}
}

/// Returns the opstamp up to which the delete queue should be consumed,
/// so that the merged segment reflects those deletes.
pub fn target_opstamp(&self) -> Opstamp {
self.inner.target_opstamp
}

/// Returns the list of segments to be merged.
pub fn segment_ids(&self) -> &[SegmentId] {
&self.inner.segment_ids[..]
}
Expand Down
32 changes: 19 additions & 13 deletions src/indexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
pub mod delete_queue;
//! Indexing and merging data.
//!
//! Contains code to create and merge segments.
//! `IndexWriter` is the main entry point for that, and is created from
//! [`Index::writer`](crate::Index::writer).

pub mod doc_id_mapping;
pub(crate) mod delete_queue;

pub(crate) mod doc_id_mapping;
mod doc_opstamp_mapping;
mod flat_map_with_buffer;
pub mod index_writer;
mod index_writer_status;
pub(crate) mod index_writer;
pub(crate) mod index_writer_status;
mod log_merge_policy;
mod merge_operation;
pub mod merge_policy;
pub mod merger;
pub(crate) mod merge_policy;
pub(crate) mod merger;
mod merger_sorted_index_test;
pub mod operation;
pub mod prepared_commit;
pub(crate) mod operation;
pub(crate) mod prepared_commit;
mod segment_entry;
mod segment_manager;
mod segment_register;
pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
pub(crate) mod segment_serializer;
pub(crate) mod segment_updater;
pub(crate) mod segment_writer;
mod stamper;

use crossbeam_channel as channel;
Expand All @@ -27,10 +33,10 @@ pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::operation::UserOperation;
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub(crate) use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::{merge_filtered_segments, merge_indices};
pub use self::segment_writer::SegmentWriter;
use crate::indexer::operation::AddOperation;
Expand Down
2 changes: 2 additions & 0 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ impl SegmentWriter {
Ok(doc_opstamps)
}

/// Returns an estimation of the current memory usage of the segment writer.
/// If the mem usage exceeds the `memory_budget`, the segment should be serialized.
pub fn mem_usage(&self) -> usize {
self.ctx.mem_usage()
+ self.fieldnorms_writer.mem_usage()
Expand Down
64 changes: 55 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,48 @@
//! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/quickwit-oss/tantivy/blob/main/examples/basic_search.rs))

//!
//! # Tantivy Architecture Overview
//!
//! Tantivy is inspired by Lucene; its architecture is very similar.
//!
//! ## Core Concepts
//!
//! - **[Index]**: A collection of segments. The top level entry point for tantivy users to search
//! and index data.
//!
//! - **[Segment]**: At the heart of Tantivy's indexing structure is the [Segment]. It contains
//! documents and indices and is the atomic unit of indexing and search.
//!
//! - **[Schema](schema)**: A schema is a set of fields in an index. Each field has a specific data
//! type and set of attributes.
//!
//! - **[IndexWriter]**: Responsible for creating and merging segments. It executes the indexing
//! pipeline including tokenization, creating indices, and storing the index in the
//! [Directory](directory).
//!
//! - **Searching**: [Searcher] searches the segments with anything that implements
//! [Query](query::Query) and merges the results. See the list of [supported
//! queries](query::Query#implementors). Custom queries are supported by implementing the
//! [Query](query::Query) trait.
//!
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
//!
//! - **[Tokenizer](tokenizer)**: Breaks down text into individual tokens. Users can implement or
//! use provided tokenizers.
//!
//! ## Architecture Flow
//!
//! 1. **Document Addition**: Users create documents according to the defined schema. The document's
//! fields are tokenized, processed, and added to the current segment. See
//! [Document](schema::document) for the structure and usage.
//!
//! 2. **Segment Creation**: Once the memory limit threshold is reached or a commit is called, the
//! segment is written to the Directory. Documents are searchable after `commit`.
//!
//! 3. **Merging**: To optimize space and search speed, segments might be merged. This operation is
//! performed in the background. Customize the merge behaviour via
//! [IndexWriter::set_merge_policy].
#[cfg_attr(test, macro_use)]
extern crate serde_json;
#[macro_use]
Expand Down Expand Up @@ -137,7 +178,7 @@ pub use crate::future_result::FutureResult;
pub type Result<T> = std::result::Result<T, TantivyError>;

mod core;
mod indexer;
pub mod indexer;

#[allow(unused_doc_comments)]
pub mod error;
Expand All @@ -161,8 +202,7 @@ pub mod termdict;
mod reader;

pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
pub mod snippet;

mod docset;
use std::fmt;
Expand All @@ -173,6 +213,11 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};

pub use self::docset::{DocSet, TERMINATED};
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
)]
pub use self::snippet::{Snippet, SnippetGenerator};
#[doc(hidden)]
pub use crate::core::json_utils;
pub use crate::core::{
Expand All @@ -181,8 +226,12 @@ pub use crate::core::{
SegmentReader, SingleSegmentIndexWriter,
};
pub use crate::directory::Directory;
pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit};
pub use crate::indexer::IndexWriter;
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from indexer module instead"
)]
pub use crate::indexer::{merge_filtered_segments, merge_indices, PreparedCommit};
pub use crate::postings::Postings;
#[allow(deprecated)]
pub use crate::schema::DatePrecision;
Expand All @@ -191,9 +240,6 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocumen
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;

#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;

/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Version {
Expand Down
65 changes: 61 additions & 4 deletions src/snippet/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,59 @@
//! [`SnippetGenerator`]
//! Generates a text snippet for a given document, and some highlighted parts inside it.
//! Imagine you are doing a text search in a document
//! and want to show a preview of where in the document the search terms occur,
//! along with some surrounding text to give context, and the search terms highlighted.
//!
//! [`SnippetGenerator`] serves this purpose.
//! It scans a document and constructs a snippet, which consists of sections where the search terms
//! have been found, stitched together with "..." in between sections if necessary.
//!
//! ## Example
//!
//! ```rust
//! # use tantivy::query::QueryParser;
//! # use tantivy::schema::{Schema, TEXT};
//! # use tantivy::{doc, Index};
//! use tantivy::snippet::SnippetGenerator;
//!
//! # fn main() -> tantivy::Result<()> {
//! # let mut schema_builder = Schema::builder();
//! # let text_field = schema_builder.add_text_field("text", TEXT);
//! # let schema = schema_builder.build();
//! # let index = Index::create_in_ram(schema);
//! # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
//! # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
//! # Je ne me sentis plus guidé par les haleurs :
//! # Des Peaux-Rouges criards les avaient pris pour cibles,
//! # Les ayant cloués nus aux poteaux de couleurs.
//! #
//! # J'étais insoucieux de tous les équipages,
//! # Porteur de blés flamands ou de cotons anglais.
//! # Quand avec mes haleurs ont fini ces tapages,
//! # Les Fleuves m'ont laissé descendre où je voulais.
//! # "#);
//! # index_writer.add_document(doc.clone())?;
//! # index_writer.commit()?;
//! # let query_parser = QueryParser::for_index(&index, vec![text_field]);
//! // ...
//! let query = query_parser.parse_query("haleurs flamands").unwrap();
//! # let reader = index.reader()?;
//! # let searcher = reader.searcher();
//! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
//! snippet_generator.set_max_num_chars(100);
//! let snippet = snippet_generator.snippet_from_doc(&doc);
//! let snippet_html: String = snippet.to_html();
//! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
//! # Ok(())
//! # }
//! ```
//!
//! You can also specify the maximum number of characters for the snippets generated with the
//! `set_max_num_chars` method. By default, this limit is set to 150.
//!
//! SnippetGenerator needs to be created from the `Searcher` and the query, and the field on which
//! the `SnippetGenerator` should generate the snippets.

use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet};
use std::ops::Range;
Expand All @@ -16,7 +72,7 @@ const DEFAULT_SNIPPET_PREFIX: &str = "<b>";
const DEFAULT_SNIPPET_POSTFIX: &str = "</b>";

#[derive(Debug)]
pub struct FragmentCandidate {
pub(crate) struct FragmentCandidate {
score: Score,
start_offset: usize,
stop_offset: usize,
Expand Down Expand Up @@ -256,7 +312,7 @@ fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
/// # use tantivy::query::QueryParser;
/// # use tantivy::schema::{Schema, TEXT};
/// # use tantivy::{doc, Index};
/// use tantivy::SnippetGenerator;
/// use tantivy::snippet::SnippetGenerator;
///
/// # fn main() -> tantivy::Result<()> {
/// # let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -346,7 +402,7 @@ impl SnippetGenerator {
})
}

/// Sets a maximum number of chars.
/// Sets a maximum number of chars. Default is 150.
pub fn set_max_num_chars(&mut self, max_num_chars: usize) {
self.max_num_chars = max_num_chars;
}
Expand Down Expand Up @@ -398,8 +454,9 @@ mod tests {
use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination};
use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::snippet::SnippetGenerator;
use crate::tokenizer::{NgramTokenizer, SimpleTokenizer};
use crate::{Index, SnippetGenerator};
use crate::Index;

const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by
Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and
Expand Down
5 changes: 2 additions & 3 deletions src/termdict/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//! The term dictionary main role is to associate the sorted [`Term`s](crate::Term) to
//! a [`TermInfo`](crate::postings::TermInfo) struct that contains some meta-information
//! a [`TermInfo`] struct that contains some meta-information
//! about the term.
//!
//! Internally, the term dictionary relies on the `fst` crate to store
Expand All @@ -16,8 +16,7 @@
//! `f64`-terms are transformed to `u64` using a mapping that preserve order, and are then treated
//! as `u64`.
//!
//! A second datastructure makes it possible to access a
//! [`TermInfo`](crate::postings::TermInfo).
//! A second datastructure makes it possible to access a [`TermInfo`].

#[cfg(not(feature = "quickwit"))]
mod fst_termdict;
Expand Down
2 changes: 1 addition & 1 deletion tokenizer-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//! ready for indexing. This is a separate crate from tantivy, so implementors don't need to update
//! for each new tantivy version.
//!
//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
//! To add support for a tokenizer, implement the [`Tokenizer`] trait.
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.

use std::borrow::{Borrow, BorrowMut};
Expand Down

0 comments on commit c2b0469

Please sign in to comment.