Skip to content

Commit

Permalink
improve docs, rework exports (#2220)
Browse files Browse the repository at this point in the history
* rework exports

move snippet and advice
make indexer pub, remove indexer reexports

* add deprecation warning

* add architecture overview
  • Loading branch information
PSeitz authored Oct 18, 2023
1 parent 7e1980b commit c2b0469
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 34 deletions.
3 changes: 2 additions & 1 deletion examples/snippet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, Snippet, SnippetGenerator};
use tantivy::snippet::{Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter};
use tempfile::TempDir;

fn main() -> tantivy::Result<()> {
Expand Down
2 changes: 1 addition & 1 deletion src/core/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_L
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::indexer::IndexWriter;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;

fn load_metas(
directory: &dyn Directory,
Expand Down
4 changes: 2 additions & 2 deletions src/directory/mmap_directory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ use std::sync::{Arc, RwLock, Weak};

use common::StableDeref;
use fs4::FileExt;
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use tempfile::TempDir;
Expand All @@ -21,8 +23,6 @@ use crate::directory::{
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
WatchCallback, WatchHandle, WritePtr,
};
#[cfg(unix)]
use crate::Advice;

pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
Expand Down
3 changes: 3 additions & 0 deletions src/indexer/merge_operation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,13 @@ impl MergeOperation {
}
}

/// Returns the opstamp up to which the delete queue should be consumed,
/// so that the merged segment reflects those deletes.
pub fn target_opstamp(&self) -> Opstamp {
self.inner.target_opstamp
}

/// Returns the list of segments to be merged.
pub fn segment_ids(&self) -> &[SegmentId] {
&self.inner.segment_ids[..]
}
Expand Down
32 changes: 19 additions & 13 deletions src/indexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
pub mod delete_queue;
//! Indexing and merging data.
//!
//! Contains code to create and merge segments.
//! `IndexWriter` is the main entry point for that, and is created from
//! [`Index::writer`](crate::Index::writer).

pub mod doc_id_mapping;
pub(crate) mod delete_queue;

pub(crate) mod doc_id_mapping;
mod doc_opstamp_mapping;
mod flat_map_with_buffer;
pub mod index_writer;
mod index_writer_status;
pub(crate) mod index_writer;
pub(crate) mod index_writer_status;
mod log_merge_policy;
mod merge_operation;
pub mod merge_policy;
pub mod merger;
pub(crate) mod merge_policy;
pub(crate) mod merger;
mod merger_sorted_index_test;
pub mod operation;
pub mod prepared_commit;
pub(crate) mod operation;
pub(crate) mod prepared_commit;
mod segment_entry;
mod segment_manager;
mod segment_register;
pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
pub(crate) mod segment_serializer;
pub(crate) mod segment_updater;
pub(crate) mod segment_writer;
mod stamper;

use crossbeam_channel as channel;
Expand All @@ -27,10 +33,10 @@ pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::operation::UserOperation;
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub(crate) use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::{merge_filtered_segments, merge_indices};
pub use self::segment_writer::SegmentWriter;
use crate::indexer::operation::AddOperation;
Expand Down
2 changes: 2 additions & 0 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ impl SegmentWriter {
Ok(doc_opstamps)
}

/// Returns an estimation of the current memory usage of the segment writer.
/// If the mem usage exceeds the `memory_budget`, the segment should be serialized.
pub fn mem_usage(&self) -> usize {
self.ctx.mem_usage()
+ self.fieldnorms_writer.mem_usage()
Expand Down
64 changes: 55 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,48 @@
//! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/quickwit-oss/tantivy/blob/main/examples/basic_search.rs))

//!
//! # Tantivy Architecture Overview
//!
//! Tantivy is inspired by Lucene; its architecture is very similar.
//!
//! ## Core Concepts
//!
//! - **[Index]**: A collection of segments. The top level entry point for tantivy users to search
//! and index data.
//!
//! - **[Segment]**: At the heart of Tantivy's indexing structure is the [Segment]. It contains
//! documents and indices and is the atomic unit of indexing and search.
//!
//! - **[Schema](schema)**: A schema is a set of fields in an index. Each field has a specific data
//! type and set of attributes.
//!
//! - **[IndexWriter]**: Responsible for creating and merging segments. It executes the indexing
//! pipeline including tokenization, creating indices, and storing the index in the
//! [Directory](directory).
//!
//! - **Searching**: [Searcher] searches the segments with anything that implements
//! [Query](query::Query) and merges the results. See the list of [supported
//! queries](query::Query#implementors). Custom queries are supported by implementing the
//! [Query](query::Query) trait.
//!
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
//!
//! - **[Tokenizer](tokenizer)**: Breaks down text into individual tokens. Users can implement or
//! use provided tokenizers.
//!
//! ## Architecture Flow
//!
//! 1. **Document Addition**: Users create documents according to the defined schema. The document's
//! fields are tokenized, processed, and added to the current segment. See
//! [Document](schema::document) for the structure and usage.
//!
//! 2. **Segment Creation**: Once the memory limit threshold is reached or a commit is called, the
//! segment is written to the Directory. Documents are searchable after `commit`.
//!
//! 3. **Merging**: To optimize space and search speed, segments might be merged. This operation is
//! performed in the background. Customize the merge behaviour via
//! [IndexWriter::set_merge_policy].
#[cfg_attr(test, macro_use)]
extern crate serde_json;
#[macro_use]
Expand Down Expand Up @@ -137,7 +178,7 @@ pub use crate::future_result::FutureResult;
pub type Result<T> = std::result::Result<T, TantivyError>;

mod core;
mod indexer;
pub mod indexer;

#[allow(unused_doc_comments)]
pub mod error;
Expand All @@ -161,8 +202,7 @@ pub mod termdict;
mod reader;

pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
pub mod snippet;

mod docset;
use std::fmt;
Expand All @@ -173,6 +213,11 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};

pub use self::docset::{DocSet, TERMINATED};
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
)]
pub use self::snippet::{Snippet, SnippetGenerator};
#[doc(hidden)]
pub use crate::core::json_utils;
pub use crate::core::{
Expand All @@ -181,8 +226,12 @@ pub use crate::core::{
SegmentReader, SingleSegmentIndexWriter,
};
pub use crate::directory::Directory;
pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit};
pub use crate::indexer::IndexWriter;
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from indexer module instead"
)]
pub use crate::indexer::{merge_filtered_segments, merge_indices, PreparedCommit};
pub use crate::postings::Postings;
#[allow(deprecated)]
pub use crate::schema::DatePrecision;
Expand All @@ -191,9 +240,6 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocumen
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;

#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;

/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Version {
Expand Down
65 changes: 61 additions & 4 deletions src/snippet/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,59 @@
//! [`SnippetGenerator`]
//! Generates a text snippet for a given document, and some highlighted parts inside it.
//! Imagine you are doing a text search in a document
//! and want to show a preview of where in the document the search terms occur,
//! along with some surrounding text to give context, and the search terms highlighted.
//!
//! [`SnippetGenerator`] serves this purpose.
//! It scans a document and constructs a snippet, which consists of sections where the search terms
//! have been found, stitched together with "..." in between sections if necessary.
//!
//! ## Example
//!
//! ```rust
//! # use tantivy::query::QueryParser;
//! # use tantivy::schema::{Schema, TEXT};
//! # use tantivy::{doc, Index};
//! use tantivy::snippet::SnippetGenerator;
//!
//! # fn main() -> tantivy::Result<()> {
//! # let mut schema_builder = Schema::builder();
//! # let text_field = schema_builder.add_text_field("text", TEXT);
//! # let schema = schema_builder.build();
//! # let index = Index::create_in_ram(schema);
//! # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
//! # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
//! # Je ne me sentis plus guidé par les haleurs :
//! # Des Peaux-Rouges criards les avaient pris pour cibles,
//! # Les ayant cloués nus aux poteaux de couleurs.
//! #
//! # J'étais insoucieux de tous les équipages,
//! # Porteur de blés flamands ou de cotons anglais.
//! # Quand avec mes haleurs ont fini ces tapages,
//! # Les Fleuves m'ont laissé descendre où je voulais.
//! # "#);
//! # index_writer.add_document(doc.clone())?;
//! # index_writer.commit()?;
//! # let query_parser = QueryParser::for_index(&index, vec![text_field]);
//! // ...
//! let query = query_parser.parse_query("haleurs flamands").unwrap();
//! # let reader = index.reader()?;
//! # let searcher = reader.searcher();
//! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
//! snippet_generator.set_max_num_chars(100);
//! let snippet = snippet_generator.snippet_from_doc(&doc);
//! let snippet_html: String = snippet.to_html();
//! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
//! # Ok(())
//! # }
//! ```
//!
//! You can also specify the maximum number of characters for the snippets generated with the
//! `set_max_num_chars` method. By default, this limit is set to 150.
//!
//! SnippetGenerator needs to be created from the `Searcher` and the query, and the field on which
//! the `SnippetGenerator` should generate the snippets.

use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet};
use std::ops::Range;
Expand All @@ -16,7 +72,7 @@ const DEFAULT_SNIPPET_PREFIX: &str = "<b>";
const DEFAULT_SNIPPET_POSTFIX: &str = "</b>";

#[derive(Debug)]
pub struct FragmentCandidate {
pub(crate) struct FragmentCandidate {
score: Score,
start_offset: usize,
stop_offset: usize,
Expand Down Expand Up @@ -256,7 +312,7 @@ fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
/// # use tantivy::query::QueryParser;
/// # use tantivy::schema::{Schema, TEXT};
/// # use tantivy::{doc, Index};
/// use tantivy::SnippetGenerator;
/// use tantivy::snippet::SnippetGenerator;
///
/// # fn main() -> tantivy::Result<()> {
/// # let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -346,7 +402,7 @@ impl SnippetGenerator {
})
}

/// Sets a maximum number of chars.
/// Sets a maximum number of chars. Default is 150.
pub fn set_max_num_chars(&mut self, max_num_chars: usize) {
self.max_num_chars = max_num_chars;
}
Expand Down Expand Up @@ -398,8 +454,9 @@ mod tests {
use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination};
use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::snippet::SnippetGenerator;
use crate::tokenizer::{NgramTokenizer, SimpleTokenizer};
use crate::{Index, SnippetGenerator};
use crate::Index;

const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by
Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and
Expand Down
5 changes: 2 additions & 3 deletions src/termdict/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//! The term dictionary main role is to associate the sorted [`Term`s](crate::Term) to
//! a [`TermInfo`](crate::postings::TermInfo) struct that contains some meta-information
//! a [`TermInfo`] struct that contains some meta-information
//! about the term.
//!
//! Internally, the term dictionary relies on the `fst` crate to store
Expand All @@ -16,8 +16,7 @@
//! `f64`-terms are transformed to `u64` using a mapping that preserve order, and are then treated
//! as `u64`.
//!
//! A second datastructure makes it possible to access a
//! [`TermInfo`](crate::postings::TermInfo).
//! A second datastructure makes it possible to access a [`TermInfo`].

#[cfg(not(feature = "quickwit"))]
mod fst_termdict;
Expand Down
2 changes: 1 addition & 1 deletion tokenizer-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//! ready for indexing. This is a separate crate from tantivy, so implementors don't need to update
//! for each new tantivy version.
//!
//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
//! To add support for a tokenizer, implement the [`Tokenizer`] trait.
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.

use std::borrow::{Borrow, BorrowMut};
Expand Down

0 comments on commit c2b0469

Please sign in to comment.