Added a simple example of performing sparse dense hybrid search (#1990)

Signed-off-by: Buqian Zheng <[email protected]>
milvus-io · Mar 20, 2024 · 584d04b · 584d04b · seetimee · Mar 21, 2024
1 parent 73ada97
commit 584d04b
Showing 1 changed file with 115 additions and 0 deletions.
diff --git a/examples/hello_hybrid_sparse_dense.py b/examples/hello_hybrid_sparse_dense.py
@@ -0,0 +1,115 @@
+# A demo showing hybrid semantic search with dense and sparse vectors using Milvus.
+# You can optionally choose to use the BGE-M3 model to embed the text as dense
+# and sparse vectors, or simply use random generated vectors as the example.
+
+# To use BGE-M3 model, you need to install the optional `model` module in pymilvus:
+# pip install pymilvus[model]
+use_bge_m3 = True
+
+# The overall steps are as follows:
+# 1. embed the text as dense and sparse vectors
+# 2. setup a Milvus collection to store the dense and sparse vectors
+# 3. insert the data to Milvus
+# 4. search and inspect the result!
+import random
+import string
+import numpy as np
+
+from pymilvus import (
+    utility,
+    FieldSchema, CollectionSchema, DataType,
+    Collection, AnnSearchRequest, RRFRanker, connections,
+)
+
+# 1. prepare a small corpus to search
+docs = [
+    "Artificial intelligence was founded as an academic discipline in 1956.",
+    "Alan Turing was the first person to conduct substantial research in AI.",
+    "Born in Maida Vale, London, Turing was raised in southern England.",
+]
+# add some randomly generated texts
+docs.extend([' '.join(''.join(random.choice(string.ascii_lowercase) for _ in range(random.randint(1, 8))) for _ in range(10)) for _ in range(1000)])
+query = "Who started AI research?"
+
+def random_embedding(texts):
+    rng = np.random.default_rng()
+    return {
+        "dense": np.random.rand(len(texts), 768),
+        "sparse": [{d: rng.random() for d in random.sample(range(1000), random.randint(20, 30))} for _ in texts],
+    }
+
+dense_dim = 768
+ef = random_embedding
+
+if use_bge_m3:
+    # BGE-M3 model can embed texts as dense and sparse vectors.
+    # It is included in the optional `model` module in pymilvus, to install it,
+    # simply run "pip install pymilvus[model]".
+    from pymilvus.model.hybrid import BGEM3EmbeddingFunction
+    ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
+    dense_dim = ef.dim["dense"]
+
+docs_embeddings = ef(docs)
+query_embeddings = ef([query])
+
+# 2. setup Milvus collection and index
+connections.connect("default", host="localhost", port="19530")
+
+# Specify the data schema for the new Collection.
+fields = [
+    # Use auto generated id as primary key
+    FieldSchema(name="pk", dtype=DataType.VARCHAR,
+                is_primary=True, auto_id=True, max_length=100),
+    # Store the original text to retrieve based on semantically distance
+    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512),
+    # Milvus now supports both sparse and dense vectors, we can store each in
+    # a separate field to conduct hybrid search on both vectors.
+    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
+    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR,
+                dim=dense_dim),
+]
+schema = CollectionSchema(fields, "")
+col_name = 'hybrid_demo'
+# Now we can create the new collection with above name and schema.
+col = Collection(col_name, schema, consistency_level="Strong")
+
+# We need to create indices for the vector fields. The indices will be loaded
+# into memory for efficient search.
+sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
+col.create_index("sparse_vector", sparse_index)
+dense_index = {"index_type": "FLAT", "metric_type": "L2"}
+col.create_index("dense_vector", dense_index)
+col.load()
+
+# 3. insert text and sparse/dense vector representations into the collection
+entities = [docs, docs_embeddings["sparse"], docs_embeddings["dense"]]
+col.insert(entities)
+col.flush()
+
+# 4. search and inspect the result!
+k = 2 # we want to get the top 2 docs closest to the query
+
+# Prepare the search requests for both vector fields
+sparse_search_params = {"metric_type": "IP"}
+sparse_req = AnnSearchRequest(query_embeddings["sparse"],
+                              "sparse_vector", sparse_search_params, limit=k)
+dense_search_params = {"metric_type": "L2"}
+dense_req = AnnSearchRequest(query_embeddings["dense"],
+                             "dense_vector", dense_search_params, limit=k)
+
+# Search topK docs based on dense and sparse vectors and rerank with RRF.
+res = col.hybrid_search([sparse_req, dense_req], rerank=RRFRanker(),
+                        limit=k, output_fields=['text'])
+
+# Currently Milvus only support 1 query in the same hybrid search request, so
+# we inspect res[0] directly. In future release Milvus will accept batch
+# hybrid search queries in the same call.
+for hit in res[0]:
+    print(f'text: {hit.fields["text"]} distance {hit.distance}')
+
+# If you are using BGE-M3 to generate the embedding, you should see the following:
+# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.032786883413791656
+# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.016129031777381897
+
+# Drop the collection to clean up the data.
+utility.drop_collection(col_name)