Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

langchain_milvus: Add add_embeddings method for Milvus vector store #26770

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 61 additions & 10 deletions libs/partners/milvus/langchain_milvus/vectorstores/milvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,9 +738,69 @@ def add_texts(
Returns:
List[str]: The resulting keys for each inserted element.
"""
from pymilvus import Collection, MilvusException

texts = list(texts)

try:
embeddings: list = self.embedding_func.embed_documents(texts)
except NotImplementedError:
embeddings = [self.embedding_func.embed_query(x) for x in texts]

if len(embeddings) == 0:
logger.debug("Nothing to insert, skipping.")
return []

return self.add_embeddings(
text_embeddings=zip(texts, embeddings),
metadatas=metadatas,
timeout=timeout,
batch_size=batch_size,
ids=ids,
**kwargs,
)

def add_embeddings(
self,
text_embeddings: Iterable[Tuple[str, List[float]]],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably be

Suggested change
text_embeddings: Iterable[Tuple[str, List[float]]],
embeddings: Sequence[Tuple[str, List[float]]],

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a drive by review. I also don't know if it makes more sense to add embeddings as an optional argument to add_texts to avoid proliferation of entry points (we should see what's usually done for other vectorstores).

metadatas: Optional[List[dict]] = None,
timeout: Optional[float] = None,
batch_size: int = 1000,
*,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Insert text data with embeddings vectors into Milvus.
Inserting data when the collection has not be made yet will result
in creating a new Collection. The data of the first entity decides
the schema of the new collection, the dim is extracted from the first
embedding and the columns are decided by the first metadata dict.
Metadata keys will need to be present for all inserted values. At
the moment there is no None equivalent in Milvus.
Args:
text_embeddings (Iterable[Tuple[str, List[float]]]): The texts with
embeddings vectors, it is assumed that they all fit in memory.
metadatas (Optional[List[dict]]): Metadata dicts attached to each of
the texts. Defaults to None.
should be less than 65535 bytes. Required and work when auto_id is False.
timeout (Optional[float]): Timeout for each batch insert. Defaults
to None.
batch_size (int, optional): Batch size to use for insertion.
Defaults to 1000.
ids (Optional[List[str]]): List of text ids. The length of each item
Raises:
MilvusException: Failure to add texts and embeddings
Returns:
List[str]: The resulting keys for each inserted element.
"""

from pymilvus import Collection, MilvusException

texts, embeddings = zip(*text_embeddings)

if not self.auto_id:
assert isinstance(ids, list), (
"A list of valid ids are required when auto_id is False. "
Expand All @@ -762,15 +822,6 @@ def add_texts(
"The ids will be generated automatically."
)

try:
embeddings: list = self.embedding_func.embed_documents(texts)
except NotImplementedError:
embeddings = [self.embedding_func.embed_query(x) for x in texts]

if len(embeddings) == 0:
logger.debug("Nothing to insert, skipping.")
return []

# If the collection hasn't been initialized yet, perform all steps to do so
if not isinstance(self.col, Collection):
kwargs = {"embeddings": embeddings, "metadatas": metadatas}
Expand Down
2 changes: 1 addition & 1 deletion libs/partners/milvus/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "langchain-milvus"
version = "0.1.5"
version = "0.1.7"
description = "An integration package connecting Milvus and LangChain"
authors = []
readme = "README.md"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,22 @@ def test_milvus(temp_milvus_db: Any) -> None:
assert_docs_equal_without_pk(output, [Document(page_content="foo")])


def test_milvus_add_embeddings_search(temp_milvus_db: Any) -> None:
"""Test end to end with add embeddings"""
embed_func = FakeEmbeddings()
docsearch = Milvus(
embed_func,
connection_args={"uri": temp_milvus_db},
drop_old=True,
consistency_level="Strong",
auto_id=True,
)

docsearch.add_embeddings(zip(fake_texts, embed_func.embed_documents(fake_texts)))
output = docsearch.similarity_search("foo", k=1)
assert_docs_equal_without_pk(output, [Document(page_content="foo")])


def test_milvus_vector_search(temp_milvus_db: Any) -> None:
"""Test end to end construction and search by vector."""
docsearch = _milvus_from_texts(db_path=temp_milvus_db)
Expand Down