diff --git a/libs/partners/milvus/langchain_milvus/vectorstores/milvus.py b/libs/partners/milvus/langchain_milvus/vectorstores/milvus.py index 279f96303698b..8116fa4401445 100644 --- a/libs/partners/milvus/langchain_milvus/vectorstores/milvus.py +++ b/libs/partners/milvus/langchain_milvus/vectorstores/milvus.py @@ -738,9 +738,69 @@ def add_texts( Returns: List[str]: The resulting keys for each inserted element. """ - from pymilvus import Collection, MilvusException texts = list(texts) + + try: + embeddings: list = self.embedding_func.embed_documents(texts) + except NotImplementedError: + embeddings = [self.embedding_func.embed_query(x) for x in texts] + + if len(embeddings) == 0: + logger.debug("Nothing to insert, skipping.") + return [] + + return self.add_embeddings( + text_embeddings=zip(texts, embeddings), + metadatas=metadatas, + timeout=timeout, + batch_size=batch_size, + ids=ids, + **kwargs, + ) + + def add_embeddings( + self, + text_embeddings: Iterable[Tuple[str, List[float]]], + metadatas: Optional[List[dict]] = None, + timeout: Optional[float] = None, + batch_size: int = 1000, + *, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Insert text data with embeddings vectors into Milvus. + + Inserting data when the collection has not be made yet will result + in creating a new Collection. The data of the first entity decides + the schema of the new collection, the dim is extracted from the first + embedding and the columns are decided by the first metadata dict. + Metadata keys will need to be present for all inserted values. At + the moment there is no None equivalent in Milvus. + + Args: + text_embeddings (Iterable[Tuple[str, List[float]]]): The texts with + embeddings vectors, it is assumed that they all fit in memory. + metadatas (Optional[List[dict]]): Metadata dicts attached to each of + the texts. Defaults to None. + should be less than 65535 bytes. Required and work when auto_id is False. + timeout (Optional[float]): Timeout for each batch insert. Defaults + to None. + batch_size (int, optional): Batch size to use for insertion. + Defaults to 1000. + ids (Optional[List[str]]): List of text ids. The length of each item + + Raises: + MilvusException: Failure to add texts and embeddings + + Returns: + List[str]: The resulting keys for each inserted element. + """ + + from pymilvus import Collection, MilvusException + + texts, embeddings = zip(*text_embeddings) + if not self.auto_id: assert isinstance(ids, list), ( "A list of valid ids are required when auto_id is False. " @@ -762,15 +822,6 @@ def add_texts( "The ids will be generated automatically." ) - try: - embeddings: list = self.embedding_func.embed_documents(texts) - except NotImplementedError: - embeddings = [self.embedding_func.embed_query(x) for x in texts] - - if len(embeddings) == 0: - logger.debug("Nothing to insert, skipping.") - return [] - # If the collection hasn't been initialized yet, perform all steps to do so if not isinstance(self.col, Collection): kwargs = {"embeddings": embeddings, "metadatas": metadatas} diff --git a/libs/partners/milvus/pyproject.toml b/libs/partners/milvus/pyproject.toml index bfa8fcbea51e4..d22ba74c94ac3 100644 --- a/libs/partners/milvus/pyproject.toml +++ b/libs/partners/milvus/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "langchain-milvus" -version = "0.1.5" +version = "0.1.7" description = "An integration package connecting Milvus and LangChain" authors = [] readme = "README.md" diff --git a/libs/partners/milvus/tests/integration_tests/vectorstores/test_milvus.py b/libs/partners/milvus/tests/integration_tests/vectorstores/test_milvus.py index fffb0bb85def1..3b49a496afbad 100644 --- a/libs/partners/milvus/tests/integration_tests/vectorstores/test_milvus.py +++ b/libs/partners/milvus/tests/integration_tests/vectorstores/test_milvus.py @@ -63,6 +63,22 @@ def test_milvus(temp_milvus_db: Any) -> None: assert_docs_equal_without_pk(output, [Document(page_content="foo")]) +def test_milvus_add_embeddings_search(temp_milvus_db: Any) -> None: + """Test end to end with add embeddings""" + embed_func = FakeEmbeddings() + docsearch = Milvus( + embed_func, + connection_args={"uri": temp_milvus_db}, + drop_old=True, + consistency_level="Strong", + auto_id=True, + ) + + docsearch.add_embeddings(zip(fake_texts, embed_func.embed_documents(fake_texts))) + output = docsearch.similarity_search("foo", k=1) + assert_docs_equal_without_pk(output, [Document(page_content="foo")]) + + def test_milvus_vector_search(temp_milvus_db: Any) -> None: """Test end to end construction and search by vector.""" docsearch = _milvus_from_texts(db_path=temp_milvus_db)