From e3f01143f8f6bc3f6a26bf762284bf6f24027726 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Thu, 15 Aug 2024 21:24:25 +0200 Subject: [PATCH 1/9] Bugfix for AzureSearch vector store --- .../vectorstores/azuresearch.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azuresearch.py b/libs/community/langchain_community/vectorstores/azuresearch.py index 5d7a6fc8edc8a..17729857a478b 100644 --- a/libs/community/langchain_community/vectorstores/azuresearch.py +++ b/libs/community/langchain_community/vectorstores/azuresearch.py @@ -428,6 +428,12 @@ def add_texts( logger.debug("Nothing to insert, skipping.") return [] + # when `keys`` are not passed in and there is `ids` in kwargs, use those instead + # base class expects `ids` passed in rather than `keys` + # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65 + if (not keys) and ('ids' in kwargs) and (len(kwargs['ids'])==len(embeddings)): + keys=kwargs['ids'] + return self.add_embeddings(zip(texts, embeddings), metadatas, keys=keys) async def aadd_texts( @@ -451,6 +457,12 @@ async def aadd_texts( if len(embeddings) == 0: logger.debug("Nothing to insert, skipping.") return [] + + # when `keys`` are not passed in and there is `ids` in kwargs, use those instead + # base class expects `ids` passed in rather than `keys` + # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65 + if (not keys) and ('ids' in kwargs) and (len(kwargs['ids'])==len(embeddings)): + keys=kwargs['ids'] return await self.aadd_embeddings(zip(texts, embeddings), metadatas, keys=keys) @@ -468,9 +480,13 @@ def add_embeddings( data = [] for i, (text, embedding) in enumerate(text_embeddings): # Use provided key otherwise use default key - key = keys[i] if keys else str(uuid.uuid4()) - # Encoding key for Azure Search valid characters - key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii") + if keys: + key = keys[i] + else: + key = str(uuid.uuid4()) + # Encoding key for Azure Search valid characters + key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii") + metadata = metadatas[i] if metadatas else {} # Add data to index # Additional metadata to fields mapping From 16922791c559a08bcd7da71b4bc10b940ce9793e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Thu, 15 Aug 2024 21:25:20 +0200 Subject: [PATCH 2/9] typo fix --- .../community/langchain_community/vectorstores/azuresearch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azuresearch.py b/libs/community/langchain_community/vectorstores/azuresearch.py index 17729857a478b..58c360700b2b1 100644 --- a/libs/community/langchain_community/vectorstores/azuresearch.py +++ b/libs/community/langchain_community/vectorstores/azuresearch.py @@ -428,7 +428,7 @@ def add_texts( logger.debug("Nothing to insert, skipping.") return [] - # when `keys`` are not passed in and there is `ids` in kwargs, use those instead + # when `keys` are not passed in and there is `ids` in kwargs, use those instead # base class expects `ids` passed in rather than `keys` # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65 if (not keys) and ('ids' in kwargs) and (len(kwargs['ids'])==len(embeddings)): @@ -458,7 +458,7 @@ async def aadd_texts( logger.debug("Nothing to insert, skipping.") return [] - # when `keys`` are not passed in and there is `ids` in kwargs, use those instead + # when `keys` are not passed in and there is `ids` in kwargs, use those instead # base class expects `ids` passed in rather than `keys` # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65 if (not keys) and ('ids' in kwargs) and (len(kwargs['ids'])==len(embeddings)): From d00bbc13401d1c0d25b243e8a4cf0fb50e677a55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Fri, 16 Aug 2024 14:30:31 +0200 Subject: [PATCH 3/9] auto-format --- .../vectorstores/azuresearch.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azuresearch.py b/libs/community/langchain_community/vectorstores/azuresearch.py index 58c360700b2b1..6fd12db4322a6 100644 --- a/libs/community/langchain_community/vectorstores/azuresearch.py +++ b/libs/community/langchain_community/vectorstores/azuresearch.py @@ -431,8 +431,8 @@ def add_texts( # when `keys` are not passed in and there is `ids` in kwargs, use those instead # base class expects `ids` passed in rather than `keys` # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65 - if (not keys) and ('ids' in kwargs) and (len(kwargs['ids'])==len(embeddings)): - keys=kwargs['ids'] + if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)): + keys = kwargs["ids"] return self.add_embeddings(zip(texts, embeddings), metadatas, keys=keys) @@ -457,12 +457,12 @@ async def aadd_texts( if len(embeddings) == 0: logger.debug("Nothing to insert, skipping.") return [] - + # when `keys` are not passed in and there is `ids` in kwargs, use those instead # base class expects `ids` passed in rather than `keys` - # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65 - if (not keys) and ('ids' in kwargs) and (len(kwargs['ids'])==len(embeddings)): - keys=kwargs['ids'] + # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65 + if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)): + keys = kwargs["ids"] return await self.aadd_embeddings(zip(texts, embeddings), metadatas, keys=keys) @@ -480,13 +480,13 @@ def add_embeddings( data = [] for i, (text, embedding) in enumerate(text_embeddings): # Use provided key otherwise use default key - if keys: + if keys: key = keys[i] else: key = str(uuid.uuid4()) # Encoding key for Azure Search valid characters key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii") - + metadata = metadatas[i] if metadatas else {} # Add data to index # Additional metadata to fields mapping From 48f724f69c83bb910bf30cfdc51d93aa7e3a3af3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Thu, 29 Aug 2024 15:54:54 +0200 Subject: [PATCH 4/9] added unit test --- .../vectorstores/test_azure_search.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py index a06fbfd151b0b..37cf8f77018f9 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py +++ b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py @@ -188,3 +188,35 @@ def mock_create_index() -> None: ) assert vector_store.client is not None assert vector_store.client._api_version == "test" + +@pytest.mark.requires("azure.search.documents") +def test_ids_used_correctly() -> None: + """Check whether vector store uses the document ids when provided with them. + """ + from langchain_core.documents import Document + from azure.search.documents.indexes import SearchIndexClient + from azure.search.documents import SearchClient + + def mock_upload_documents(self, documents): + # assume all documents uploaded successfuly + response = [type('Response', (object,), {'succeeded': True})() for _ in documents] + return response + + # + documents = [Document(page_content="page zero Lorem Ipsum", + metadata={"source":"document.pdf", + "page":0, + "id": "ID-document-1"}), + Document(page_content="page one Lorem Ipsum", + metadata={"source":"document.pdf", + "page":1, + "id": "ID-document-2"}) + ] + ids_provided = [i.metadata.get("id") for i in documents] + + with patch.object(SearchClient, "upload_documents", mock_upload_documents), \ + patch.object(SearchIndexClient, "get_index", mock_default_index): + vector_store = create_vector_store() + ids_used_at_upload = vector_store.add_texts(documents, ids=ids_provided) + assert len(ids_provided)==len(ids_used_at_upload) + assert ids_provided==ids_used_at_upload \ No newline at end of file From 3b8563f1204250ebef33e566da52d5b5a1a73422 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Thu, 29 Aug 2024 15:56:59 +0200 Subject: [PATCH 5/9] tiny formatting update --- .../tests/unit_tests/vectorstores/test_azure_search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py index 37cf8f77018f9..dc519c4731070 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py +++ b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py @@ -202,7 +202,6 @@ def mock_upload_documents(self, documents): response = [type('Response', (object,), {'succeeded': True})() for _ in documents] return response - # documents = [Document(page_content="page zero Lorem Ipsum", metadata={"source":"document.pdf", "page":0, @@ -219,4 +218,4 @@ def mock_upload_documents(self, documents): vector_store = create_vector_store() ids_used_at_upload = vector_store.add_texts(documents, ids=ids_provided) assert len(ids_provided)==len(ids_used_at_upload) - assert ids_provided==ids_used_at_upload \ No newline at end of file + assert ids_provided==ids_used_at_upload From de28870989262b71307983d69537cc5f130a4aaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Thu, 29 Aug 2024 16:06:30 +0200 Subject: [PATCH 6/9] formatting --- .../vectorstores/test_azure_search.py | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py index dc519c4731070..ea5c64e169866 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py +++ b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py @@ -189,33 +189,37 @@ def mock_create_index() -> None: assert vector_store.client is not None assert vector_store.client._api_version == "test" + @pytest.mark.requires("azure.search.documents") def test_ids_used_correctly() -> None: - """Check whether vector store uses the document ids when provided with them. - """ - from langchain_core.documents import Document - from azure.search.documents.indexes import SearchIndexClient + """Check whether vector store uses the document ids when provided with them.""" from azure.search.documents import SearchClient - + from azure.search.documents.indexes import SearchIndexClient + from langchain_core.documents import Document + def mock_upload_documents(self, documents): # assume all documents uploaded successfuly - response = [type('Response', (object,), {'succeeded': True})() for _ in documents] + response = [ + type("Response", (object,), {"succeeded": True})() for _ in documents + ] return response - - documents = [Document(page_content="page zero Lorem Ipsum", - metadata={"source":"document.pdf", - "page":0, - "id": "ID-document-1"}), - Document(page_content="page one Lorem Ipsum", - metadata={"source":"document.pdf", - "page":1, - "id": "ID-document-2"}) + + documents = [ + Document( + page_content="page zero Lorem Ipsum", + metadata={"source": "document.pdf", "page": 0, "id": "ID-document-1"}, + ), + Document( + page_content="page one Lorem Ipsum", + metadata={"source": "document.pdf", "page": 1, "id": "ID-document-2"}, + ), ] ids_provided = [i.metadata.get("id") for i in documents] - with patch.object(SearchClient, "upload_documents", mock_upload_documents), \ - patch.object(SearchIndexClient, "get_index", mock_default_index): + with patch.object( + SearchClient, "upload_documents", mock_upload_documents + ), patch.object(SearchIndexClient, "get_index", mock_default_index): vector_store = create_vector_store() ids_used_at_upload = vector_store.add_texts(documents, ids=ids_provided) - assert len(ids_provided)==len(ids_used_at_upload) - assert ids_provided==ids_used_at_upload + assert len(ids_provided) == len(ids_used_at_upload) + assert ids_provided == ids_used_at_upload From 9650275b43a84fa7a50b6bb1b0fa8cc876150525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Thu, 29 Aug 2024 16:22:19 +0200 Subject: [PATCH 7/9] fixed typing issues in test --- .../unit_tests/vectorstores/test_azure_search.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py index ea5c64e169866..e1d8c0d595515 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py +++ b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py @@ -197,11 +197,13 @@ def test_ids_used_correctly() -> None: from azure.search.documents.indexes import SearchIndexClient from langchain_core.documents import Document - def mock_upload_documents(self, documents): + class Response: + def __init__(self): + self.succeeded: bool = True + + def mock_upload_documents(self, documents: List[object]) -> List[Response]: # assume all documents uploaded successfuly - response = [ - type("Response", (object,), {"succeeded": True})() for _ in documents - ] + response = [Response() for _ in documents] return response documents = [ @@ -220,6 +222,6 @@ def mock_upload_documents(self, documents): SearchClient, "upload_documents", mock_upload_documents ), patch.object(SearchIndexClient, "get_index", mock_default_index): vector_store = create_vector_store() - ids_used_at_upload = vector_store.add_texts(documents, ids=ids_provided) + ids_used_at_upload = vector_store.add_documents(documents, ids=ids_provided) assert len(ids_provided) == len(ids_used_at_upload) assert ids_provided == ids_used_at_upload From 587267608b3e03e5f16856a05625abacb7da9c25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Thu, 29 Aug 2024 17:06:35 +0200 Subject: [PATCH 8/9] add "ignore [no-untyped-def]" in the test --- .../tests/unit_tests/vectorstores/test_azure_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py index e1d8c0d595515..271874f978cf1 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py +++ b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py @@ -198,10 +198,10 @@ def test_ids_used_correctly() -> None: from langchain_core.documents import Document class Response: - def __init__(self): + def __init__(self) -> None: self.succeeded: bool = True - def mock_upload_documents(self, documents: List[object]) -> List[Response]: + def mock_upload_documents(self, documents: List[object]) -> List[Response]: # type: ignore[no-untyped-def] # assume all documents uploaded successfuly response = [Response() for _ in documents] return response From 7c6c232d9eca5744d01c84aee24875eb6ccff551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=99=C3=ADska?= Date: Thu, 29 Aug 2024 17:11:55 +0200 Subject: [PATCH 9/9] formatting: literally added a space before comment :-/ --- .../tests/unit_tests/vectorstores/test_azure_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py index 271874f978cf1..255105f32894b 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_azure_search.py +++ b/libs/community/tests/unit_tests/vectorstores/test_azure_search.py @@ -201,7 +201,7 @@ class Response: def __init__(self) -> None: self.succeeded: bool = True - def mock_upload_documents(self, documents: List[object]) -> List[Response]: # type: ignore[no-untyped-def] + def mock_upload_documents(self, documents: List[object]) -> List[Response]: # type: ignore[no-untyped-def] # assume all documents uploaded successfuly response = [Response() for _ in documents] return response