community : [bugfix] Use document ids as keys in AzureSearch vectorstore (#25486)

# Description
The [vector store base class](https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65) currently expects `ids` to be passed in, and that is what it forwards to the AzureSearch vector store when calling `add_texts()`. However, AzureSearch expects `keys`; when they are not present, AzureSearch's `add_embeddings()` makes up new UUIDs. This is a problem when running indexing: the [indexing code expects](https://github.com/langchain-ai/langchain/blob/b297af5482ae7c6d26779513d637ec657a1cd552/libs/core/langchain_core/indexing/api.py#L371) documents to be uploaded under the provided ids, but AzureSearch currently ignores the `ids` passed from `indexing` and makes up new ones. Later, when the indexer attempts to delete a removed document, it uses the `id` it stored when uploading the document, while the document was actually uploaded under a different `id`.
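
To make the mismatch concrete, here is a minimal, self-contained sketch of the key-resolution logic this fix introduces. The `resolve_keys` helper is hypothetical (written for illustration only, not part of the library), but it mirrors the guard added to `add_texts()` in the diff below: prefer explicit `keys`, then the `ids` forwarded by the base class, and only then fall back to fresh UUIDs.

```python
import uuid
from typing import Any, List, Optional


def resolve_keys(keys: Optional[List[str]], n_texts: int, **kwargs: Any) -> List[str]:
    """Hypothetical helper mirroring the patched behaviour."""
    if keys:
        return list(keys)
    ids = kwargs.get("ids")
    if ids is not None and len(ids) == n_texts:
        # Honour the ids the base class (and the indexer) passed along.
        return list(ids)
    # Old behaviour for every call without `keys`: invent new ids, which breaks
    # the indexer's later delete-by-id bookkeeping.
    return [str(uuid.uuid4()) for _ in range(n_texts)]


print(resolve_keys(None, 2, ids=["ID-document-1", "ID-document-2"]))
# -> ['ID-document-1', 'ID-document-2']
print(len(resolve_keys(None, 2)))  # -> 2 (random UUIDs)
```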

**Twitter handle: @martintriska1**
MacanPN authored Sep 19, 2024
1 parent a8561bc commit 3fc0ea5
Showing 2 changed files with 56 additions and 3 deletions.
22 changes: 19 additions & 3 deletions libs/community/langchain_community/vectorstores/azuresearch.py
@@ -443,6 +443,12 @@ def add_texts(
logger.debug("Nothing to insert, skipping.")
return []

# when `keys` are not passed in and there is `ids` in kwargs, use those instead
# base class expects `ids` passed in rather than `keys`
# https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65
if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)):
keys = kwargs["ids"]

return self.add_embeddings(zip(texts, embeddings), metadatas, keys=keys)
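
In usage terms, a hedged sketch of what this guard enables (it assumes `vector_store` is an already configured `AzureSearch` instance; construction arguments are omitted): ids supplied through the standard `add_texts()` / `add_documents()` keyword now become the Azure Search document keys instead of being replaced by random UUIDs.

```python
# Sketch only; `vector_store` is assumed to be a configured AzureSearch instance.
texts = ["page zero Lorem Ipsum", "page one Lorem Ipsum"]
ids = ["ID-document-1", "ID-document-2"]

returned_ids = vector_store.add_texts(texts, ids=ids)
# With this patch the provided ids are forwarded as `keys`, so the documents
# are uploaded under them and the same ids come back to the caller.
assert returned_ids == ids
```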

@@ -467,6 +473,12 @@ async def aadd_texts(
logger.debug("Nothing to insert, skipping.")
return []

# when `keys` are not passed in and there is `ids` in kwargs, use those instead
# base class expects `ids` passed in rather than `keys`
# https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65
if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)):
keys = kwargs["ids"]

return await self.aadd_embeddings(zip(texts, embeddings), metadatas, keys=keys)

@@ -483,9 +495,13 @@ def add_embeddings(
         data = []
         for i, (text, embedding) in enumerate(text_embeddings):
             # Use provided key otherwise use default key
-            key = keys[i] if keys else str(uuid.uuid4())
-            # Encoding key for Azure Search valid characters
-            key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
+            if keys:
+                key = keys[i]
+            else:
+                key = str(uuid.uuid4())
+                # Encoding key for Azure Search valid characters
+                key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
+
             metadata = metadatas[i] if metadatas else {}
             # Add data to index
             # Additional metadata to fields mapping
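
As a side note on the key handling above: with this change only the auto-generated fallback keys go through the base64 step; caller-provided keys (including `ids` forwarded from the indexer) are used verbatim. A small standalone sketch of that encoding, for illustration only:

```python
import base64
import uuid

# Fallback path: a fresh UUID is URL-safe base64 encoded so the resulting key
# only contains characters Azure Search accepts in document keys.
generated = str(uuid.uuid4())
encoded = base64.urlsafe_b64encode(bytes(generated, "utf-8")).decode("ascii")
print(generated, "->", encoded)

# A caller-provided id such as "ID-document-1" is now used as the key as-is.
```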
37 changes: 37 additions & 0 deletions libs/community/tests/unit_tests/vectorstores/test_azure_search.py
@@ -190,3 +190,40 @@ def mock_create_index() -> None:
     )
     assert vector_store.client is not None
     assert vector_store.client._api_version == "test"
+
+
+@pytest.mark.requires("azure.search.documents")
+def test_ids_used_correctly() -> None:
+    """Check whether vector store uses the document ids when provided with them."""
+    from azure.search.documents import SearchClient
+    from azure.search.documents.indexes import SearchIndexClient
+    from langchain_core.documents import Document
+
+    class Response:
+        def __init__(self) -> None:
+            self.succeeded: bool = True
+
+    def mock_upload_documents(self, documents: List[object]) -> List[Response]:  # type: ignore[no-untyped-def]
+        # assume all documents uploaded successfully
+        response = [Response() for _ in documents]
+        return response
+
+    documents = [
+        Document(
+            page_content="page zero Lorem Ipsum",
+            metadata={"source": "document.pdf", "page": 0, "id": "ID-document-1"},
+        ),
+        Document(
+            page_content="page one Lorem Ipsum",
+            metadata={"source": "document.pdf", "page": 1, "id": "ID-document-2"},
+        ),
+    ]
+    ids_provided = [i.metadata.get("id") for i in documents]
+
+    with patch.object(
+        SearchClient, "upload_documents", mock_upload_documents
+    ), patch.object(SearchIndexClient, "get_index", mock_default_index):
+        vector_store = create_vector_store()
+        ids_used_at_upload = vector_store.add_documents(documents, ids=ids_provided)
+        assert len(ids_provided) == len(ids_used_at_upload)
+        assert ids_provided == ids_used_at_upload
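
For completeness, a hedged sketch of the indexing flow the description refers to. It assumes `vector_store` is a configured `AzureSearch` instance and that `InMemoryRecordManager` and `index` are importable from `langchain_core.indexing` as in current releases; the point is that the ids the indexer records must be the ids the documents were actually uploaded under, which is what this fix guarantees.

```python
from langchain_core.documents import Document
from langchain_core.indexing import InMemoryRecordManager, index

# The record manager remembers which ids were uploaded; the namespace is arbitrary.
record_manager = InMemoryRecordManager(namespace="azuresearch/example")

docs = [
    Document(page_content="page zero Lorem Ipsum", metadata={"source": "document.pdf"}),
    Document(page_content="page one Lorem Ipsum", metadata={"source": "document.pdf"}),
]

# The indexer derives deterministic ids, uploads under them, and records them.
# If the vector store replaced them with fresh UUIDs (the pre-fix behaviour),
# a later cleanup pass would try to delete ids that were never uploaded.
result = index(docs, record_manager, vector_store, cleanup="full", source_id_key="source")
print(result)  # e.g. {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
```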
