diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py index 27eba6ecf..75d6270ca 100644 --- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py +++ b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py @@ -267,6 +267,20 @@ def _embedding_retrieval( return self._convert_query_result_to_documents(result) + @staticmethod + def _convert_meta_to_int(metadata: Dict[str, Any]) -> Dict[str, Any]: + """ + Pinecone stores numeric metadata values as `float`. Some specific metadata are used in Retriever components and + are expected to be `int`. This method converts them back to integers. + """ + values_to_convert = ["split_id", "split_idx_start", "page_number"] + + for value in values_to_convert: + if value in metadata: + metadata[value] = int(metadata[value]) if isinstance(metadata[value], float) else metadata[value] + + return metadata + def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> List[Document]: pinecone_docs = query_result["matches"] documents = [] @@ -278,8 +292,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li if dataframe_string: dataframe = pd.read_json(io.StringIO(dataframe_string)) - # we always store vectors during writing - # but we don't want to return them if they are dummy vectors + # we always store vectors during writing but we don't want to return them if they are dummy vectors embedding = None if pinecone_doc["values"] != self._dummy_vector: embedding = pinecone_doc["values"] @@ -288,7 +301,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li id=pinecone_doc["id"], content=content, dataframe=dataframe, - meta=pinecone_doc["metadata"], + meta=self._convert_meta_to_int(pinecone_doc["metadata"]), embedding=embedding, 
score=pinecone_doc["score"], ) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index bd443b4a8..dcecf7996 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -5,10 +5,13 @@ import numpy as np import pytest from haystack import Document +from haystack.components.preprocessors import DocumentSplitter +from haystack.components.retrievers import SentenceWindowRetriever from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest from haystack.utils import Secret from pinecone import Pinecone, PodSpec, ServerlessSpec +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever from haystack_integrations.document_stores.pinecone import PineconeDocumentStore @@ -178,6 +181,36 @@ def test_discard_invalid_meta_valid(): assert pinecone_doc.meta["page_number"] == 1 +def test_convert_meta_to_int(): + # Test with floats + meta_data = {"split_id": 1.0, "split_idx_start": 2.0, "page_number": 3.0} + assert PineconeDocumentStore._convert_meta_to_int(meta_data) == { + "split_id": 1, + "split_idx_start": 2, + "page_number": 3, + } + + # Test with floats and ints + meta_data = {"split_id": 1.0, "split_idx_start": 2, "page_number": 3.0} + assert PineconeDocumentStore._convert_meta_to_int(meta_data) == { + "split_id": 1, + "split_idx_start": 2, + "page_number": 3, + } + + # Test with floats and strings + meta_data = {"split_id": 1.0, "other": "other_data", "page_number": 3.0} + assert PineconeDocumentStore._convert_meta_to_int(meta_data) == { + "split_id": 1, + "other": "other_data", + "page_number": 3, + } + + # Test with empty dict + meta_data = {} + assert PineconeDocumentStore._convert_meta_to_int(meta_data) == {} + + @pytest.mark.integration @pytest.mark.skipif("PINECONE_API_KEY" not in os.environ, reason="PINECONE_API_KEY not set") def 
test_serverless_index_creation_from_scratch(sleep_time): @@ -257,3 +290,28 @@ def test_embedding_retrieval(self, document_store: PineconeDocumentStore): assert len(results) == 2 assert results[0].content == "Most similar document" assert results[1].content == "2nd best document" + + def test_sentence_window_retriever(self, document_store: PineconeDocumentStore): + # indexing + splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word") + text = ( + "Whose woods these are I think I know. His house is in the village though; He will not see me stopping " + "here To watch his woods fill up with snow." + ) + docs = splitter.run(documents=[Document(content=text)]) + + for idx, doc in enumerate(docs["documents"]): + if idx == 2: + doc.embedding = [0.1] * 768 + continue + doc.embedding = np.random.rand(768).tolist() + document_store.write_documents(docs["documents"]) + + # query + embedding_retriever = PineconeEmbeddingRetriever(document_store=document_store) + query_embedding = [0.1] * 768 + retrieved_doc = embedding_retriever.run(query_embedding=query_embedding, top_k=1, filters={}) + sentence_window_retriever = SentenceWindowRetriever(document_store=document_store, window_size=2) + result = sentence_window_retriever.run(retrieved_documents=[retrieved_doc["documents"][0]]) + + assert len(result["context_windows"]) == 1