fix: converting Pinecone metadata fields from float back to int (#1034)

* Pinecone converting floats back to int
* linting
* simplifying tests
* fixing tests
* fixing test
* linting issues
* adding and sleep between inserting and querying
* cleaning the indexes after tests ran
* ruff fix
* fixing tests
* Update integrations/pinecone/tests/test_document_store.py Co-authored-by: Stefano Fiorucci <[email protected]>
* changing docstring for private utility function
* adding unit tests to private function
* fixing linting
* fixing linting according to black rules
* fixing linting
* removing time sleep, it's contained in the fixture
* fixes

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
deepset-ai · Aug 29, 2024 · 52da6c7 · 52da6c7
1 parent 32f3ffb
commit 52da6c7
Show file tree

Hide file tree

Showing 2 changed files with 74 additions and 3 deletions.
diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py
@@ -267,6 +267,20 @@ def _embedding_retrieval(
 
         return self._convert_query_result_to_documents(result)
 
+    @staticmethod
+    def _convert_meta_to_int(metadata: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Pinecone store numeric metadata values as `float`. Some specific metadata are used in Retrievers components and
+        are expected to be `int`. This method converts them back to integers.
+        """
+        values_to_convert = ["split_id", "split_idx_start", "page_number"]
+
+        for value in values_to_convert:
+            if value in metadata:
+                metadata[value] = int(metadata[value]) if isinstance(metadata[value], float) else metadata[value]
+
+        return metadata
+
     def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> List[Document]:
         pinecone_docs = query_result["matches"]
         documents = []
@@ -278,8 +292,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
             if dataframe_string:
                 dataframe = pd.read_json(io.StringIO(dataframe_string))
 
-            # we always store vectors during writing
-            # but we don't want to return them if they are dummy vectors
+            # we always store vectors during writing but we don't want to return them if they are dummy vectors
             embedding = None
             if pinecone_doc["values"] != self._dummy_vector:
                 embedding = pinecone_doc["values"]
@@ -288,7 +301,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
                 id=pinecone_doc["id"],
                 content=content,
                 dataframe=dataframe,
-                meta=pinecone_doc["metadata"],
+                meta=self._convert_meta_to_int(pinecone_doc["metadata"]),
                 embedding=embedding,
                 score=pinecone_doc["score"],
             )

diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
@@ -5,10 +5,13 @@
 import numpy as np
 import pytest
 from haystack import Document
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.retrievers import SentenceWindowRetriever
 from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest
 from haystack.utils import Secret
 from pinecone import Pinecone, PodSpec, ServerlessSpec
 
+from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
 from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
 
 
@@ -178,6 +181,36 @@ def test_discard_invalid_meta_valid():
     assert pinecone_doc.meta["page_number"] == 1
 
 
+def test_convert_meta_to_int():
+    # Test with floats
+    meta_data = {"split_id": 1.0, "split_idx_start": 2.0, "page_number": 3.0}
+    assert PineconeDocumentStore._convert_meta_to_int(meta_data) == {
+        "split_id": 1,
+        "split_idx_start": 2,
+        "page_number": 3,
+    }
+
+    # Test with floats and ints
+    meta_data = {"split_id": 1.0, "split_idx_start": 2, "page_number": 3.0}
+    assert PineconeDocumentStore._convert_meta_to_int(meta_data) == {
+        "split_id": 1,
+        "split_idx_start": 2,
+        "page_number": 3,
+    }
+
+    # Test with floats and strings
+    meta_data = {"split_id": 1.0, "other": "other_data", "page_number": 3.0}
+    assert PineconeDocumentStore._convert_meta_to_int(meta_data) == {
+        "split_id": 1,
+        "other": "other_data",
+        "page_number": 3,
+    }
+
+    # Test with empty dict
+    meta_data = {}
+    assert PineconeDocumentStore._convert_meta_to_int(meta_data) == {}
+
+
 @pytest.mark.integration
 @pytest.mark.skipif("PINECONE_API_KEY" not in os.environ, reason="PINECONE_API_KEY not set")
 def test_serverless_index_creation_from_scratch(sleep_time):
@@ -257,3 +290,28 @@ def test_embedding_retrieval(self, document_store: PineconeDocumentStore):
         assert len(results) == 2
         assert results[0].content == "Most similar document"
         assert results[1].content == "2nd best document"
+
+    def test_sentence_window_retriever(self, document_store: PineconeDocumentStore):
+        # indexing
+        splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
+        text = (
+            "Whose woods these are I think I know. His house is in the village though; He will not see me stopping "
+            "here To watch his woods fill up with snow."
+        )
+        docs = splitter.run(documents=[Document(content=text)])
+
+        for idx, doc in enumerate(docs["documents"]):
+            if idx == 2:
+                doc.embedding = [0.1] * 768
+                continue
+            doc.embedding = np.random.rand(768).tolist()
+        document_store.write_documents(docs["documents"])
+
+        # query
+        embedding_retriever = PineconeEmbeddingRetriever(document_store=document_store)
+        query_embedding = [0.1] * 768
+        retrieved_doc = embedding_retriever.run(query_embedding=query_embedding, top_k=1, filters={})
+        sentence_window_retriever = SentenceWindowRetriever(document_store=document_store, window_size=2)
+        result = sentence_window_retriever.run(retrieved_documents=[retrieved_doc["documents"][0]])
+
+        assert len(result["context_windows"]) == 1