Skip to content

Commit

Permalink
fix: converting Pinecone metadata fields from float back to int (#1034
Browse files Browse the repository at this point in the history
)

* Pinecone converting floats back to int

* linting

* simplifying tests

* fixing tests

* fixing test

* linting issues

* adding a sleep between inserting and querying

* cleaning the indexes after tests ran

* ruff fix

* fixing tests

* Update integrations/pinecone/tests/test_document_store.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* changing docstring for private utility function

* adding unit tests to private function

* fixing linting

* fixing linting according to black rules

* fixing linting

* removing time sleep, it's contained in the fixture

* fixes

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
  • Loading branch information
davidsbatista and anakin87 authored Aug 29, 2024
1 parent 32f3ffb commit 52da6c7
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,20 @@ def _embedding_retrieval(

return self._convert_query_result_to_documents(result)

@staticmethod
def _convert_meta_to_int(metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Restore integer metadata fields that Pinecone returned as floats.

    Pinecone stores numeric metadata values as `float`. Some specific metadata fields are used in Retriever
    components and are expected to be `int`. This method converts them back to integers.

    Only whole-number floats (e.g. `3.0`) are converted. Non-integral values, NaN and infinities are left
    untouched, so a value is never silently truncated and `int()` never raises on a non-finite float.

    :param metadata: Metadata dictionary of a document. It is updated in place.
    :returns: The same dictionary object, with the known integer fields converted where applicable.
    """
    keys_to_convert = ("split_id", "split_idx_start", "page_number")

    for key in keys_to_convert:
        value = metadata.get(key)
        # `is_integer()` is False for NaN and +/-inf, the inputs on which `int()` would raise
        if isinstance(value, float) and value.is_integer():
            metadata[key] = int(value)

    return metadata

def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> List[Document]:
pinecone_docs = query_result["matches"]
documents = []
Expand All @@ -278,8 +292,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
if dataframe_string:
dataframe = pd.read_json(io.StringIO(dataframe_string))

# we always store vectors during writing
# but we don't want to return them if they are dummy vectors
# we always store vectors during writing but we don't want to return them if they are dummy vectors
embedding = None
if pinecone_doc["values"] != self._dummy_vector:
embedding = pinecone_doc["values"]
Expand All @@ -288,7 +301,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
id=pinecone_doc["id"],
content=content,
dataframe=dataframe,
meta=pinecone_doc["metadata"],
meta=self._convert_meta_to_int(pinecone_doc["metadata"]),
embedding=embedding,
score=pinecone_doc["score"],
)
Expand Down
58 changes: 58 additions & 0 deletions integrations/pinecone/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
import numpy as np
import pytest
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.retrievers import SentenceWindowRetriever
from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest
from haystack.utils import Secret
from pinecone import Pinecone, PodSpec, ServerlessSpec

from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore


Expand Down Expand Up @@ -178,6 +181,36 @@ def test_discard_invalid_meta_valid():
assert pinecone_doc.meta["page_number"] == 1


def test_convert_meta_to_int():
    """
    Check that `_convert_meta_to_int` turns the known float metadata fields into real `int` values.

    Value equality alone is not enough here: in Python `1 == 1.0`, so comparing dictionaries would pass
    even if no conversion happened. Each case therefore also compares the concrete type of every value.
    """
    cases = [
        # only floats
        (
            {"split_id": 1.0, "split_idx_start": 2.0, "page_number": 3.0},
            {"split_id": 1, "split_idx_start": 2, "page_number": 3},
        ),
        # floats and ints
        (
            {"split_id": 1.0, "split_idx_start": 2, "page_number": 3.0},
            {"split_id": 1, "split_idx_start": 2, "page_number": 3},
        ),
        # floats and strings
        (
            {"split_id": 1.0, "other": "other_data", "page_number": 3.0},
            {"split_id": 1, "other": "other_data", "page_number": 3},
        ),
        # empty dict
        ({}, {}),
    ]

    for meta_data, expected in cases:
        converted = PineconeDocumentStore._convert_meta_to_int(meta_data)
        assert converted == expected
        # guards against the `1 == 1.0` blind spot of the equality check above
        for key, expected_value in expected.items():
            assert type(converted[key]) is type(expected_value)


@pytest.mark.integration
@pytest.mark.skipif("PINECONE_API_KEY" not in os.environ, reason="PINECONE_API_KEY not set")
def test_serverless_index_creation_from_scratch(sleep_time):
Expand Down Expand Up @@ -257,3 +290,28 @@ def test_embedding_retrieval(self, document_store: PineconeDocumentStore):
assert len(results) == 2
assert results[0].content == "Most similar document"
assert results[1].content == "2nd best document"

def test_sentence_window_retriever(self, document_store: PineconeDocumentStore):
    """
    Index overlapping word-based splits of a short text, retrieve the top match by embedding, and check
    that `SentenceWindowRetriever` builds exactly one context window for it.

    The third split gets a fixed embedding identical to the query embedding; all other splits get random
    embeddings.
    """
    # indexing
    poem = (
        "Whose woods these are I think I know. His house is in the village though; He will not see me stopping "
        "here To watch his woods fill up with snow."
    )
    word_splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
    split_result = word_splitter.run(documents=[Document(content=poem)])

    for position, chunk in enumerate(split_result["documents"]):
        if position == 2:
            # fixed vector, same values as the query embedding used below
            chunk.embedding = [0.1] * 768
        else:
            chunk.embedding = np.random.rand(768).tolist()
    document_store.write_documents(split_result["documents"])

    # query
    retriever = PineconeEmbeddingRetriever(document_store=document_store)
    retrieval = retriever.run(query_embedding=[0.1] * 768, top_k=1, filters={})
    top_document = retrieval["documents"][0]

    window_retriever = SentenceWindowRetriever(document_store=document_store, window_size=2)
    windows = window_retriever.run(retrieved_documents=[top_document])

    assert len(windows["context_windows"]) == 1

0 comments on commit 52da6c7

Please sign in to comment.