Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: converting Pinecone metadata fields from float back to int #1034

Merged
merged 19 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,20 @@ def _embedding_retrieval(

return self._convert_query_result_to_documents(result)

@staticmethod
def _convert_meta_to_int(metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert selected numeric metadata fields from `float` back to `int`.

    Pinecone stores numeric metadata values as `float`. Some specific metadata fields are used in Retriever
    components and are expected to be `int`. This method converts them back to integers.

    Only `float` values that hold a whole number are converted. Fractional values, NaN and infinities are left
    unchanged: they cannot be turned into an `int` without losing information (or without raising), so a single
    odd value does not break the conversion of the whole query result.

    :param metadata: Metadata dictionary of a document. It is updated in place.
    :returns: The same dictionary object, with the known fields converted where possible.
    """
    fields_to_convert = ("split_id", "split_idx_start", "page_number")

    for field in fields_to_convert:
        value = metadata.get(field)
        # `float.is_integer()` is False for NaN, +/-inf and fractional numbers, so `int()` is only ever
        # called on values it can represent exactly. Non-float values (including missing keys) are skipped.
        if isinstance(value, float) and value.is_integer():
            metadata[field] = int(value)

    return metadata

def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> List[Document]:
pinecone_docs = query_result["matches"]
documents = []
Expand All @@ -278,8 +292,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
if dataframe_string:
dataframe = pd.read_json(io.StringIO(dataframe_string))

# we always store vectors during writing
# but we don't want to return them if they are dummy vectors
# we always store vectors during writing but we don't want to return them if they are dummy vectors
embedding = None
if pinecone_doc["values"] != self._dummy_vector:
embedding = pinecone_doc["values"]
Expand All @@ -288,7 +301,7 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
id=pinecone_doc["id"],
content=content,
dataframe=dataframe,
meta=pinecone_doc["metadata"],
meta=self._convert_meta_to_int(pinecone_doc["metadata"]),
embedding=embedding,
score=pinecone_doc["score"],
)
Expand Down
58 changes: 58 additions & 0 deletions integrations/pinecone/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
import numpy as np
import pytest
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.retrievers import SentenceWindowRetriever
from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest
from haystack.utils import Secret
from pinecone import Pinecone, PodSpec, ServerlessSpec

from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore


Expand Down Expand Up @@ -178,6 +181,36 @@ def test_discard_invalid_meta_valid():
assert pinecone_doc.meta["page_number"] == 1


def test_convert_meta_to_int():
    """
    Check `PineconeDocumentStore._convert_meta_to_int` against a table of (input, expected) metadata pairs.
    """
    cases = [
        # all known fields stored as floats
        (
            {"split_id": 1.0, "split_idx_start": 2.0, "page_number": 3.0},
            {"split_id": 1, "split_idx_start": 2, "page_number": 3},
        ),
        # a mix of floats and values that are already ints
        (
            {"split_id": 1.0, "split_idx_start": 2, "page_number": 3.0},
            {"split_id": 1, "split_idx_start": 2, "page_number": 3},
        ),
        # floats next to an unrelated string field
        (
            {"split_id": 1.0, "other": "other_data", "page_number": 3.0},
            {"split_id": 1, "other": "other_data", "page_number": 3},
        ),
        # empty metadata
        ({}, {}),
    ]

    for raw_meta, expected_meta in cases:
        assert PineconeDocumentStore._convert_meta_to_int(raw_meta) == expected_meta


@pytest.mark.integration
@pytest.mark.skipif("PINECONE_API_KEY" not in os.environ, reason="PINECONE_API_KEY not set")
def test_serverless_index_creation_from_scratch(sleep_time):
Expand Down Expand Up @@ -257,3 +290,28 @@ def test_embedding_retrieval(self, document_store: PineconeDocumentStore):
assert len(results) == 2
assert results[0].content == "Most similar document"
assert results[1].content == "2nd best document"

def test_sentence_window_retriever(self, document_store: PineconeDocumentStore):
    """
    Index a split document in Pinecone, retrieve one chunk by embedding and pass it to `SentenceWindowRetriever`.

    The chunk at index 2 is written with the same embedding that is later used as the query, so it is expected
    to be the closest match; all other chunks get random embeddings.
    """
    embedding_dim = 768

    # indexing
    poem = (
        "Whose woods these are I think I know. His house is in the village though; He will not see me stopping "
        "here To watch his woods fill up with snow."
    )
    word_splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
    split_output = word_splitter.run(documents=[Document(content=poem)])
    chunks = split_output["documents"]

    for position, chunk in enumerate(chunks):
        if position == 2:
            chunk.embedding = [0.1] * embedding_dim
        else:
            chunk.embedding = np.random.rand(embedding_dim).tolist()
    document_store.write_documents(chunks)

    # query
    pinecone_retriever = PineconeEmbeddingRetriever(document_store=document_store)
    retrieval_output = pinecone_retriever.run(query_embedding=[0.1] * embedding_dim, top_k=1, filters={})
    window_retriever = SentenceWindowRetriever(document_store=document_store, window_size=2)
    top_document = retrieval_output["documents"][0]
    window_output = window_retriever.run(retrieved_documents=[top_document])

    assert len(window_output["context_windows"]) == 1