diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py
index 1fd3adf40..27eba6ecf 100644
--- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py
+++ b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py
@@ -26,6 +26,7 @@
 
 
 DEFAULT_STARTER_PLAN_SPEC = {"serverless": {"region": "us-east-1", "cloud": "aws"}}
+METADATA_SUPPORTED_TYPES = str, int, bool, float  # List[str] is supported and checked separately
 
 
 class PineconeDocumentStore:
@@ -295,6 +296,44 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
 
         return documents
 
+    @staticmethod
+    def _discard_invalid_meta(document: Document) -> Document:
+        """
+        Remove metadata fields with unsupported types from the document.
+
+        Values that are str, int, bool, float, or a list containing only str are kept; any other
+        field is dropped and a warning listing the dropped keys is logged.
+
+        :param document: The document to clean. Its `meta` is replaced in place.
+        :returns: The same document instance, with only supported metadata fields left.
+        """
+
+        def valid_type(value: Any) -> bool:
+            return isinstance(value, METADATA_SUPPORTED_TYPES) or (
+                isinstance(value, list) and all(isinstance(i, str) for i in value)
+            )
+
+        if document.meta:
+            discarded_keys = []
+            new_meta = {}
+            for key, value in document.meta.items():
+                if not valid_type(value):
+                    discarded_keys.append(key)
+                else:
+                    new_meta[key] = value
+
+            if discarded_keys:
+                msg = (
+                    f"Document {document.id} has metadata fields with unsupported types: {discarded_keys}. "
+                    "Only str, int, bool, float, and List[str] are supported. "
+                    "The values of these fields will be discarded."
+                )
+                logger.warning(msg)
+
+            document.meta = new_meta
+
+        return document
+
     def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]:
         documents_for_pinecone = []
         for document in documents:
@@ -305,6 +344,10 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li
                     "A dummy embedding will be used, but this can affect the search results. "
                 )
                 embedding = self._dummy_vector
+
+            if document.meta:
+                self._discard_invalid_meta(document)
+
             doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)}
 
             # we save content/dataframe as metadata
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
index 90ce2ccff..bd443b4a8 100644
--- a/integrations/pinecone/tests/test_document_store.py
+++ b/integrations/pinecone/tests/test_document_store.py
@@ -142,6 +142,42 @@ def test_convert_dict_spec_to_pinecone_object_fail():
         PineconeDocumentStore._convert_dict_spec_to_pinecone_object(dict_spec)
 
 
+def test_discard_invalid_meta_invalid():
+    invalid_metadata_doc = Document(
+        content="The moonlight shimmered ",
+        meta={
+            "source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0",
+            "page_number": 1,
+            "split_id": 0,
+            "split_idx_start": 0,
+            "_split_overlap": [
+                {"doc_id": "68ed48ba830048c5d7815874ed2de794722e6d10866b6c55349a914fd9a0df65", "range": (0, 20)}
+            ],
+        },
+    )
+    pinecone_doc = PineconeDocumentStore._discard_invalid_meta(invalid_metadata_doc)
+
+    assert pinecone_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0"
+    assert pinecone_doc.meta["page_number"] == 1
+    assert pinecone_doc.meta["split_id"] == 0
+    assert pinecone_doc.meta["split_idx_start"] == 0
+    assert "_split_overlap" not in pinecone_doc.meta
+
+
+def test_discard_invalid_meta_valid():
+    valid_metadata_doc = Document(
+        content="The moonlight shimmered ",
+        meta={
+            "source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0",
+            "page_number": 1,
+        },
+    )
+    pinecone_doc = PineconeDocumentStore._discard_invalid_meta(valid_metadata_doc)
+
+    assert pinecone_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0"
+    assert pinecone_doc.meta["page_number"] == 1
+
+
 @pytest.mark.integration
 @pytest.mark.skipif("PINECONE_API_KEY" not in os.environ, reason="PINECONE_API_KEY not set")
 def test_serverless_index_creation_from_scratch(sleep_time):