From 05c86877b17a90ffbcba5e6cf7f2929439c954d8 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 7 Jan 2024 19:08:52 +0100 Subject: [PATCH 1/7] move DocumentJoiner to new joiners package --- e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py | 2 +- e2e/pipelines/test_hybrid_doc_search_pipeline.py | 2 +- haystack/components/joiners/__init__.py | 3 +++ haystack/components/{routers => joiners}/document_joiner.py | 0 haystack/components/routers/__init__.py | 3 +-- test/components/joiners/__init__.py | 0 test/components/{routers => joiners}/test_document_joiner.py | 2 +- 7 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 haystack/components/joiners/__init__.py rename haystack/components/{routers => joiners}/document_joiner.py (100%) create mode 100644 test/components/joiners/__init__.py rename test/components/{routers => joiners}/test_document_joiner.py (98%) diff --git a/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py b/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py index f7b4455e65..ed1f0b1e96 100644 --- a/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py +++ b/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py @@ -2,7 +2,7 @@ from haystack.components.embedders import SentenceTransformersTextEmbedder from haystack.components.rankers import TransformersSimilarityRanker from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever -from haystack.components.routers.document_joiner import DocumentJoiner +from haystack.components.joiners.document_joiner import DocumentJoiner from haystack.document_stores import InMemoryDocumentStore from haystack.evaluation.eval import eval diff --git a/e2e/pipelines/test_hybrid_doc_search_pipeline.py b/e2e/pipelines/test_hybrid_doc_search_pipeline.py index a9ead31b1d..fc6f6070e3 100644 --- a/e2e/pipelines/test_hybrid_doc_search_pipeline.py +++ b/e2e/pipelines/test_hybrid_doc_search_pipeline.py @@ -3,7 +3,7 @@ from haystack import Pipeline, Document from 
haystack.components.embedders import SentenceTransformersTextEmbedder from haystack.components.rankers import TransformersSimilarityRanker -from haystack.components.routers.document_joiner import DocumentJoiner +from haystack.components.joiners.document_joiner import DocumentJoiner from haystack.document_stores import InMemoryDocumentStore from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever diff --git a/haystack/components/joiners/__init__.py b/haystack/components/joiners/__init__.py new file mode 100644 index 0000000000..2ce5519363 --- /dev/null +++ b/haystack/components/joiners/__init__.py @@ -0,0 +1,3 @@ +from haystack.components.joiners.document_joiner import DocumentJoiner + +__all__ = ["DocumentJoiner"] diff --git a/haystack/components/routers/document_joiner.py b/haystack/components/joiners/document_joiner.py similarity index 100% rename from haystack/components/routers/document_joiner.py rename to haystack/components/joiners/document_joiner.py diff --git a/haystack/components/routers/__init__.py b/haystack/components/routers/__init__.py index 65f8b9cb87..3eaeff616c 100644 --- a/haystack/components/routers/__init__.py +++ b/haystack/components/routers/__init__.py @@ -1,7 +1,6 @@ -from haystack.components.routers.document_joiner import DocumentJoiner from haystack.components.routers.file_type_router import FileTypeRouter from haystack.components.routers.metadata_router import MetadataRouter from haystack.components.routers.conditional_router import ConditionalRouter from haystack.components.routers.text_language_router import TextLanguageRouter -__all__ = ["DocumentJoiner", "FileTypeRouter", "MetadataRouter", "TextLanguageRouter", "ConditionalRouter"] +__all__ = ["FileTypeRouter", "MetadataRouter", "TextLanguageRouter", "ConditionalRouter"] diff --git a/test/components/joiners/__init__.py b/test/components/joiners/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/test/components/routers/test_document_joiner.py b/test/components/joiners/test_document_joiner.py similarity index 98% rename from test/components/routers/test_document_joiner.py rename to test/components/joiners/test_document_joiner.py index 3362c1c15d..af80ccc2f8 100644 --- a/test/components/routers/test_document_joiner.py +++ b/test/components/joiners/test_document_joiner.py @@ -3,7 +3,7 @@ import pytest from haystack import Document -from haystack.components.routers.document_joiner import DocumentJoiner +from haystack.components.joiners.document_joiner import DocumentJoiner class TestDocumentJoiner: From d2e258d05a5d4ab4403cbef94421e139752757e4 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 7 Jan 2024 19:18:18 +0100 Subject: [PATCH 2/7] relnote --- ...ove-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 releasenotes/notes/move-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml diff --git a/releasenotes/notes/move-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml b/releasenotes/notes/move-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml new file mode 100644 index 0000000000..59948c915f --- /dev/null +++ b/releasenotes/notes/move-documentjoiner-to-joiners-7fe188d18d65ffcd.yaml @@ -0,0 +1,10 @@ +--- +upgrade: + - | + Change any occurrence of: + from haystack.components.routers.document_joiner import DocumentJoiner + to: + from haystack.components.joiners.document_joiner import DocumentJoiner +enhancements: + - | + Create a new package called `joiners` and move `DocumentJoiner` there for clarity. 
From f9d6292ce139d8e4d61a6991b3a054d3276165d5 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 8 Jan 2024 08:41:15 +0100 Subject: [PATCH 3/7] leftovers --- e2e/pipelines/test_dense_doc_search.py | 3 ++- e2e/pipelines/test_eval_dense_doc_search.py | 3 ++- examples/pipelines/indexing_pipeline.py | 3 ++- haystack/pipeline_utils/indexing.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/e2e/pipelines/test_dense_doc_search.py b/e2e/pipelines/test_dense_doc_search.py index 02f77cd04a..80aec5d277 100644 --- a/e2e/pipelines/test_dense_doc_search.py +++ b/e2e/pipelines/test_dense_doc_search.py @@ -4,7 +4,8 @@ from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder from haystack.components.converters import PyPDFToDocument, TextFileToDocument from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter -from haystack.components.routers import FileTypeRouter, DocumentJoiner +from haystack.components.routers import FileTypeRouter +from haystack.components.joiners import DocumentJoiner from haystack.components.writers import DocumentWriter from haystack.document_stores import InMemoryDocumentStore from haystack.components.retrievers import InMemoryEmbeddingRetriever diff --git a/e2e/pipelines/test_eval_dense_doc_search.py b/e2e/pipelines/test_eval_dense_doc_search.py index b17f052af8..c4320c1030 100644 --- a/e2e/pipelines/test_eval_dense_doc_search.py +++ b/e2e/pipelines/test_eval_dense_doc_search.py @@ -3,7 +3,8 @@ from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter from haystack.components.retrievers import InMemoryEmbeddingRetriever -from haystack.components.routers import DocumentJoiner, FileTypeRouter +from haystack.components.routers import FileTypeRouter +from haystack.components.joiners import DocumentJoiner from 
haystack.components.writers import DocumentWriter from haystack.dataclasses import Document from haystack.document_stores import InMemoryDocumentStore diff --git a/examples/pipelines/indexing_pipeline.py b/examples/pipelines/indexing_pipeline.py index 8c9217eb7e..ba61d02702 100644 --- a/examples/pipelines/indexing_pipeline.py +++ b/examples/pipelines/indexing_pipeline.py @@ -4,7 +4,8 @@ from haystack.components.embedders import SentenceTransformersDocumentEmbedder from haystack.components.converters import PyPDFToDocument, TextFileToDocument from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter -from haystack.components.routers import FileTypeRouter, DocumentJoiner +from haystack.components.routers import FileTypeRouter +from haystack.components.joiners import DocumentJoiner from haystack.components.writers import DocumentWriter from haystack.document_stores import InMemoryDocumentStore diff --git a/haystack/pipeline_utils/indexing.py b/haystack/pipeline_utils/indexing.py index 0dd188ebc4..f422105027 100644 --- a/haystack/pipeline_utils/indexing.py +++ b/haystack/pipeline_utils/indexing.py @@ -11,7 +11,8 @@ from haystack.components.embedders import SentenceTransformersDocumentEmbedder, OpenAIDocumentEmbedder from haystack.components.fetchers import LinkContentFetcher from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter -from haystack.components.routers import FileTypeRouter, DocumentJoiner +from haystack.components.routers import FileTypeRouter +from haystack.components.joiners import DocumentJoiner from haystack.components.writers import DocumentWriter from haystack.document_stores.protocol import DocumentStore From 370b79c6cd35985011db9d45b5a0bdcddd6f194d Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 8 Jan 2024 18:09:19 +0100 Subject: [PATCH 4/7] fix docstrings generation --- docs/pydoc/config/joiner.yml | 26 ++++++++++++++++++++++++++ docs/pydoc/config/router.yml | 2 +- 2 files changed, 27 
insertions(+), 1 deletion(-) create mode 100644 docs/pydoc/config/joiner.yml diff --git a/docs/pydoc/config/joiner.yml b/docs/pydoc/config/joiner.yml new file mode 100644 index 0000000000..34a9141a63 --- /dev/null +++ b/docs/pydoc/config/joiner.yml @@ -0,0 +1,26 @@ +loaders: + - type: loaders.CustomPythonLoader + search_path: [../../../haystack/components/joiners] + modules: ["document_joiner"] + ignore_when_discovered: ["__init__"] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: renderers.ReadmePreviewRenderer + excerpt: Components that join lists of different objects. + category_slug: haystack-classes + title: Joiner API + slug: joiner-api + order: 140 + markdown: + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: joiner_api.md diff --git a/docs/pydoc/config/router.yml b/docs/pydoc/config/router.yml index e1cc0ab18e..bac0d1cae9 100644 --- a/docs/pydoc/config/router.yml +++ b/docs/pydoc/config/router.yml @@ -1,7 +1,7 @@ loaders: - type: loaders.CustomPythonLoader search_path: [../../../haystack/components/routers] - modules: ["document_joiner", "conditional_router", "file_type_router", "metadata_router", "text_language_router"] + modules: ["conditional_router", "file_type_router", "metadata_router", "text_language_router"] ignore_when_discovered: ["__init__"] processors: - type: filter From b19570e11b11ef88430869f0c9504da267ee0f14 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 8 Jan 2024 19:03:24 +0100 Subject: [PATCH 5/7] fix unrelated pydoc misconfiguration --- docs/pydoc/config/caching.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pydoc/config/caching.yml b/docs/pydoc/config/caching.yml index 88c7d62865..92898fb2c2 100644 --- a/docs/pydoc/config/caching.yml +++ 
b/docs/pydoc/config/caching.yml @@ -1,7 +1,7 @@ loaders: - type: loaders.CustomPythonLoader search_path: [../../../haystack/components/caching] - modules: ["url_cache_checker"] + modules: ["cache_checker"] ignore_when_discovered: ["__init__"] processors: - type: filter From 704d1d627dc3bfb0ba9f3c0572381d40fbbe141e Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 8 Jan 2024 19:29:00 +0100 Subject: [PATCH 6/7] more unrelated work, yay! --- test/components/embedders/test_openai_document_embedder.py | 2 +- test/components/embedders/test_openai_text_embedder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/components/embedders/test_openai_document_embedder.py b/test/components/embedders/test_openai_document_embedder.py index cb0632aa6a..22b921c0e3 100644 --- a/test/components/embedders/test_openai_document_embedder.py +++ b/test/components/embedders/test_openai_document_embedder.py @@ -170,7 +170,7 @@ def test_run(self): Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), ] - model = "text-similarity-ada-001" + model = "text-embedding-ada-002" embedder = OpenAIDocumentEmbedder(model_name=model, meta_fields_to_embed=["topic"], embedding_separator=" | ") diff --git a/test/components/embedders/test_openai_text_embedder.py b/test/components/embedders/test_openai_text_embedder.py index 5ad91c55a2..0f05b35e15 100644 --- a/test/components/embedders/test_openai_text_embedder.py +++ b/test/components/embedders/test_openai_text_embedder.py @@ -76,7 +76,7 @@ def test_run_wrong_input_format(self): @pytest.mark.integration def test_run(self): - model = "text-similarity-ada-001" + model = "text-embedding-ada-002" embedder = OpenAITextEmbedder(model_name=model, prefix="prefix ", suffix=" suffix") result = embedder.run(text="The food was delicious") From a7c738c52865e69ee4d81744d114e02e651bf209 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 8 Jan 2024 21:42:37 +0100 Subject: [PATCH 7/7] fix 
assertions --- test/components/embedders/test_openai_document_embedder.py | 4 ++-- test/components/embedders/test_openai_text_embedder.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/components/embedders/test_openai_document_embedder.py b/test/components/embedders/test_openai_document_embedder.py index 22b921c0e3..93a9cf3a70 100644 --- a/test/components/embedders/test_openai_document_embedder.py +++ b/test/components/embedders/test_openai_document_embedder.py @@ -183,6 +183,6 @@ def test_run(self): for doc in documents_with_embeddings: assert isinstance(doc, Document) assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 1024 + assert len(doc.embedding) == 1536 assert all(isinstance(x, float) for x in doc.embedding) - assert metadata == {"model": "text-similarity-ada:001", "usage": {"prompt_tokens": 15, "total_tokens": 15}} + assert metadata == {"model": "text-embedding-ada-002-v2", "usage": {"prompt_tokens": 15, "total_tokens": 15}} diff --git a/test/components/embedders/test_openai_text_embedder.py b/test/components/embedders/test_openai_text_embedder.py index 0f05b35e15..2f0b7e1a64 100644 --- a/test/components/embedders/test_openai_text_embedder.py +++ b/test/components/embedders/test_openai_text_embedder.py @@ -81,6 +81,9 @@ def test_run(self): embedder = OpenAITextEmbedder(model_name=model, prefix="prefix ", suffix=" suffix") result = embedder.run(text="The food was delicious") - assert len(result["embedding"]) == 1024 + assert len(result["embedding"]) == 1536 assert all(isinstance(x, float) for x in result["embedding"]) - assert result["meta"] == {"model": "text-similarity-ada:001", "usage": {"prompt_tokens": 6, "total_tokens": 6}} + assert result["meta"] == { + "model": "text-embedding-ada-002-v2", + "usage": {"prompt_tokens": 6, "total_tokens": 6}, + }