
Commit: Merge branch 'main' into add_exact_match_metric
awinml authored Jan 10, 2024
2 parents 6ca9e2a + 374a937 commit 6598b09
Showing 31 changed files with 866 additions and 229 deletions.
110 changes: 110 additions & 0 deletions e2e/pipelines/test_named_entity_extractor.py
@@ -0,0 +1,110 @@
import pytest

from haystack import Document, Pipeline, ComponentError
from haystack.components.extractors import NamedEntityAnnotation, NamedEntityExtractor, NamedEntityExtractorBackend


@pytest.fixture
def raw_texts():
return [
"My name is Clara and I live in Berkeley, California.",
"I'm Merlin, the happy pig!",
"New York State declared a state of emergency after the announcement of the end of the world.",
"", # Intentionally empty.
]


@pytest.fixture
def hf_annotations():
return [
[
NamedEntityAnnotation(entity="PER", start=11, end=16),
NamedEntityAnnotation(entity="LOC", start=31, end=39),
NamedEntityAnnotation(entity="LOC", start=41, end=51),
],
[NamedEntityAnnotation(entity="PER", start=4, end=10)],
[NamedEntityAnnotation(entity="LOC", start=0, end=14)],
[],
]


@pytest.fixture
def spacy_annotations():
return [
[
NamedEntityAnnotation(entity="PERSON", start=11, end=16),
NamedEntityAnnotation(entity="GPE", start=31, end=39),
NamedEntityAnnotation(entity="GPE", start=41, end=51),
],
[NamedEntityAnnotation(entity="PERSON", start=4, end=10)],
[NamedEntityAnnotation(entity="GPE", start=0, end=14)],
[],
]


def test_ner_extractor_init():
extractor = NamedEntityExtractor(
backend=NamedEntityExtractorBackend.HUGGING_FACE, model_name_or_path="dslim/bert-base-NER", device_id=-1
)

with pytest.raises(ComponentError, match=r"not initialized"):
extractor.run(documents=[])

assert not extractor.initialized
extractor.warm_up()
assert extractor.initialized


@pytest.mark.parametrize("batch_size", [1, 3])
def test_ner_extractor_hf_backend(raw_texts, hf_annotations, batch_size):
extractor = NamedEntityExtractor(
backend=NamedEntityExtractorBackend.HUGGING_FACE, model_name_or_path="dslim/bert-base-NER"
)
extractor.warm_up()

_extract_and_check_predictions(extractor, raw_texts, hf_annotations, batch_size)


@pytest.mark.parametrize("batch_size", [1, 3])
def test_ner_extractor_spacy_backend(raw_texts, spacy_annotations, batch_size):
extractor = NamedEntityExtractor(backend=NamedEntityExtractorBackend.SPACY, model_name_or_path="en_core_web_trf")
extractor.warm_up()

_extract_and_check_predictions(extractor, raw_texts, spacy_annotations, batch_size)


@pytest.mark.parametrize("batch_size", [1, 3])
def test_ner_extractor_in_pipeline(raw_texts, hf_annotations, batch_size):
pipeline = Pipeline()
pipeline.add_component(
name="ner_extractor",
instance=NamedEntityExtractor(
backend=NamedEntityExtractorBackend.HUGGING_FACE, model_name_or_path="dslim/bert-base-NER"
),
)

outputs = pipeline.run(
{"ner_extractor": {"documents": [Document(content=text) for text in raw_texts], "batch_size": batch_size}}
)["ner_extractor"]["documents"]
predicted = [NamedEntityExtractor.get_stored_annotations(doc) for doc in outputs]
_check_predictions(predicted, hf_annotations)


def _extract_and_check_predictions(extractor, texts, expected, batch_size):
docs = [Document(content=text) for text in texts]
outputs = extractor.run(documents=docs, batch_size=batch_size)["documents"]
assert all(id(a) == id(b) for a, b in zip(docs, outputs))
predicted = [NamedEntityExtractor.get_stored_annotations(doc) for doc in outputs]

_check_predictions(predicted, expected)


def _check_predictions(predicted, expected):
assert len(predicted) == len(expected)
for pred, exp in zip(predicted, expected):
assert len(pred) == len(exp)

for a, b in zip(pred, exp):
assert a.entity == b.entity
assert a.start == b.start
assert a.end == b.end
43 changes: 43 additions & 0 deletions examples/pipelines/indexing_pipeline_with_meta.py
@@ -0,0 +1,43 @@
from typing import Dict, Any
from pathlib import Path
from datetime import datetime

from haystack import Pipeline
from haystack.components.others import Multiplexer
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter, DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack.document_stores import InMemoryDocumentStore


document_store = InMemoryDocumentStore()

p = Pipeline()
p.add_component(instance=FileTypeRouter(mime_types=["text/plain", "application/pdf"]), name="file_type_router")
p.add_component(instance=Multiplexer(Dict[str, Any]), name="metadata_multiplexer")
p.add_component(instance=TextFileToDocument(), name="text_file_converter")
p.add_component(instance=PyPDFToDocument(), name="pdf_file_converter")
p.add_component(instance=DocumentJoiner(), name="joiner")
p.add_component(instance=DocumentCleaner(), name="cleaner")
p.add_component(instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter")
p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")

p.connect("file_type_router.text/plain", "text_file_converter.sources")
p.connect("file_type_router.application/pdf", "pdf_file_converter.sources")
p.connect("metadata_multiplexer", "text_file_converter.meta")
p.connect("metadata_multiplexer", "pdf_file_converter.meta")
p.connect("text_file_converter.documents", "joiner.documents")
p.connect("pdf_file_converter.documents", "joiner.documents")
p.connect("joiner.documents", "cleaner.documents")
p.connect("cleaner.documents", "splitter.documents")
p.connect("splitter.documents", "writer.documents")

result = p.run(
{
"file_type_router": {"sources": list(Path(".").iterdir())},
"metadata_multiplexer": {"value": {"date_added": datetime.now().isoformat()}},
}
)

assert all("date_added" in doc.meta for doc in document_store.filter_documents())
19 changes: 9 additions & 10 deletions haystack/components/converters/azure.py
@@ -6,7 +6,7 @@
from haystack.lazy_imports import LazyImport
from haystack import component, Document, default_to_dict
from haystack.dataclasses import ByteStream
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata

logger = logging.getLogger(__name__)

@@ -31,7 +31,7 @@ class AzureOCRDocumentConverter:
from haystack.components.converters.azure import AzureOCRDocumentConverter
converter = AzureOCRDocumentConverter()
-    results = converter.run(sources=["image-based-document.pdf"])
+    results = converter.run(sources=["image-based-document.pdf"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the PDF file.'
@@ -76,20 +76,19 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
the raw responses from Azure's Document Intelligence service.
:param sources: List of file paths or ByteStream objects.
-        :param meta: Optional list of metadata to attach to the Documents.
-            The length of the list must match the number of sources. Defaults to `None`.
+        :param meta: Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key
and the raw Azure response under the 'raw_azure_response' key.
"""
documents = []
azure_output = []
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

-        if meta is None:
-            meta = [{}] * len(sources)
-        elif len(sources) != len(meta):
-            raise ValueError("The length of the metadata list must match the number of sources.")

-        for source, metadata in zip(sources, meta):
+        for source, metadata in zip(sources, meta_list):
try:
bytestream = get_bytestream_from_source(source=source)
except Exception as e:
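Note: the converter hunks in this commit replace ad-hoc `meta` checks with the new `normalize_metadata` helper. Its implementation is not part of this diff; the following is a minimal sketch consistent with the contract documented in the docstrings above, not the actual code in `haystack.components.converters.utils`, which may differ in details.

from typing import Any, Dict, List, Optional, Union


def normalize_metadata(
    meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int
) -> List[Dict[str, Any]]:
    # Minimal sketch of the documented contract; the real helper in
    # haystack.components.converters.utils may differ in details.
    if meta is None:
        return [{}] * sources_count  # no metadata: one empty dict per source
    if isinstance(meta, dict):
        return [meta] * sources_count  # single dict: applied to every Document
    if len(meta) != sources_count:
        raise ValueError("The length of the metadata list must match the number of sources.")
    return meta  # list: zipped 1:1 with the sources

Centralizing this logic means every converter documents and enforces the same broadcast-vs-zip rule instead of duplicating the length check.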
6 changes: 5 additions & 1 deletion haystack/components/converters/html.py
@@ -50,7 +50,11 @@
self.extractor_type = extractor_type

@component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
"""
Converts a list of HTML files to Documents.
25 changes: 14 additions & 11 deletions haystack/components/converters/markdown.py
@@ -7,7 +7,7 @@
from haystack import Document, component
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata

with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:
from markdown_it import MarkdownIt
@@ -27,7 +27,7 @@ class MarkdownToDocument:
from haystack.components.converters.markdown import MarkdownToDocument
converter = MarkdownToDocument()
-    results = converter.run(sources=["sample.md"])
+    results = converter.run(sources=["sample.md"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the markdown file.'
@@ -45,28 +45,31 @@ def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True
self.progress_bar = progress_bar

@component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
"""
Reads text from a markdown file and executes optional preprocessing steps.
:param sources: A list of markdown data sources (file paths or binary objects)
-        :param meta: Optional list of metadata to attach to the Documents.
-            The length of the list must match the number of paths. Defaults to `None`.
+        :param meta: Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
parser = MarkdownIt(renderer_cls=RendererPlain)
if self.table_to_single_line:
parser.enable("table")

documents = []

-        if meta is None:
-            meta = [{}] * len(sources)
-        elif len(sources) != len(meta):
-            raise ValueError("The length of the metadata list must match the number of sources.")
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

for source, metadata in tqdm(
-            zip(sources, meta),
+            zip(sources, meta_list),
total=len(sources),
desc="Converting markdown files to Documents",
disable=not self.progress_bar,
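As a quick illustration of the new contract, a single dict is now broadcast to every produced Document, while a list is still zipped with the sources. A hedged usage sketch (sample.md and notes.md are placeholder file names):

from datetime import datetime

from haystack.components.converters.markdown import MarkdownToDocument

converter = MarkdownToDocument()  # requires markdown-it-py and mdit_plain to be installed
# One dict is applied to all Documents; a list of dicts would be zipped 1:1 with the sources.
results = converter.run(sources=["sample.md", "notes.md"], meta={"date_added": datetime.now().isoformat()})
assert all("date_added" in doc.meta for doc in results["documents"])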
6 changes: 5 additions & 1 deletion haystack/components/converters/pypdf.py
@@ -82,7 +82,11 @@ def to_dict(self):
return default_to_dict(self, converter_name=self.converter_name)

@component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
"""
Converts a list of PDF sources into Document objects using the configured converter.
7 changes: 7 additions & 0 deletions haystack/components/extractors/__init__.py
@@ -0,0 +1,7 @@
from haystack.components.extractors.named_entity_extractor import (
NamedEntityAnnotation,
NamedEntityExtractor,
NamedEntityExtractorBackend,
)

__all__ = ["NamedEntityExtractor", "NamedEntityExtractorBackend", "NamedEntityAnnotation"]
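Putting the newly exported API together with the e2e test above, a minimal standalone usage looks roughly like this. A sketch based on the test file; it assumes the dslim/bert-base-NER model can be downloaded at warm-up time.

from haystack import Document
from haystack.components.extractors import NamedEntityExtractor, NamedEntityExtractorBackend

extractor = NamedEntityExtractor(
    backend=NamedEntityExtractorBackend.HUGGING_FACE, model_name_or_path="dslim/bert-base-NER"
)
extractor.warm_up()  # required before run(); an uninitialized extractor raises ComponentError

docs = extractor.run(documents=[Document(content="My name is Clara and I live in Berkeley, California.")])["documents"]
for annotation in NamedEntityExtractor.get_stored_annotations(docs[0]):
    print(annotation.entity, annotation.start, annotation.end)  # e.g. PER 11 16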