
Commit: Merge branch 'main' into add_exact_match_metric
awinml authored Jan 10, 2024
2 parents 6ca9e2a + 374a937 commit 6598b09
Showing 31 changed files with 866 additions and 229 deletions.
110 changes: 110 additions & 0 deletions e2e/pipelines/test_named_entity_extractor.py
@@ -0,0 +1,110 @@
import pytest

from haystack import Document, Pipeline, ComponentError
from haystack.components.extractors import NamedEntityAnnotation, NamedEntityExtractor, NamedEntityExtractorBackend


@pytest.fixture
def raw_texts():
return [
"My name is Clara and I live in Berkeley, California.",
"I'm Merlin, the happy pig!",
"New York State declared a state of emergency after the announcement of the end of the world.",
"", # Intentionally empty.
]


@pytest.fixture
def hf_annotations():
return [
[
NamedEntityAnnotation(entity="PER", start=11, end=16),
NamedEntityAnnotation(entity="LOC", start=31, end=39),
NamedEntityAnnotation(entity="LOC", start=41, end=51),
],
[NamedEntityAnnotation(entity="PER", start=4, end=10)],
[NamedEntityAnnotation(entity="LOC", start=0, end=14)],
[],
]


@pytest.fixture
def spacy_annotations():
return [
[
NamedEntityAnnotation(entity="PERSON", start=11, end=16),
NamedEntityAnnotation(entity="GPE", start=31, end=39),
NamedEntityAnnotation(entity="GPE", start=41, end=51),
],
[NamedEntityAnnotation(entity="PERSON", start=4, end=10)],
[NamedEntityAnnotation(entity="GPE", start=0, end=14)],
[],
]


def test_ner_extractor_init():
extractor = NamedEntityExtractor(
backend=NamedEntityExtractorBackend.HUGGING_FACE, model_name_or_path="dslim/bert-base-NER", device_id=-1
)

with pytest.raises(ComponentError, match=r"not initialized"):
extractor.run(documents=[])

assert not extractor.initialized
extractor.warm_up()
assert extractor.initialized


@pytest.mark.parametrize("batch_size", [1, 3])
def test_ner_extractor_hf_backend(raw_texts, hf_annotations, batch_size):
extractor = NamedEntityExtractor(
backend=NamedEntityExtractorBackend.HUGGING_FACE, model_name_or_path="dslim/bert-base-NER"
)
extractor.warm_up()

_extract_and_check_predictions(extractor, raw_texts, hf_annotations, batch_size)


@pytest.mark.parametrize("batch_size", [1, 3])
def test_ner_extractor_spacy_backend(raw_texts, spacy_annotations, batch_size):
extractor = NamedEntityExtractor(backend=NamedEntityExtractorBackend.SPACY, model_name_or_path="en_core_web_trf")
extractor.warm_up()

_extract_and_check_predictions(extractor, raw_texts, spacy_annotations, batch_size)


@pytest.mark.parametrize("batch_size", [1, 3])
def test_ner_extractor_in_pipeline(raw_texts, hf_annotations, batch_size):
pipeline = Pipeline()
pipeline.add_component(
name="ner_extractor",
instance=NamedEntityExtractor(
backend=NamedEntityExtractorBackend.HUGGING_FACE, model_name_or_path="dslim/bert-base-NER"
),
)

outputs = pipeline.run(
{"ner_extractor": {"documents": [Document(content=text) for text in raw_texts], "batch_size": batch_size}}
)["ner_extractor"]["documents"]
predicted = [NamedEntityExtractor.get_stored_annotations(doc) for doc in outputs]
_check_predictions(predicted, hf_annotations)


def _extract_and_check_predictions(extractor, texts, expected, batch_size):
docs = [Document(content=text) for text in texts]
outputs = extractor.run(documents=docs, batch_size=batch_size)["documents"]
assert all(id(a) == id(b) for a, b in zip(docs, outputs))
predicted = [NamedEntityExtractor.get_stored_annotations(doc) for doc in outputs]

_check_predictions(predicted, expected)


def _check_predictions(predicted, expected):
assert len(predicted) == len(expected)
for pred, exp in zip(predicted, expected):
assert len(pred) == len(exp)

for a, b in zip(pred, exp):
assert a.entity == b.entity
assert a.start == b.start
assert a.end == b.end
43 changes: 43 additions & 0 deletions examples/pipelines/indexing_pipeline_with_meta.py
@@ -0,0 +1,43 @@
from typing import Dict, Any
from pathlib import Path
from datetime import datetime

from haystack import Pipeline
from haystack.components.others import Multiplexer
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter, DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack.document_stores import InMemoryDocumentStore


document_store = InMemoryDocumentStore()

p = Pipeline()
p.add_component(instance=FileTypeRouter(mime_types=["text/plain", "application/pdf"]), name="file_type_router")
p.add_component(instance=Multiplexer(Dict[str, Any]), name="metadata_multiplexer")
p.add_component(instance=TextFileToDocument(), name="text_file_converter")
p.add_component(instance=PyPDFToDocument(), name="pdf_file_converter")
p.add_component(instance=DocumentJoiner(), name="joiner")
p.add_component(instance=DocumentCleaner(), name="cleaner")
p.add_component(instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter")
p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")

p.connect("file_type_router.text/plain", "text_file_converter.sources")
p.connect("file_type_router.application/pdf", "pdf_file_converter.sources")
p.connect("metadata_multiplexer", "text_file_converter.meta")
p.connect("metadata_multiplexer", "pdf_file_converter.meta")
p.connect("text_file_converter.documents", "joiner.documents")
p.connect("pdf_file_converter.documents", "joiner.documents")
p.connect("joiner.documents", "cleaner.documents")
p.connect("cleaner.documents", "splitter.documents")
p.connect("splitter.documents", "writer.documents")

result = p.run(
{
"file_type_router": {"sources": list(Path(".").iterdir())},
"metadata_multiplexer": {"value": {"date_added": datetime.now().isoformat()}},
}
)

assert all("date_added" in doc.meta for doc in document_store.filter_documents())
19 changes: 9 additions & 10 deletions haystack/components/converters/azure.py
@@ -6,7 +6,7 @@
from haystack.lazy_imports import LazyImport
from haystack import component, Document, default_to_dict
from haystack.dataclasses import ByteStream
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata

logger = logging.getLogger(__name__)

@@ -31,7 +31,7 @@ class AzureOCRDocumentConverter:
from haystack.components.converters.azure import AzureOCRDocumentConverter
converter = AzureOCRDocumentConverter()
-    results = converter.run(sources=["image-based-document.pdf"])
+    results = converter.run(sources=["image-based-document.pdf"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the PDF file.'
@@ -76,20 +76,19 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
the raw responses from Azure's Document Intelligence service.
:param sources: List of file paths or ByteStream objects.
-        :param meta: Optional list of metadata to attach to the Documents.
-            The length of the list must match the number of sources. Defaults to `None`.
+        :param meta: Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key
and the raw Azure response under the 'raw_azure_response' key.
"""
documents = []
azure_output = []
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

-        if meta is None:
-            meta = [{}] * len(sources)
-        elif len(sources) != len(meta):
-            raise ValueError("The length of the metadata list must match the number of sources.")

-        for source, metadata in zip(sources, meta):
+        for source, metadata in zip(sources, meta_list):
try:
bytestream = get_bytestream_from_source(source=source)
except Exception as e:
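Note: the converter hunks in this commit replace ad-hoc `meta` checks with the new `normalize_metadata` helper. Its implementation is not part of this diff; the following is a minimal sketch consistent with the contract documented in the docstrings above, not the actual code in `haystack.components.converters.utils`, which may differ in details.

from typing import Any, Dict, List, Optional, Union


def normalize_metadata(
    meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int
) -> List[Dict[str, Any]]:
    # Minimal sketch of the documented contract; the real helper in
    # haystack.components.converters.utils may differ in details.
    if meta is None:
        return [{}] * sources_count  # no metadata: one empty dict per source
    if isinstance(meta, dict):
        return [meta] * sources_count  # single dict: applied to every Document
    if len(meta) != sources_count:
        raise ValueError("The length of the metadata list must match the number of sources.")
    return meta  # list: zipped 1:1 with the sources

Centralizing this logic means every converter documents and enforces the same broadcast-vs-zip rule instead of duplicating the length check.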
6 changes: 5 additions & 1 deletion haystack/components/converters/html.py
@@ -50,7 +50,11 @@
self.extractor_type = extractor_type

@component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
"""
Converts a list of HTML files to Documents.
25 changes: 14 additions & 11 deletions haystack/components/converters/markdown.py
@@ -7,7 +7,7 @@
from haystack import Document, component
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata

with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:
from markdown_it import MarkdownIt
@@ -27,7 +27,7 @@ class MarkdownToDocument:
from haystack.components.converters.markdown import MarkdownToDocument
converter = MarkdownToDocument()
-    results = converter.run(sources=["sample.md"])
+    results = converter.run(sources=["sample.md"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the markdown file.'
@@ -45,28 +45,31 @@ def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True
self.progress_bar = progress_bar

@component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
"""
Reads text from a markdown file and executes optional preprocessing steps.
:param sources: A list of markdown data sources (file paths or binary objects)
-        :param meta: Optional list of metadata to attach to the Documents.
-            The length of the list must match the number of paths. Defaults to `None`.
+        :param meta: Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
parser = MarkdownIt(renderer_cls=RendererPlain)
if self.table_to_single_line:
parser.enable("table")

documents = []

-        if meta is None:
-            meta = [{}] * len(sources)
-        elif len(sources) != len(meta):
-            raise ValueError("The length of the metadata list must match the number of sources.")
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

for source, metadata in tqdm(
-            zip(sources, meta),
+            zip(sources, meta_list),
total=len(sources),
desc="Converting markdown files to Documents",
disable=not self.progress_bar,
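As a quick illustration of the new contract, a single dict is now broadcast to every produced Document, while a list is still zipped with the sources. A hedged usage sketch (sample.md and notes.md are placeholder file names):

from datetime import datetime

from haystack.components.converters.markdown import MarkdownToDocument

converter = MarkdownToDocument()  # requires markdown-it-py and mdit_plain to be installed
# One dict is applied to all Documents; a list of dicts would be zipped 1:1 with the sources.
results = converter.run(sources=["sample.md", "notes.md"], meta={"date_added": datetime.now().isoformat()})
assert all("date_added" in doc.meta for doc in results["documents"])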
6 changes: 5 additions & 1 deletion haystack/components/converters/pypdf.py
@@ -82,7 +82,11 @@ def to_dict(self):
return default_to_dict(self, converter_name=self.converter_name)

@component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
"""
Converts a list of PDF sources into Document objects using the configured converter.
7 changes: 7 additions & 0 deletions haystack/components/extractors/__init__.py
@@ -0,0 +1,7 @@
from haystack.components.extractors.named_entity_extractor import (
NamedEntityAnnotation,
NamedEntityExtractor,
NamedEntityExtractorBackend,
)

__all__ = ["NamedEntityExtractor", "NamedEntityExtractorBackend", "NamedEntityAnnotation"]
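Putting the newly exported API together with the e2e test above, a minimal standalone usage looks roughly like this. A sketch based on the test file; it assumes the dslim/bert-base-NER model can be downloaded at warm-up time.

from haystack import Document
from haystack.components.extractors import NamedEntityExtractor, NamedEntityExtractorBackend

extractor = NamedEntityExtractor(
    backend=NamedEntityExtractorBackend.HUGGING_FACE, model_name_or_path="dslim/bert-base-NER"
)
extractor.warm_up()  # required before run(); an uninitialized extractor raises ComponentError

docs = extractor.run(documents=[Document(content="My name is Clara and I live in Berkeley, California.")])["documents"]
for annotation in NamedEntityExtractor.get_stored_annotations(docs[0]):
    print(annotation.entity, annotation.start, annotation.end)  # e.g. PER 11 16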