From e6c8374562625c9a6699a0a86b70f1e07255a0d3 Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Wed, 22 Nov 2023 02:14:02 +0530 Subject: [PATCH] feat: Add `ByteStream` metadata and other metadata to `Documents` created by `HTMLToDocument` (#6304) * Refactor HTMLToDocument * Add release notes * Add additional tests * remove progress bar * Add additional test for metadata * remove progress bar from release notes * Update tests * Use truthiness checks instead of is not None --- .../components/file_converters/html.py | 46 +++++-- ...adata-HTMLToDocument-42dbd074a46c979e.yaml | 4 + .../file_converters/test_html_to_document.py | 127 +++++++++++++++--- 3 files changed, 151 insertions(+), 26 deletions(-) create mode 100644 releasenotes/notes/add-metadata-HTMLToDocument-42dbd074a46c979e.yaml diff --git a/haystack/preview/components/file_converters/html.py b/haystack/preview/components/file_converters/html.py index 3af9b6a795..ba1132ad4b 100644 --- a/haystack/preview/components/file_converters/html.py +++ b/haystack/preview/components/file_converters/html.py @@ -1,6 +1,6 @@ import logging -from typing import List, Union from pathlib import Path +from typing import Any, Dict, List, Optional, Union from haystack.preview import Document, component from haystack.preview.dataclasses import ByteStream @@ -16,6 +16,18 @@ class HTMLToDocument: """ Converts an HTML file to a Document. + + Usage example: + ```python + from haystack.preview.components.file_converters.html import HTMLToDocument + + converter = HTMLToDocument() + results = converter.run(sources=["sample.html"]) + documents = results["documents"] + print(documents[0].content) + # 'This is a text from the HTML file.' 
+ ``` + """ def __init__(self): @@ -25,18 +37,30 @@ def __init__(self): boilerpy3_import.check() @component.output_types(documents=List[Document]) - def run(self, sources: List[Union[str, Path, ByteStream]]): + def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None): """ Converts a list of HTML files to Documents. :param sources: List of HTML file paths or ByteStream objects. + :param meta: Optional list of metadata to attach to the Documents. + The length of the list must match the number of sources. Defaults to `None`. :return: List of converted Documents. """ + documents = [] + + # Create one independent metadata dict per source if none are provided (never share one dict) + if meta: + if len(sources) != len(meta): + raise ValueError("The length of the metadata list must match the number of sources.") + else: + meta = [{} for _ in sources] + extractor = extractors.ArticleExtractor(raise_on_failure=False) - for source in sources: + + for source, metadata in zip(sources, meta): try: - file_content = self._extract_content(source) + file_content, extracted_meta = self._extract_content(source) except Exception as e: logger.warning("Could not read %s. Skipping it. Error: %s", source, e) continue @@ -46,21 +70,25 @@ def run(self, sources: List[Union[str, Path, ByteStream]]): logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e) continue - document = Document(content=text) + # Merge metadata received from ByteStream with supplied metadata + if extracted_meta: + # Supplied metadata overwrites metadata from ByteStream for overlapping keys. 
+ metadata = {**extracted_meta, **metadata} + document = Document(content=text, meta=metadata) documents.append(document) return {"documents": documents} - def _extract_content(self, source: Union[str, Path, ByteStream]) -> str: + def _extract_content(self, source: Union[str, Path, ByteStream]) -> tuple: """ Extracts content from the given data source :param source: The data source to extract content from. - :return: The extracted content. + :return: The extracted content and metadata. """ if isinstance(source, (str, Path)): with open(source) as text_file: - return text_file.read() + return (text_file.read(), None) if isinstance(source, ByteStream): - return source.data.decode("utf-8") + return (source.data.decode("utf-8"), source.metadata) raise ValueError(f"Unsupported source type: {type(source)}") diff --git a/releasenotes/notes/add-metadata-HTMLToDocument-42dbd074a46c979e.yaml b/releasenotes/notes/add-metadata-HTMLToDocument-42dbd074a46c979e.yaml new file mode 100644 index 0000000000..709a9c0f66 --- /dev/null +++ b/releasenotes/notes/add-metadata-HTMLToDocument-42dbd074a46c979e.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Adds support for adding additional metadata and utilizing metadata received from ByteStream sources when creating documents using HTMLToDocument. diff --git a/test/preview/components/file_converters/test_html_to_document.py b/test/preview/components/file_converters/test_html_to_document.py index 6b6d7b43c5..786a26a066 100644 --- a/test/preview/components/file_converters/test_html_to_document.py +++ b/test/preview/components/file_converters/test_html_to_document.py @@ -12,52 +12,145 @@ def test_run(self, preview_samples_path): """ Test if the component runs correctly. 
""" - paths = [preview_samples_path / "html" / "what_is_haystack.html"] + sources = [preview_samples_path / "html" / "what_is_haystack.html"] converter = HTMLToDocument() - output = converter.run(sources=paths) - docs = output["documents"] + results = converter.run(sources=sources) + docs = results["documents"] assert len(docs) == 1 assert "Haystack" in docs[0].content + @pytest.mark.unit + def test_run_doc_metadata(self, preview_samples_path): + """ + Test if the component runs correctly when metadata is supplied by the user. + """ + converter = HTMLToDocument() + sources = [preview_samples_path / "html" / "what_is_haystack.html"] + metadata = [{"file_name": "what_is_haystack.html"}] + results = converter.run(sources=sources, meta=metadata) + docs = results["documents"] + + assert len(docs) == 1 + assert "Haystack" in docs[0].content + assert docs[0].meta == {"file_name": "what_is_haystack.html"} + + @pytest.mark.unit + def test_incorrect_meta(self, preview_samples_path): + """ + Test if the component raises an error when incorrect metadata is supplied by the user. + """ + converter = HTMLToDocument() + sources = [preview_samples_path / "html" / "what_is_haystack.html"] + metadata = [{"file_name": "what_is_haystack.html"}, {"file_name": "haystack.html"}] + with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."): + converter.run(sources=sources, meta=metadata) + + @pytest.mark.unit + def test_run_bytestream_metadata(self, preview_samples_path): + """ + Test if the component runs correctly when metadata is read from the ByteStream object. 
+ """ + converter = HTMLToDocument() + with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as file: + byte_stream = file.read() + stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url"}) + + results = converter.run(sources=[stream]) + docs = results["documents"] + + assert len(docs) == 1 + assert "Haystack" in docs[0].content + assert docs[0].meta == {"content_type": "text/html", "url": "test_url"} + + @pytest.mark.unit + def test_run_bytestream_and_doc_metadata(self, preview_samples_path): + """ + Test if the component runs correctly when metadata is read from the ByteStream object and supplied by the user. + + There is no overlap between the metadata received. + """ + converter = HTMLToDocument() + with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as file: + byte_stream = file.read() + stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url"}) + + metadata = [{"file_name": "what_is_haystack.html"}] + results = converter.run(sources=[stream], meta=metadata) + docs = results["documents"] + + assert len(docs) == 1 + assert "Haystack" in docs[0].content + assert docs[0].meta == {"file_name": "what_is_haystack.html", "content_type": "text/html", "url": "test_url"} + + @pytest.mark.unit + def test_run_bytestream_doc_overlapping_metadata(self, preview_samples_path): + """ + Test if the component runs correctly when metadata is read from the ByteStream object and supplied by the user. + + There is an overlap between the metadata received. + + The component should use the supplied metadata to overwrite the values if there is an overlap between the keys. 
+ """ + converter = HTMLToDocument() + with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as file: + byte_stream = file.read() + # ByteStream has "url" present in metadata + stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url_correct"}) + + # "url" supplied by the user overwrites value present in metadata + metadata = [{"file_name": "what_is_haystack.html", "url": "test_url_new"}] + results = converter.run(sources=[stream], meta=metadata) + docs = results["documents"] + + assert len(docs) == 1 + assert "Haystack" in docs[0].content + assert docs[0].meta == { + "file_name": "what_is_haystack.html", + "content_type": "text/html", + "url": "test_url_new", + } + @pytest.mark.unit def test_run_wrong_file_type(self, preview_samples_path, caplog): """ Test if the component runs correctly when an input file is not of the expected type. """ - paths = [preview_samples_path / "audio" / "answer.wav"] + sources = [preview_samples_path / "audio" / "answer.wav"] converter = HTMLToDocument() with caplog.at_level(logging.WARNING): - output = converter.run(sources=paths) + results = converter.run(sources=sources) assert "codec can't decode byte" in caplog.text - docs = output["documents"] - assert not docs + assert results["documents"] == [] @pytest.mark.unit - def test_run_error_handling(self, preview_samples_path, caplog): + def test_run_error_handling(self, caplog): """ Test if the component correctly handles errors. 
""" - paths = ["non_existing_file.html"] + sources = ["non_existing_file.html"] converter = HTMLToDocument() with caplog.at_level(logging.WARNING): - result = converter.run(sources=paths) + results = converter.run(sources=sources) assert "Could not read non_existing_file.html" in caplog.text - assert not result["documents"] + assert results["documents"] == [] @pytest.mark.unit def test_mixed_sources_run(self, preview_samples_path): """ - Test if the component runs correctly if the input is a mix of paths and ByteStreams + Test if the component runs correctly if the input is a mix of paths and ByteStreams. """ - paths = [preview_samples_path / "html" / "what_is_haystack.html"] + sources = [ + preview_samples_path / "html" / "what_is_haystack.html", + str((preview_samples_path / "html" / "what_is_haystack.html").absolute()), + ] with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as f: byte_stream = f.read() - paths.append(ByteStream(byte_stream)) + sources.append(ByteStream(byte_stream)) converter = HTMLToDocument() - output = converter.run(sources=paths) - docs = output["documents"] - assert len(docs) == 2 + results = converter.run(sources=sources) + docs = results["documents"] + assert len(docs) == 3 for doc in docs: assert "Haystack" in doc.content