feat: Add ByteStream metadata and other metadata to Documents created by `HTMLToDocument` (#6304)

* Refactor HTMLToDocument
* Add release notes
* Add additional tests
* remove progress bar
* Add additional test for metadata
* remove progress bar from release notes
* Update tests
* Use truthiness checks instead of `is not None`
deepset-ai · Nov 21, 2023 · e6c8374 · e6c8374
1 parent 76165d0
commit e6c8374
Show file tree

Hide file tree

Showing 3 changed files with 151 additions and 26 deletions.
diff --git a/haystack/preview/components/file_converters/html.py b/haystack/preview/components/file_converters/html.py
@@ -1,6 +1,6 @@
 import logging
-from typing import List, Union
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
 
 from haystack.preview import Document, component
 from haystack.preview.dataclasses import ByteStream
@@ -16,6 +16,18 @@
 class HTMLToDocument:
     """
     Converts an HTML file to a Document.
+
+    Usage example:
+    ```python
+    from haystack.preview.components.file_converters.html import HTMLToDocument
+
+    converter = HTMLToDocument()
+    results = converter.run(sources=["sample.html"])
+    documents = results["documents"]
+    print(documents[0].content)
+    # 'This is a text from the HTML file.'
+    ```
+
     """
 
     def __init__(self):
@@ -25,18 +37,30 @@ def __init__(self):
         boilerpy3_import.check()
 
     @component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]]):
+    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
         """
         Converts a list of HTML files to Documents.
 
         :param sources: List of HTML file paths or ByteStream objects.
+        :param meta: Optional list of metadata to attach to the Documents.
+        The length of the list must match the number of sources. Defaults to `None`.
         :return: List of converted Documents.
         """
+
         documents = []
+
+        # Create metadata placeholders if not provided
+        if meta:
+            if len(sources) != len(meta):
+                raise ValueError("The length of the metadata list must match the number of sources.")
+        else:
+            meta = [{}] * len(sources)
+
         extractor = extractors.ArticleExtractor(raise_on_failure=False)
-        for source in sources:
+
+        for source, metadata in zip(sources, meta):
             try:
-                file_content = self._extract_content(source)
+                file_content, extracted_meta = self._extract_content(source)
             except Exception as e:
                 logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
                 continue
@@ -46,21 +70,25 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
                 logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                 continue
 
-            document = Document(content=text)
+            # Merge metadata received from ByteStream with supplied metadata
+            if extracted_meta:
+                # Supplied metadata overwrites metadata from ByteStream for overlapping keys.
+                metadata = {**extracted_meta, **metadata}
+            document = Document(content=text, meta=metadata)
             documents.append(document)
 
         return {"documents": documents}
 
-    def _extract_content(self, source: Union[str, Path, ByteStream]) -> str:
+    def _extract_content(self, source: Union[str, Path, ByteStream]) -> tuple:
         """
         Extracts content from the given data source
         :param source: The data source to extract content from.
-        :return: The extracted content.
+        :return: The extracted content and metadata.
         """
         if isinstance(source, (str, Path)):
             with open(source) as text_file:
-                return text_file.read()
+                return (text_file.read(), None)
         if isinstance(source, ByteStream):
-            return source.data.decode("utf-8")
+            return (source.data.decode("utf-8"), source.metadata)
 
         raise ValueError(f"Unsupported source type: {type(source)}")
diff --git a/releasenotes/notes/add-metadata-HTMLToDocument-42dbd074a46c979e.yaml b/releasenotes/notes/add-metadata-HTMLToDocument-42dbd074a46c979e.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
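+    Adds support for attaching user-supplied metadata, and for reusing metadata carried by ByteStream sources, when creating Documents with HTMLToDocument. For overlapping keys, the user-supplied metadata takes precedence over the ByteStream metadata.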
diff --git a/test/preview/components/file_converters/test_html_to_document.py b/test/preview/components/file_converters/test_html_to_document.py
@@ -12,52 +12,145 @@ def test_run(self, preview_samples_path):
         """
         Test if the component runs correctly.
         """
-        paths = [preview_samples_path / "html" / "what_is_haystack.html"]
+        sources = [preview_samples_path / "html" / "what_is_haystack.html"]
         converter = HTMLToDocument()
-        output = converter.run(sources=paths)
-        docs = output["documents"]
+        results = converter.run(sources=sources)
+        docs = results["documents"]
         assert len(docs) == 1
         assert "Haystack" in docs[0].content
 
+    @pytest.mark.unit
+    def test_run_doc_metadata(self, preview_samples_path):
+        """
+        Test if the component runs correctly when metadata is supplied by the user.
+        """
+        converter = HTMLToDocument()
+        sources = [preview_samples_path / "html" / "what_is_haystack.html"]
+        metadata = [{"file_name": "what_is_haystack.html"}]
+        results = converter.run(sources=sources, meta=metadata)
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert "Haystack" in docs[0].content
+        assert docs[0].meta == {"file_name": "what_is_haystack.html"}
+
+    @pytest.mark.unit
+    def test_incorrect_meta(self, preview_samples_path):
+        """
+        Test if the component raises an error when incorrect metadata is supplied by the user.
+        """
+        converter = HTMLToDocument()
+        sources = [preview_samples_path / "html" / "what_is_haystack.html"]
+        metadata = [{"file_name": "what_is_haystack.html"}, {"file_name": "haystack.html"}]
+        with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
+            converter.run(sources=sources, meta=metadata)
+
+    @pytest.mark.unit
+    def test_run_bytestream_metadata(self, preview_samples_path):
+        """
+        Test if the component runs correctly when metadata is read from the ByteStream object.
+        """
+        converter = HTMLToDocument()
+        with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as file:
+            byte_stream = file.read()
+            stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url"})
+
+        results = converter.run(sources=[stream])
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert "Haystack" in docs[0].content
+        assert docs[0].meta == {"content_type": "text/html", "url": "test_url"}
+
+    @pytest.mark.unit
+    def test_run_bytestream_and_doc_metadata(self, preview_samples_path):
+        """
+        Test if the component runs correctly when metadata is read from the ByteStream object and supplied by the user.
+
+        There is no overlap between the metadata received.
+        """
+        converter = HTMLToDocument()
+        with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as file:
+            byte_stream = file.read()
+            stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url"})
+
+        metadata = [{"file_name": "what_is_haystack.html"}]
+        results = converter.run(sources=[stream], meta=metadata)
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert "Haystack" in docs[0].content
+        assert docs[0].meta == {"file_name": "what_is_haystack.html", "content_type": "text/html", "url": "test_url"}
+
+    @pytest.mark.unit
+    def test_run_bytestream_doc_overlapping_metadata(self, preview_samples_path):
+        """
+        Test if the component runs correctly when metadata is read from the ByteStream object and supplied by the user.
+
+        There is an overlap between the metadata received.
+
+        The component should use the supplied metadata to overwrite the values if there is an overlap between the keys.
+        """
+        converter = HTMLToDocument()
+        with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as file:
+            byte_stream = file.read()
+            # ByteStream has "url" present in metadata
+            stream = ByteStream(byte_stream, metadata={"content_type": "text/html", "url": "test_url_correct"})
+
+        # "url" supplied by the user overwrites value present in metadata
+        metadata = [{"file_name": "what_is_haystack.html", "url": "test_url_new"}]
+        results = converter.run(sources=[stream], meta=metadata)
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert "Haystack" in docs[0].content
+        assert docs[0].meta == {
+            "file_name": "what_is_haystack.html",
+            "content_type": "text/html",
+            "url": "test_url_new",
+        }
+
     @pytest.mark.unit
     def test_run_wrong_file_type(self, preview_samples_path, caplog):
         """
         Test if the component runs correctly when an input file is not of the expected type.
         """
-        paths = [preview_samples_path / "audio" / "answer.wav"]
+        sources = [preview_samples_path / "audio" / "answer.wav"]
         converter = HTMLToDocument()
         with caplog.at_level(logging.WARNING):
-            output = converter.run(sources=paths)
+            results = converter.run(sources=sources)
             assert "codec can't decode byte" in caplog.text
 
-        docs = output["documents"]
-        assert not docs
+        assert results["documents"] == []
 
     @pytest.mark.unit
-    def test_run_error_handling(self, preview_samples_path, caplog):
+    def test_run_error_handling(self, caplog):
         """
         Test if the component correctly handles errors.
         """
-        paths = ["non_existing_file.html"]
+        sources = ["non_existing_file.html"]
         converter = HTMLToDocument()
         with caplog.at_level(logging.WARNING):
-            result = converter.run(sources=paths)
+            results = converter.run(sources=sources)
             assert "Could not read non_existing_file.html" in caplog.text
-            assert not result["documents"]
+            assert results["documents"] == []
 
     @pytest.mark.unit
     def test_mixed_sources_run(self, preview_samples_path):
         """
-        Test if the component runs correctly if the input is a mix of paths and ByteStreams
+        Test if the component runs correctly if the input is a mix of paths and ByteStreams.
         """
-        paths = [preview_samples_path / "html" / "what_is_haystack.html"]
+        sources = [
+            preview_samples_path / "html" / "what_is_haystack.html",
+            str((preview_samples_path / "html" / "what_is_haystack.html").absolute()),
+        ]
         with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as f:
             byte_stream = f.read()
-            paths.append(ByteStream(byte_stream))
+            sources.append(ByteStream(byte_stream))
 
         converter = HTMLToDocument()
-        output = converter.run(sources=paths)
-        docs = output["documents"]
-        assert len(docs) == 2
+        results = converter.run(sources=sources)
+        docs = results["documents"]
+        assert len(docs) == 3
         for doc in docs:
             assert "Haystack" in doc.content