Skip to content

Commit

Permalink
feat: Add ByteStream metadata and other metadata to Documents created by `HTMLToDocument` (#6304)
Browse files Browse the repository at this point in the history

* Refactor HTMLToDocument

* Add release notes

* Add additional tests

* remove progress bar

* Add additional test for metadata

* remove progress bar from release notes

* Update tests

* Use truthiness checks instead of `is not None`
  • Loading branch information
awinml authored Nov 21, 2023
1 parent 76165d0 commit e6c8374
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 26 deletions.
46 changes: 37 additions & 9 deletions haystack/preview/components/file_converters/html.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from typing import List, Union
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack.preview import Document, component
from haystack.preview.dataclasses import ByteStream
Expand All @@ -16,6 +16,18 @@
class HTMLToDocument:
"""
Converts an HTML file to a Document.
Usage example:
```python
from haystack.preview.components.file_converters.html import HTMLToDocument
converter = HTMLToDocument()
results = converter.run(sources=["sample.html"])
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the HTML file.'
```
"""

def __init__(self):
Expand All @@ -25,18 +37,30 @@ def __init__(self):
boilerpy3_import.check()

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Converts a list of HTML files to Documents.
:param sources: List of HTML file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: List of converted Documents.
"""

documents = []

# Create metadata placeholders if not provided
if meta:
if len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")
else:
meta = [{}] * len(sources)

extractor = extractors.ArticleExtractor(raise_on_failure=False)
for source in sources:

for source, metadata in zip(sources, meta):
try:
file_content = self._extract_content(source)
file_content, extracted_meta = self._extract_content(source)
except Exception as e:
logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
continue
Expand All @@ -46,21 +70,25 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
continue

document = Document(content=text)
# Merge metadata received from ByteStream with supplied metadata
if extracted_meta:
# Supplied metadata overwrites metadata from ByteStream for overlapping keys.
metadata = {**extracted_meta, **metadata}
document = Document(content=text, meta=metadata)
documents.append(document)

return {"documents": documents}

def _extract_content(self, source: Union[str, Path, ByteStream]) -> tuple:
    """
    Extracts content from the given data source.

    :param source: The data source to extract content from: a file path (`str` or `Path`)
        or a `ByteStream`.
    :return: A `(content, metadata)` tuple. `content` is the decoded text. `metadata` is the
        ByteStream's `metadata` attribute, or `None` when the source is a file path.
    :raises ValueError: If the source is neither a `str`, a `Path` nor a `ByteStream`.
    """
    if isinstance(source, (str, Path)):
        # Decode files explicitly as UTF-8 so that paths and ByteStreams are handled the
        # same way, independent of the platform's locale encoding.
        with open(source, encoding="utf-8") as text_file:
            return (text_file.read(), None)
    if isinstance(source, ByteStream):
        return (source.data.decode("utf-8"), source.metadata)

    raise ValueError(f"Unsupported source type: {type(source)}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
preview:
- |
Adds support for attaching user-supplied metadata to Documents created by HTMLToDocument, and for carrying over the metadata of ByteStream sources. When both define the same key, the user-supplied value wins.
127 changes: 110 additions & 17 deletions test/preview/components/file_converters/test_html_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,52 +12,145 @@ def test_run(self, preview_samples_path):
"""
Test if the component runs correctly.
"""
paths = [preview_samples_path / "html" / "what_is_haystack.html"]
sources = [preview_samples_path / "html" / "what_is_haystack.html"]
converter = HTMLToDocument()
output = converter.run(sources=paths)
docs = output["documents"]
results = converter.run(sources=sources)
docs = results["documents"]
assert len(docs) == 1
assert "Haystack" in docs[0].content

@pytest.mark.unit
def test_run_doc_metadata(self, preview_samples_path):
    """
    Check that metadata supplied by the user ends up on the converted Document.
    """
    html_path = preview_samples_path / "html" / "what_is_haystack.html"
    supplied_meta = [{"file_name": "what_is_haystack.html"}]

    output = HTMLToDocument().run(sources=[html_path], meta=supplied_meta)
    documents = output["documents"]

    assert len(documents) == 1
    assert "Haystack" in documents[0].content
    assert documents[0].meta == {"file_name": "what_is_haystack.html"}

@pytest.mark.unit
def test_incorrect_meta(self, preview_samples_path):
    """
    Check that a metadata list whose length differs from the number of sources raises a ValueError.
    """
    html_path = preview_samples_path / "html" / "what_is_haystack.html"
    # Two metadata entries for a single source: the lengths do not match.
    mismatched_meta = [{"file_name": "what_is_haystack.html"}, {"file_name": "haystack.html"}]
    expected_message = "The length of the metadata list must match the number of sources."

    converter = HTMLToDocument()
    with pytest.raises(ValueError, match=expected_message):
        converter.run(sources=[html_path], meta=mismatched_meta)

@pytest.mark.unit
def test_run_bytestream_metadata(self, preview_samples_path):
    """
    Check that metadata carried by a ByteStream source is attached to the converted Document.
    """
    html_bytes = (preview_samples_path / "html" / "what_is_haystack.html").read_bytes()
    source = ByteStream(html_bytes, metadata={"content_type": "text/html", "url": "test_url"})

    output = HTMLToDocument().run(sources=[source])
    documents = output["documents"]

    assert len(documents) == 1
    assert "Haystack" in documents[0].content
    assert documents[0].meta == {"content_type": "text/html", "url": "test_url"}

@pytest.mark.unit
def test_run_bytestream_and_doc_metadata(self, preview_samples_path):
    """
    Check that ByteStream metadata and user-supplied metadata are combined on the Document.
    The two metadata dictionaries share no keys in this test.
    """
    html_bytes = (preview_samples_path / "html" / "what_is_haystack.html").read_bytes()
    source = ByteStream(html_bytes, metadata={"content_type": "text/html", "url": "test_url"})
    supplied_meta = [{"file_name": "what_is_haystack.html"}]

    output = HTMLToDocument().run(sources=[source], meta=supplied_meta)
    documents = output["documents"]

    assert len(documents) == 1
    assert "Haystack" in documents[0].content
    assert documents[0].meta == {"file_name": "what_is_haystack.html", "content_type": "text/html", "url": "test_url"}

@pytest.mark.unit
def test_run_bytestream_doc_overlapping_metadata(self, preview_samples_path):
    """
    Check how ByteStream metadata and user-supplied metadata are merged when they share a key.
    For a key present in both, the value supplied by the user must replace the ByteStream value.
    """
    html_bytes = (preview_samples_path / "html" / "what_is_haystack.html").read_bytes()
    # The ByteStream brings its own "url" entry ...
    source = ByteStream(html_bytes, metadata={"content_type": "text/html", "url": "test_url_correct"})
    # ... and the user supplies a different "url", which is expected to win.
    supplied_meta = [{"file_name": "what_is_haystack.html", "url": "test_url_new"}]

    output = HTMLToDocument().run(sources=[source], meta=supplied_meta)
    documents = output["documents"]

    expected_meta = {
        "file_name": "what_is_haystack.html",
        "content_type": "text/html",
        "url": "test_url_new",
    }
    assert len(documents) == 1
    assert "Haystack" in documents[0].content
    assert documents[0].meta == expected_meta

@pytest.mark.unit
def test_run_wrong_file_type(self, preview_samples_path, caplog):
    """
    Check that a source that is not a text file is skipped with a warning and yields no Documents.
    """
    audio_path = preview_samples_path / "audio" / "answer.wav"
    converter = HTMLToDocument()

    with caplog.at_level(logging.WARNING):
        output = converter.run(sources=[audio_path])
        assert "codec can't decode byte" in caplog.text

    assert output["documents"] == []

@pytest.mark.unit
def test_run_error_handling(self, caplog):
    """
    Check that an unreadable source is logged as a warning and skipped instead of raising.
    """
    missing_sources = ["non_existing_file.html"]
    converter = HTMLToDocument()

    with caplog.at_level(logging.WARNING):
        output = converter.run(sources=missing_sources)
        assert "Could not read non_existing_file.html" in caplog.text
        assert output["documents"] == []

@pytest.mark.unit
def test_mixed_sources_run(self, preview_samples_path):
    """
    Check that a mix of Path, str and ByteStream sources is converted, one Document per source.
    """
    html_path = preview_samples_path / "html" / "what_is_haystack.html"
    mixed_sources = [
        html_path,
        str(html_path.absolute()),
        ByteStream(html_path.read_bytes()),
    ]

    output = HTMLToDocument().run(sources=mixed_sources)
    documents = output["documents"]

    assert len(documents) == 3
    assert all("Haystack" in document.content for document in documents)

0 comments on commit e6c8374

Please sign in to comment.