Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add ByteStream metadata and other metadata to Documents created by HTMLToDocument #6304

Merged
merged 8 commits into from
Nov 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions haystack/preview/components/file_converters/html.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from typing import List, Union
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack.preview import Document, component
from haystack.preview.dataclasses import ByteStream
Expand All @@ -16,6 +16,18 @@
class HTMLToDocument:
"""
Converts an HTML file to a Document.

Usage example:
```python
from haystack.preview.components.file_converters.html import HTMLToDocument

converter = HTMLToDocument()
results = converter.run(sources=["sample.html"])
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the HTML file.'
```

"""

def __init__(self):
Expand All @@ -25,18 +37,30 @@ def __init__(self):
boilerpy3_import.check()

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Converts a list of HTML files to Documents.

:param sources: List of HTML file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: List of converted Documents.
"""

documents = []

# Create metadata placeholders if not provided
if meta:
if len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")
else:
meta = [{}] * len(sources)

extractor = extractors.ArticleExtractor(raise_on_failure=False)
for source in sources:

for source, metadata in zip(sources, meta):
try:
file_content = self._extract_content(source)
file_content, extracted_meta = self._extract_content(source)
except Exception as e:
logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
continue
Expand All @@ -46,21 +70,25 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
continue

document = Document(content=text)
# Merge metadata received from ByteStream with supplied metadata
if extracted_meta:
# Supplied metadata overwrites metadata from ByteStream for overlapping keys.
metadata = {**extracted_meta, **metadata}
document = Document(content=text, meta=metadata)
documents.append(document)

return {"documents": documents}

def _extract_content(self, source: Union[str, Path, ByteStream]) -> str:
def _extract_content(self, source: Union[str, Path, ByteStream]) -> tuple:
"""
Extracts content from the given data source
:param source: The data source to extract content from.
:return: The extracted content.
:return: The extracted content and metadata.
"""
if isinstance(source, (str, Path)):
with open(source) as text_file:
return text_file.read()
return (text_file.read(), None)
if isinstance(source, ByteStream):
return source.data.decode("utf-8")
return (source.data.decode("utf-8"), source.metadata)

raise ValueError(f"Unsupported source type: {type(source)}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
preview:
- |
Adds support for attaching user-supplied metadata to Documents created by HTMLToDocument, and for carrying over the metadata received from ByteStream sources. When both provide the same key, the user-supplied value takes precedence.
127 changes: 110 additions & 17 deletions test/preview/components/file_converters/test_html_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,52 +12,145 @@ def test_run(self, preview_samples_path):
"""
Test if the component runs correctly.
"""
paths = [preview_samples_path / "html" / "what_is_haystack.html"]
sources = [preview_samples_path / "html" / "what_is_haystack.html"]
converter = HTMLToDocument()
output = converter.run(sources=paths)
docs = output["documents"]
results = converter.run(sources=sources)
docs = results["documents"]
assert len(docs) == 1
assert "Haystack" in docs[0].content

@pytest.mark.unit
def test_run_doc_metadata(self, preview_samples_path):
    """
    Metadata passed through the `meta` argument must appear unchanged on the Document built from a file path.
    """
    html_file = preview_samples_path / "html" / "what_is_haystack.html"
    user_meta = {"file_name": "what_is_haystack.html"}

    output = HTMLToDocument().run(sources=[html_file], meta=[user_meta])
    produced = output["documents"]

    assert len(produced) == 1
    only_doc = produced[0]
    assert "Haystack" in only_doc.content
    assert only_doc.meta == {"file_name": "what_is_haystack.html"}

@pytest.mark.unit
def test_incorrect_meta(self, preview_samples_path):
    """
    A `meta` list whose length differs from the number of sources must be rejected with a ValueError.
    """
    html_file = preview_samples_path / "html" / "what_is_haystack.html"
    # Two metadata entries for a single source: deliberately mismatched.
    mismatched_meta = [{"file_name": "what_is_haystack.html"}, {"file_name": "haystack.html"}]
    component_under_test = HTMLToDocument()

    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
        component_under_test.run(sources=[html_file], meta=mismatched_meta)

@pytest.mark.unit
def test_run_bytestream_metadata(self, preview_samples_path):
    """
    Metadata carried by a ByteStream source must appear on the resulting Document when no `meta` is supplied.
    """
    html_file = preview_samples_path / "html" / "what_is_haystack.html"
    with open(html_file, "rb") as handle:
        raw_html = handle.read()
    source = ByteStream(raw_html, metadata={"content_type": "text/html", "url": "test_url"})

    output = HTMLToDocument().run(sources=[source])
    produced = output["documents"]

    assert len(produced) == 1
    only_doc = produced[0]
    assert "Haystack" in only_doc.content
    assert only_doc.meta == {"content_type": "text/html", "url": "test_url"}

@pytest.mark.unit
def test_run_bytestream_and_doc_metadata(self, preview_samples_path):
    """
    ByteStream metadata and user-supplied metadata must both appear on the resulting Document.

    The two metadata dictionaries in this test share no keys.
    """
    html_file = preview_samples_path / "html" / "what_is_haystack.html"
    with open(html_file, "rb") as handle:
        raw_html = handle.read()
    source = ByteStream(raw_html, metadata={"content_type": "text/html", "url": "test_url"})
    user_meta = [{"file_name": "what_is_haystack.html"}]

    output = HTMLToDocument().run(sources=[source], meta=user_meta)
    produced = output["documents"]

    assert len(produced) == 1
    only_doc = produced[0]
    assert "Haystack" in only_doc.content
    assert only_doc.meta == {"file_name": "what_is_haystack.html", "content_type": "text/html", "url": "test_url"}

@pytest.mark.unit
def test_run_bytestream_doc_overlapping_metadata(self, preview_samples_path):
    """
    When ByteStream metadata and user-supplied metadata share a key, the user-supplied value must win.

    Keys present in only one of the two dictionaries must still appear on the resulting Document.
    """
    html_file = preview_samples_path / "html" / "what_is_haystack.html"
    with open(html_file, "rb") as handle:
        raw_html = handle.read()
    # The ByteStream carries its own "url" entry ...
    source = ByteStream(raw_html, metadata={"content_type": "text/html", "url": "test_url_correct"})
    # ... which the user-supplied "url" is expected to replace.
    user_meta = [{"file_name": "what_is_haystack.html", "url": "test_url_new"}]

    output = HTMLToDocument().run(sources=[source], meta=user_meta)
    produced = output["documents"]

    assert len(produced) == 1
    only_doc = produced[0]
    assert "Haystack" in only_doc.content
    expected_meta = {
        "file_name": "what_is_haystack.html",
        "content_type": "text/html",
        "url": "test_url_new",
    }
    assert only_doc.meta == expected_meta

@pytest.mark.unit
def test_run_wrong_file_type(self, preview_samples_path, caplog):
"""
Test if the component runs correctly when an input file is not of the expected type.
"""
paths = [preview_samples_path / "audio" / "answer.wav"]
sources = [preview_samples_path / "audio" / "answer.wav"]
converter = HTMLToDocument()
with caplog.at_level(logging.WARNING):
output = converter.run(sources=paths)
results = converter.run(sources=sources)
assert "codec can't decode byte" in caplog.text

docs = output["documents"]
assert not docs
assert results["documents"] == []

@pytest.mark.unit
def test_run_error_handling(self, preview_samples_path, caplog):
def test_run_error_handling(self, caplog):
"""
Test if the component correctly handles errors.
"""
paths = ["non_existing_file.html"]
sources = ["non_existing_file.html"]
converter = HTMLToDocument()
with caplog.at_level(logging.WARNING):
result = converter.run(sources=paths)
results = converter.run(sources=sources)
assert "Could not read non_existing_file.html" in caplog.text
assert not result["documents"]
assert results["documents"] == []

@pytest.mark.unit
def test_mixed_sources_run(self, preview_samples_path):
"""
Test if the component runs correctly if the input is a mix of paths and ByteStreams
Test if the component runs correctly if the input is a mix of paths and ByteStreams.
"""
paths = [preview_samples_path / "html" / "what_is_haystack.html"]
sources = [
preview_samples_path / "html" / "what_is_haystack.html",
str((preview_samples_path / "html" / "what_is_haystack.html").absolute()),
]
with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as f:
byte_stream = f.read()
paths.append(ByteStream(byte_stream))
sources.append(ByteStream(byte_stream))

converter = HTMLToDocument()
output = converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 2
results = converter.run(sources=sources)
docs = results["documents"]
assert len(docs) == 3
for doc in docs:
assert "Haystack" in doc.content