-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add minimal HTML processing
- Loading branch information
Showing
8 changed files
with
252 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
"""Load and chunk HTMLs with potential pre-processing to clean the html.""" | ||
|
||
import re | ||
from typing import Tuple | ||
|
||
import markdownify | ||
from bs4 import BeautifulSoup | ||
from langchain.schema import Document | ||
|
||
from kor.documents.typedefs import AbstractDocumentProcessor | ||
|
||
# Matches a newline followed by one or more further lines that are empty or
# hold only whitespace, i.e. a run of consecutive blank(ish) lines.
# `str` patterns are Unicode-aware by default in Python 3, so no explicit
# `re.UNICODE` flag is required.
CONSECUTIVE_NEW_LINES = re.compile(r"\n(\s*\n)+")
|
||
|
||
def _get_mini_html(html: str, *, tags_to_remove: Tuple[str, ...] = ()) -> str:
    """Return `html` with stylesheet links and the requested tags removed.

    Args:
        html: The HTML markup to clean.
        tags_to_remove: Names of tags (e.g. "script") whose elements are
            removed from the parsed document.

    Returns:
        The remaining document serialized back to an HTML string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # <link rel="stylesheet"> elements are dropped regardless of
    # `tags_to_remove`.
    for stylesheet in soup.find_all("link", rel="stylesheet"):
        stylesheet.extract()

    for tag_name in tags_to_remove:
        for tag in soup.find_all(tag_name):
            tag.extract()

    # str() is the documented way to serialize the tree back to markup;
    # repr() is meant for debugging output and should not be relied on here.
    return str(soup)
|
||
|
||
def _clean_html(html: str, *, tags_to_remove: Tuple[str, ...] = tuple()) -> str:
    """Strip unwanted tags from `html` and render what is left as markdown.

    Args:
        html: The HTML markup to convert.
        tags_to_remove: Tag names forwarded to `_get_mini_html` for removal.

    Returns:
        The markdown text, with every match of `CONSECUTIVE_NEW_LINES`
        replaced by a single blank line and surrounding whitespace stripped.
    """
    stripped_html = _get_mini_html(html, tags_to_remove=tags_to_remove)
    markdown = markdownify.markdownify(stripped_html)
    collapsed = CONSECUTIVE_NEW_LINES.sub("\n\n", markdown)
    return collapsed.strip()
|
||
|
||
## PUBLIC API | ||
|
||
|
||
class MarkdownifyHTMLProcessor(AbstractDocumentProcessor):
    """Document processor that converts HTML page content to markdown.

    The configured tags are stripped from the HTML before the remainder is
    converted with markdownify.
    """

    def __init__(
        self,
        tags_to_remove: Tuple[str, ...] = ("svg", "img", "script", "style"),
    ) -> None:
        """Store the processor configuration.

        Args:
            tags_to_remove: Names of the HTML tags to strip before the
                markdown conversion.
        """
        self.tags_to_remove = tags_to_remove

    def process(self, document: Document) -> Document:
        """Convert the HTML content of `document` to markdown.

        Args:
            document: A document whose `page_content` is HTML.

        Returns:
            A copy of `document` (made with `document.copy()`) whose
            `page_content` holds the cleaned markdown text.
        """
        markdown = _clean_html(
            document.page_content, tags_to_remove=self.tags_to_remove
        )
        # Assign to a copy so `document.page_content` is never reassigned.
        converted = document.copy()
        converted.page_content = markdown
        return converted
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import abc | ||
|
||
from langchain.schema import Document | ||
|
||
|
||
class AbstractDocumentProcessor(abc.ABC):
    """Abstract base class for objects that transform a document."""

    @abc.abstractmethod
    def process(self, document: Document) -> Document:
        """Transform `document` and return the resulting document.

        Concrete subclasses must override this method; the base
        implementation always raises `NotImplementedError`.
        """
        raise NotImplementedError
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from typing import Optional, Tuple | ||
|
||
import pytest | ||
from langchain.schema import Document | ||
|
||
from kor.documents.html import MarkdownifyHTMLProcessor | ||
|
||
|
||
@pytest.mark.parametrize(
    "tags,expected",
    [
        (None, "Title\n\nTest"),  # Use default
        (tuple(), "Title\nSvg\nStyle\nScript\n\nTest"),
        (("title",), "Svg\nStyle\nScript\n\nTest"),
    ],
)
def test_markdownify_html_preprocessor(
    tags: Optional[Tuple[str, ...]], expected: str
) -> None:
    """Check tag removal and markdown conversion of MarkdownifyHTMLProcessor.

    A `tags` value of None means the processor is built with its default
    `tags_to_remove`; any other value is passed through explicitly.
    """
    html = """
<html>
<head>
<title>Title</title>
<svg>Svg</svg>
<style>Style</style>
<script>Script</script>
</head>
<body>
<p>Test
</p>
</body>
</html>
"""
    # Only pass the keyword when the case overrides the default tags.
    init_kwargs = {} if tags is None else {"tags_to_remove": tags}
    processor = MarkdownifyHTMLProcessor(**init_kwargs)

    source_document = Document(page_content=html, metadata={"a": 1})
    processed_document = processor.process(source_document)

    assert isinstance(processed_document, Document)
    assert processed_document.page_content == expected
    assert processed_document.metadata == {"a": 1}