Skip to content

Commit

Permalink
Add minimal HTML processing (#123)
Browse files Browse the repository at this point in the history
Add minimal HTML processing
  • Loading branch information
eyurtsev authored Apr 7, 2023
1 parent 94e809d commit b218678
Show file tree
Hide file tree
Showing 8 changed files with 252 additions and 97 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: "poetry"
- name: Install dependencies
run: poetry install --with test
run: poetry install --with test -E html
- name: Run unit tests
run: |
poetry run poe test
Empty file added kor/documents/__init__.py
Empty file.
72 changes: 72 additions & 0 deletions kor/documents/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Load and chunk HTMLs with potential pre-processing to clean the html."""

import re
from typing import Tuple

import markdownify
from bs4 import BeautifulSoup
from langchain.schema import Document

from kor.documents.typedefs import AbstractDocumentProcessor

# Matches a run of two or more line breaks, tolerating whitespace between the
# individual breaks (i.e. one or more blank / whitespace-only lines).
CONSECUTIVE_NEW_LINES = re.compile(r"\n(\s*\n)+", re.UNICODE)


def _get_mini_html(html: str, *, tags_to_remove: Tuple[str, ...] = tuple()) -> str:
    """Strip stylesheet links and unwanted tags from an HTML document.

    Args:
        html: The HTML markup to clean.
        tags_to_remove: Names of tags to drop. Every matching element is
            detached from the tree together with its contents.

    Returns:
        The remaining document serialized back to an HTML string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Stylesheet <link> elements are always removed, regardless of
    # ``tags_to_remove``.
    for stylesheet in soup.find_all("link", rel="stylesheet"):
        stylesheet.extract()

    for tag_name in tags_to_remove:
        for tag in soup.find_all(tag_name):
            tag.extract()

    # str() is the documented way to turn the tree back into markup; repr()
    # is a debugging representation and should not be relied on for output.
    return str(soup)


def _clean_html(html: str, *, tags_to_remove: Tuple[str, ...] = tuple()) -> str:
    """Convert HTML to markdown after stripping unwanted tags.

    Args:
        html: The HTML markup to convert.
        tags_to_remove: Names of tags to drop before the conversion.

    Returns:
        The markdown text, with every run of consecutive blank lines
        collapsed to a single blank line and surrounding whitespace stripped.
    """
    stripped_html = _get_mini_html(html, tags_to_remove=tags_to_remove)
    markdown = markdownify.markdownify(stripped_html)
    collapsed = CONSECUTIVE_NEW_LINES.sub("\n\n", markdown)
    return collapsed.strip()


## PUBLIC API


class MarkdownifyHTMLProcessor(AbstractDocumentProcessor):
    """Document processor that turns HTML page content into markdown.

    The configured tags are stripped from the HTML first; the remainder is
    converted to markdown using markdownify.
    """

    def __init__(
        self,
        tags_to_remove: Tuple[str, ...] = ("svg", "img", "script", "style"),
    ) -> None:
        """Create the processor.

        Args:
            tags_to_remove: Names of the HTML tags to strip before the
                content is converted to markdown.
        """
        self.tags_to_remove = tags_to_remove

    def process(self, document: Document) -> Document:
        """Return a copy of ``document`` whose content is cleaned markdown.

        Args:
            document: A document whose ``page_content`` holds HTML.

        Returns:
            A copy of the input document with ``page_content`` replaced by
            the markdown rendering of the cleaned HTML.
        """
        markdown = _clean_html(
            document.page_content, tags_to_remove=self.tags_to_remove
        )
        processed = document.copy()
        processed.page_content = markdown
        return processed
12 changes: 12 additions & 0 deletions kor/documents/typedefs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import abc

from langchain.schema import Document


class AbstractDocumentProcessor(abc.ABC):
    """Base interface for objects that transform a document.

    Concrete subclasses must implement ``process``.
    """

    @abc.abstractmethod
    def process(self, document: Document) -> Document:
        """Process ``document`` and return the resulting document.

        Args:
            document: The document to process.

        Returns:
            The processed document.
        """
        raise NotImplementedError
209 changes: 114 additions & 95 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ python = "^3.8.1"
openai = "^0.27"
langchain = ">=0.0.110"
pandas = "^1.5.3"
markdownify = {version = "^0.11.6", optional = true}

[tool.poetry.group.dev.dependencies]
jupyterlab = "^3.6.1"


[tool.poetry.group.test.dependencies]
pytest = "^7.2.1"
black = { version="^23.1.0", extras=["jupyter"] }
Expand All @@ -43,6 +43,12 @@ mypy = "^0.991"
[tool.poetry.group.types.dependencies]
types-toml = "^0.10.8.5"


[tool.poetry.group.html.dependencies]

[tool.poetry.extras]
html = ["markdownify"]

[tool.poe.tasks]
black = "black"
ruff = "ruff"
Expand Down
Empty file added tests/documents/__init__.py
Empty file.
46 changes: 46 additions & 0 deletions tests/documents/test_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from typing import Optional, Tuple

import pytest
from langchain.schema import Document

from kor.documents.html import MarkdownifyHTMLProcessor


@pytest.mark.parametrize(
    "tags,expected",
    [
        (None, "Title\n\nTest"),  # Use default
        (tuple(), "Title\nSvg\nStyle\nScript\n\nTest"),
        (("title",), "Svg\nStyle\nScript\n\nTest"),
    ],
)
def test_markdownify_html_preprocessor(
    tags: Optional[Tuple[str, ...]], expected: str
) -> None:
    """Check MarkdownifyHTMLProcessor output for several tag-removal settings.

    ``tags=None`` means the processor is built with its default
    ``tags_to_remove``; any other value is passed through explicitly.
    """
    # The fixture is kept flush-left so the string content stays exactly as is.
    source_html = """
<html>
<head>
<title>Title</title>
<svg>Svg</svg>
<style>Style</style>
<script>Script</script>
</head>
<body>
<p>Test
</p>
</body>
</html>
"""
    processor = (
        MarkdownifyHTMLProcessor()
        if tags is None
        else MarkdownifyHTMLProcessor(tags_to_remove=tags)
    )

    source_document = Document(page_content=source_html, metadata={"a": 1})
    result = processor.process(source_document)

    assert isinstance(result, Document)
    assert result.page_content == expected
    # Metadata must be carried over unchanged.
    assert result.metadata == {"a": 1}

0 comments on commit b218678

Please sign in to comment.