Add minimal HTML processing (#123)

Add minimal HTML processing
eyurtsev · Apr 7, 2023 · b218678 · b218678
1 parent 94e809d
commit b218678
Show file tree

Hide file tree

Showing 8 changed files with 252 additions and 97 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -27,7 +27,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: "poetry"
       - name: Install dependencies
-        run: poetry install --with test
+        run: poetry install --with test -E html
       - name: Run unit tests
         run: |
           poetry run poe test
diff --git a/kor/documents/__init__.py b/kor/documents/__init__.py
diff --git a/kor/documents/html.py b/kor/documents/html.py
@@ -0,0 +1,72 @@
+"""Load and chunk HTMLs with potential pre-processing to clean the html."""
+
+import re
+from typing import Tuple
+
+import markdownify
+from bs4 import BeautifulSoup
+from langchain.schema import Document
+
+from kor.documents.typedefs import AbstractDocumentProcessor
+
+# Matches a new line followed by one or more further lines that are empty or
+# contain only whitespace, i.e. several new lines in a row with optional
+# whitespace in between. Used to collapse such runs into a single blank line.
+CONSECUTIVE_NEW_LINES = re.compile(r"\n(\s*\n)+", flags=re.UNICODE)
+
+
+def _get_mini_html(html: str, *, tags_to_remove: Tuple[str, ...] = tuple()) -> str:
+    """Clean up HTML tags."""
+    # Parse the HTML document using BeautifulSoup
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Remove all CSS stylesheets
+    for stylesheet in soup.find_all("link", rel="stylesheet"):
+        stylesheet.extract()
+
+    for tag_to_remove in tags_to_remove:
+        # Remove all matching tags
+        for tag in soup.find_all(tag_to_remove):
+            tag.extract()
+
+    new_html = repr(soup)
+    return new_html
+
+
+def _clean_html(html: str, *, tags_to_remove: Tuple[str, ...] = tuple()) -> str:
+    """Clean up HTML and convert to markdown using markdownify."""
+    html = _get_mini_html(html, tags_to_remove=tags_to_remove)
+    md = markdownify.markdownify(html)
+    return CONSECUTIVE_NEW_LINES.sub("\n\n", md).strip()
+
+
+## PUBLIC API
+
+
+class MarkdownifyHTMLProcessor(AbstractDocumentProcessor):
+    """A preprocessor to clean HTML and convert to markdown using markdownify."""
+
+    def __init__(
+        self,
+        tags_to_remove: Tuple[str, ...] = ("svg", "img", "script", "style"),
+    ) -> None:
+        """Initialize the preprocessor.
+
+        Args:
+            tags_to_remove: A tuple of tags to remove from the HTML
+        """
+        self.tags_to_remove = tags_to_remove
+
+    def process(self, document: Document) -> Document:
+        """Clean up HTML and convert to markdown using markdownify.
+
+        Args:
+            document: a document with HTML content
+
+        Returns:
+            The cleaned HTML
+        """
+        new_document = document.copy()
+        new_document.page_content = _clean_html(
+            document.page_content, tags_to_remove=self.tags_to_remove
+        )
+        return new_document
diff --git a/kor/documents/typedefs.py b/kor/documents/typedefs.py
@@ -0,0 +1,12 @@
+import abc
+
+from langchain.schema import Document
+
+
+class AbstractDocumentProcessor(abc.ABC):
+    """An interface for document transformers."""
+
+    @abc.abstractmethod
+    def process(self, document: Document) -> Document:
+        """Process document."""
+        raise NotImplementedError()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,11 +12,11 @@ python = "^3.8.1"
 openai = "^0.27"
 langchain = ">=0.0.110"
 pandas = "^1.5.3"
+markdownify = {version = "^0.11.6", optional = true}
 
 [tool.poetry.group.dev.dependencies]
 jupyterlab = "^3.6.1"
 
-
 [tool.poetry.group.test.dependencies]
 pytest = "^7.2.1"
 black = { version="^23.1.0", extras=["jupyter"] }
@@ -43,6 +43,12 @@ mypy = "^0.991"
 [tool.poetry.group.types.dependencies]
 types-toml = "^0.10.8.5"
 
+
+[tool.poetry.group.html.dependencies]
+
+[tool.poetry.extras]
+html = ["markdownify"]
+
 [tool.poe.tasks]
 black = "black"
 ruff = "ruff"

diff --git a/tests/documents/__init__.py b/tests/documents/__init__.py
diff --git a/tests/documents/test_html.py b/tests/documents/test_html.py
@@ -0,0 +1,46 @@
+from typing import Optional, Tuple
+
+import pytest
+from langchain.schema import Document
+
+from kor.documents.html import MarkdownifyHTMLProcessor
+
+
+@pytest.mark.parametrize(
+    "tags,expected",
+    [
+        (None, "Title\n\nTest"),  # Use default
+        (tuple(), "Title\nSvg\nStyle\nScript\n\nTest"),
+        (("title",), "Svg\nStyle\nScript\n\nTest"),
+    ],
+)
+def test_markdownify_html_preprocessor(
+    tags: Optional[Tuple[str, ...]], expected: str
+) -> None:
+    """Test the MarkDownifyHTMLPreprocessor."""
+    if tags is not None:
+        processor = MarkdownifyHTMLProcessor(tags_to_remove=tags)
+    else:
+        processor = MarkdownifyHTMLProcessor()
+
+    html = """
+    <html>
+    <head>
+    <title>Title</title>
+    <svg>Svg</svg>
+    <style>Style</style>
+    <script>Script</script>
+    </head>
+    <body>
+    <p>Test
+    
+    
+    </p>
+    </body>
+    </html>
+    """
+    document = Document(page_content=html, metadata={"a": 1})
+    processed_document = processor.process(document)
+    assert isinstance(processed_document, Document)
+    assert processed_document.page_content == expected
+    assert processed_document.metadata == {"a": 1}