fix: MarkdownChunker, retain subsection headers

Closes weaviate#322
alexchao · Nov 8, 2024 · 565d140 · 565d140
1 parent 7fe6168
commit 565d140
Showing 1 changed file with 33 additions and 11 deletions.
diff --git a/goldenverba/components/chunking/MarkdownChunker.py b/goldenverba/components/chunking/MarkdownChunker.py
@@ -2,13 +2,38 @@
 
 with contextlib.suppress(Exception):
     from langchain_text_splitters import MarkdownHeaderTextSplitter
+    from langchain_core.documents import Document as LangChainDocument
 
 from goldenverba.components.chunk import Chunk
 from goldenverba.components.interfaces import Chunker
 from goldenverba.components.document import Document
 from goldenverba.components.interfaces import Embedding
 
 
+HEADERS_TO_SPLIT_ON = [
+    ("#", "Header 1"),
+    ("##", "Header 2"),
+    ("###", "Header 3"),
+]
+
+
+def get_header_values(
+    split_doc: LangChainDocument,
+) -> list[str]:
+    """
+    Get the text values of the headers in the LangChain Document resulting from a split.
+    """
+    # This function uses an explicit list of header keys because the LangChain Document
+    # metadata is a dictionary with arbitrary entries, some of which may not be headers.
+    header_keys = [header_key for _, header_key in HEADERS_TO_SPLIT_ON]
+
+    return [
+        header_value
+        for header_key in header_keys
+        if (header_value := split_doc.metadata.get(header_key)) is not None
+    ]
+
+
 class MarkdownChunker(Chunker):
     """
     MarkdownChunker for Verba using LangChain.
@@ -31,11 +56,7 @@ async def chunk(
     ) -> list[Document]:
 
         text_splitter = MarkdownHeaderTextSplitter(
-            headers_to_split_on=[
-                ("#", "Header 1"),
-                ("##", "Header 2"),
-                ("###", "Header 3"),
-            ]
+            headers_to_split_on=HEADERS_TO_SPLIT_ON
         )
 
         char_end_i = -1
@@ -45,16 +66,17 @@ async def chunk(
             if len(document.chunks) > 0:
                 continue
 
-            for i, chunk in enumerate(text_splitter.split_text(document.content)):
-                
+            for i, split_doc in enumerate(text_splitter.split_text(document.content)):
+
                 chunk_text = ""
 
-                # append title and page content (should only be one header as we are splitting at header so index at 0), if a header is found
-                if len(chunk.metadata) > 0:
-                    chunk_text += list(chunk.metadata.values())[0] + "\n"
+                # Add header content to retain context and improve retrieval
+                header_values = get_header_values(split_doc)
+                for header_value in header_values:
+                    chunk_text += header_value + "\n"
 
                 # append page content (always there)
-                chunk_text += chunk.page_content
+                chunk_text += split_doc.page_content
 
                 char_start_i = char_end_i + 1
                 char_end_i = char_start_i + len(chunk_text)