Enable conditional splitting for html files (#184)
* enable html header

Signed-off-by: XuhuiRen <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update utils.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: XuhuiRen <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
XuhuiRen and pre-commit-ci[bot] committed Jun 26, 2024
1 parent 86412c8 commit e1dad1d
Showing 7 changed files with 49 additions and 14 deletions.
16 changes: 13 additions & 3 deletions comps/dataprep/milvus/prepare_doc_milvus.py
@@ -8,6 +8,7 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
 from langchain_milvus.vectorstores import Milvus
+from langchain_text_splitters import HTMLHeaderTextSplitter
 
 from comps.cores.mega.micro_service import opea_microservices, register_microservice
 from comps.cores.proto.docarray import DocPath
@@ -34,9 +35,18 @@ def ingest_documents(doc_path: DocPath):
     path = doc_path.path
     print(f"Parsing document {path}.")
 
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=doc_path.chunk_size, chunk_overlap=doc_path.chunk_size, add_start_index=True
-    )
+    if path.endswith(".html"):
+        headers_to_split_on = [
+            ("h1", "Header 1"),
+            ("h2", "Header 2"),
+            ("h3", "Header 3"),
+        ]
+        text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+    else:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
+        )
 
     content = document_loader(path)
     chunks = text_splitter.split_text(content)

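For reference, a minimal sketch of what the new HTML branch does (the HTML snippet below is illustrative, not from the repository). HTMLHeaderTextSplitter takes raw HTML, splits it at the configured header tags, and returns Document objects whose metadata records the enclosing headers:

from langchain_text_splitters import HTMLHeaderTextSplitter

# Illustrative input; any document using h1-h3 headers is handled the same way.
html = """
<html><body>
  <h1>Intro</h1><p>Opening text.</p>
  <h2>Details</h2><p>Section body.</p>
</body></html>
"""

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]
)
for doc in splitter.split_text(html):
    # Each chunk is a Document; e.g. the "Section body." chunk carries
    # metadata like {"Header 1": "Intro", "Header 2": "Details"}.
    print(doc.metadata, "->", doc.page_content)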
1 change: 1 addition & 0 deletions comps/dataprep/milvus/requirements.txt
@@ -6,6 +6,7 @@ frontend==0.0.3
 huggingface_hub
 langchain
 langchain-community
+langchain-text-splitters
 langchain_milvus
 numpy
 opentelemetry-api
16 changes: 13 additions & 3 deletions comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -7,6 +7,7 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
 from langchain_community.vectorstores import Qdrant
+from langchain_text_splitters import HTMLHeaderTextSplitter
 
 from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
 from comps.dataprep.utils import document_loader
@@ -28,9 +29,18 @@ def ingest_documents(doc_path: DocPath):
     path = doc_path.path
     print(f"Parsing document {path}.")
 
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=doc_path.chunk_size, chunk_overlap=doc_path.chunk_size, add_start_index=True
-    )
+    if path.endswith(".html"):
+        headers_to_split_on = [
+            ("h1", "Header 1"),
+            ("h2", "Header 2"),
+            ("h3", "Header 3"),
+        ]
+        text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+    else:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
+        )
 
     content = document_loader(path)
     chunks = text_splitter.split_text(content)

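The same hunk also changes the non-HTML branch: chunk_overlap, previously set to doc_path.chunk_size (an overlap as large as the chunk itself), is now a fixed 100 characters. A minimal sketch with illustrative values:

from langchain.text_splitter import RecursiveCharacterTextSplitter

text = "word " * 300  # illustrative input

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,        # supplied by doc_path.chunk_size in the service
    chunk_overlap=100,     # fixed overlap, matching the new code
    add_start_index=True,  # records each chunk's offset when building Documents
)
chunks = splitter.split_text(text)  # a list of plain strings
print(len(chunks), repr(chunks[0][:40]))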
1 change: 1 addition & 0 deletions comps/dataprep/qdrant/requirements.txt
@@ -5,6 +5,7 @@ fastapi
 huggingface_hub
 langchain
 langchain-community
+langchain-text-splitters
 numpy
 opentelemetry-api
 opentelemetry-exporter-otlp
17 changes: 14 additions & 3 deletions comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -12,6 +12,7 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
 from langchain_community.vectorstores import Redis
+from langchain_text_splitters import HTMLHeaderTextSplitter
 from langsmith import traceable
 
 from comps import DocPath, opea_microservices, register_microservice
@@ -36,10 +37,20 @@ def ingest_data_to_redis(doc_path: DocPath):
     path = doc_path.path
     print(f"Parsing document {path}.")
 
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=doc_path.chunk_size, chunk_overlap=doc_path.chunk_size, add_start_index=True
-    )
+    if path.endswith(".html"):
+        headers_to_split_on = [
+            ("h1", "Header 1"),
+            ("h2", "Header 2"),
+            ("h3", "Header 3"),
+        ]
+        text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+    else:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
+        )
 
     content = document_loader(path)
 
     chunks = text_splitter.split_text(content)
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")

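One subtlety the conditional splitter introduces: HTMLHeaderTextSplitter.split_text returns Document objects while RecursiveCharacterTextSplitter.split_text returns plain strings, so the chunks handed to the embedding step differ in type by file extension. A hypothetical normalization helper (not part of this commit) could make the output uniform:

def chunks_to_texts(chunks):
    # Coerce both splitter outputs to plain strings: Documents expose
    # page_content, while the character splitter already yields str.
    return [c.page_content if hasattr(c, "page_content") else c for c in chunks]

# usage: texts = chunks_to_texts(text_splitter.split_text(content))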
1 change: 1 addition & 0 deletions comps/dataprep/redis/langchain/requirements.txt
@@ -5,6 +5,7 @@ fastapi
 huggingface_hub
 langchain
 langchain-community
+langchain-text-splitters
 langsmith
 numpy
 opentelemetry-api
11 changes: 6 additions & 5 deletions comps/dataprep/utils.py
@@ -23,6 +23,7 @@
 from bs4 import BeautifulSoup
 from docx import Document as DDocument
 from langchain_community.document_loaders import (
+    UnstructuredHTMLLoader,
     UnstructuredImageLoader,
     UnstructuredMarkdownLoader,
     UnstructuredPowerPointLoader,
@@ -112,11 +113,11 @@ def load_pdf(pdf_path):

 def load_html(html_path):
     """Load the html file."""
-    with open(html_path, "r", encoding="utf-8") as file:
-        html = file.read()
-    soup = BeautifulSoup(html, "html.parser")
-    text = soup.get_text(strip=True)
-    return text
+    data_html = UnstructuredHTMLLoader(html_path).load()
+    content = ""
+    for ins in data_html:
+        content += ins.page_content
+    return content
 
 
 def load_txt(txt_path):
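For context, a small usage sketch of the loader the rewritten load_html relies on (the file path is illustrative). UnstructuredHTMLLoader parses the file with the unstructured library and returns one or more Documents whose page_content holds the extracted text, which load_html concatenates:

from langchain_community.document_loaders import UnstructuredHTMLLoader

docs = UnstructuredHTMLLoader("example.html").load()  # illustrative path
text = "".join(d.page_content for d in docs)  # same concatenation as load_html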
