forked from danny-avila/rag_api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsers.py
40 lines (30 loc) · 1.15 KB
/
parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from typing import List, Optional
from langchain.schema import Document
from config import CHUNK_OVERLAP
def clean_text(text: str) -> str:
    """
    Strip every NUL (0x00) character out of a string.

    :param text: Input string that may contain NUL characters.
    :return: The input with all NUL characters removed; other characters are
        left untouched and keep their order.
    """
    # Splitting on NUL discards the separators; re-joining with the empty
    # string stitches the surviving pieces back together.
    nul_free_pieces = text.split("\x00")
    return "".join(nul_free_pieces)
def process_documents(documents: List[Document]) -> str:
    """
    Merge a sequence of document chunks into a single text string.

    The result starts with the base name (text after the last ``/``) of the
    first ``source`` value found in the chunks' metadata, if any. The chunks
    are then appended in order. Whenever a chunk's ``page`` metadata differs
    from the previously seen page, a ``# PAGE <page>`` header is inserted
    before its content. If the text built so far already ends with the first
    ``CHUNK_OVERLAP`` characters of a chunk, that prefix is skipped so the
    overlapping region is not duplicated.

    :param documents: Chunks to merge, in reading order. Each must expose a
        ``metadata`` mapping and a ``page_content`` string.
    :return: The merged text with leading and trailing whitespace stripped.
    """
    # Use the first chunk that carries a "source" entry to title the output;
    # fall back to an empty name when no chunk has one.
    first_source = next(
        (doc.metadata["source"] for doc in documents if "source" in doc.metadata),
        "",
    )
    doc_basename = first_source.split("/")[-1]

    processed_text = f"{doc_basename}\n"
    last_page: Optional[int] = None

    for doc in documents:
        current_page = doc.metadata.get("page")
        # Compare against None instead of relying on truthiness: 0 is a valid
        # page number (0-based numbering is common in PDF loaders -- confirm
        # against the loader in use) and must still get its own header.
        if current_page is not None and current_page != last_page:
            processed_text += f"\n# PAGE {current_page}\n\n"
            last_page = current_page

        new_content = doc.page_content
        # Presumably CHUNK_OVERLAP is the number of characters consecutive
        # chunks share (verify against the splitter configuration). When the
        # accumulated text already ends with that prefix, append only the
        # remainder.
        if processed_text.endswith(new_content[:CHUNK_OVERLAP]):
            processed_text += new_content[CHUNK_OVERLAP:]
        else:
            processed_text += new_content

    return processed_text.strip()