Skip to content

Commit

Permalink
[Dataprep] Reduce Upload File Time Consumption (#744)
Browse files Browse the repository at this point in the history
* reduce upload file time consumption

Signed-off-by: letonghan <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
letonghan and pre-commit-ci[bot] committed Sep 27, 2024
1 parent ad8bd4f commit 7134899
Show file tree
Hide file tree
Showing 10 changed files with 44 additions and 33 deletions.
1 change: 1 addition & 0 deletions comps/dataprep/milvus/langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Pillow
prometheus-fastapi-instrumentator
pymupdf
pyspark
pytesseract
python-docx
python-pptx
sentence_transformers
Expand Down
2 changes: 1 addition & 1 deletion comps/dataprep/neo4j/langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ pandas
Pillow
prometheus-fastapi-instrumentator
pymupdf
pytesseract
python-docx
python-pptx
sentence_transformers
shortuuid
unstructured[all-docs]==0.15.7
uvicorn

2 changes: 1 addition & 1 deletion comps/dataprep/pgvector/langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ prometheus-fastapi-instrumentator
psycopg2-binary
pymupdf
pyspark
pytesseract
python-docx
python-multipart
python-pptx
Expand All @@ -28,4 +29,3 @@ shortuuid
tiktoken
unstructured[all-docs]==0.15.7
uvicorn

1 change: 1 addition & 0 deletions comps/dataprep/pinecone/langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pinecone-client
prometheus-fastapi-instrumentator
pymupdf
pyspark
pytesseract
python-bidi==0.4.2
python-docx
python-pptx
Expand Down
1 change: 1 addition & 0 deletions comps/dataprep/qdrant/langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pandas
Pillow
prometheus-fastapi-instrumentator
pymupdf
pytesseract
python-docx
python-pptx
qdrant-client
Expand Down
1 change: 1 addition & 0 deletions comps/dataprep/redis/langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Pillow
prometheus-fastapi-instrumentator
pymupdf
pyspark
pytesseract
python-bidi
python-docx
python-pptx
Expand Down
1 change: 1 addition & 0 deletions comps/dataprep/redis/langchain_ray/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Pillow
prometheus-fastapi-instrumentator
pyarrow
pymupdf
pytesseract
python-bidi==0.4.2
python-docx
python-multipart
Expand Down
1 change: 1 addition & 0 deletions comps/dataprep/redis/llama_index/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
pytesseract
python-bidi==0.4.2
python-multipart
redis
Expand Down
66 changes: 35 additions & 31 deletions comps/dataprep/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,23 @@
import timeit
import unicodedata
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Union
from urllib.parse import urlparse, urlunparse

import cairosvg
import cv2
import docx
import docx2txt
import easyocr
import fitz
import numpy as np
import pandas as pd
import pptx
import pytesseract
import requests
import yaml
from bs4 import BeautifulSoup
from docx import Document as DDocument
from langchain import LLMChain, PromptTemplate
from langchain_community.document_loaders import (
UnstructuredHTMLLoader,
Expand All @@ -40,7 +41,6 @@
UnstructuredXMLLoader,
)
from langchain_community.llms import HuggingFaceEndpoint
from PIL import Image

from comps import CustomLogger

Expand Down Expand Up @@ -112,36 +112,40 @@ def get_separators():
return separators


def process_page(doc, idx):
    """Extract the text of one PDF page, including OCR text from its images.

    The page's own text layer is read first; every image listed for the page
    is then decoded with OpenCV and run through Tesseract (English, page
    segmentation mode 6). Each non-empty piece of text is terminated with a
    period unless it already ends in ``!``, ``?`` or ``.``, and the pieces are
    concatenated without a separator.

    Args:
        doc: An opened document exposing ``load_page``, ``get_page_images``
            and ``extract_image`` (the caller in this file passes the result
            of ``fitz.open``).
        idx: Zero-based index of the page to process.

    Returns:
        The page text followed by the OCR text of its images. Blank pages and
        images without recognisable text contribute nothing, so the result may
        be an empty string.
    """
    terminators = ("!", "?", ".")
    parts = []

    page = doc.load_page(idx)
    pagetext = page.get_text().strip()
    # Only terminate real text: a blank page must not turn into a lone ".".
    if pagetext:
        parts.append(pagetext if pagetext.endswith(terminators) else pagetext + ".")

    for img in doc.get_page_images(idx):
        xref = img[0]
        img_bytes = doc.extract_image(xref)["image"]

        # cv2.imdecode returns None when it cannot decode the buffer; skip
        # such images instead of letting the OCR call fail on None.
        img_array = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
        if img_array is None:
            continue

        # OpenCV decodes to BGR channel order while pytesseract assumes RGB.
        img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
        pageimg = pytesseract.image_to_string(img_array, lang="eng", config="--psm 6").strip()

        # Same rule as for the page text: no stray "." for text-free images.
        if pageimg:
            parts.append(pageimg if pageimg.endswith(terminators) else pageimg + ".")

    return "".join(parts)


def load_pdf(pdf_path):
    """Load a PDF file and return its text, including OCR text from images.

    Pages are handed to ``process_page`` on a small thread pool, and the
    per-page results are concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A single string with the text of all pages joined without a
        separator; an empty string for a document with no pages.
    """
    # The context manager closes the document once all pages are processed,
    # including when a worker raises.
    with fitz.open(pdf_path) as doc:
        page_count = doc.page_count

        # NOTE(review): every worker shares the same document object. PyMuPDF's
        # documentation advises against multithreaded use - confirm this is
        # safe here, or restrict the threaded part to the OCR step.
        with ThreadPoolExecutor(max_workers=4) as executor:
            # executor.map yields results in submission order, so the pages
            # stay in document order regardless of which worker finishes
            # first (as_completed would interleave them).
            results = list(executor.map(lambda i: process_page(doc, i), range(page_count)))

    return "".join(results)


def load_html(html_path):
Expand Down
1 change: 1 addition & 0 deletions comps/dataprep/vdms/langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Pillow
prometheus-fastapi-instrumentator
pymupdf
pyspark
pytesseract
python-bidi==0.4.2
python-docx
python-pptx
Expand Down

0 comments on commit 7134899

Please sign in to comment.