diff --git a/comps/dataprep/milvus/langchain/requirements.txt b/comps/dataprep/milvus/langchain/requirements.txt index 53e2e73ca..85ba3e972 100644 --- a/comps/dataprep/milvus/langchain/requirements.txt +++ b/comps/dataprep/milvus/langchain/requirements.txt @@ -20,6 +20,7 @@ Pillow prometheus-fastapi-instrumentator pymupdf pyspark +pytesseract python-docx python-pptx sentence_transformers diff --git a/comps/dataprep/neo4j/langchain/requirements.txt b/comps/dataprep/neo4j/langchain/requirements.txt index b8326a623..f0f825b31 100644 --- a/comps/dataprep/neo4j/langchain/requirements.txt +++ b/comps/dataprep/neo4j/langchain/requirements.txt @@ -22,10 +22,10 @@ pandas Pillow prometheus-fastapi-instrumentator pymupdf +pytesseract python-docx python-pptx sentence_transformers shortuuid unstructured[all-docs]==0.15.7 uvicorn - diff --git a/comps/dataprep/pgvector/langchain/requirements.txt b/comps/dataprep/pgvector/langchain/requirements.txt index e89c3947a..5235cd5ff 100644 --- a/comps/dataprep/pgvector/langchain/requirements.txt +++ b/comps/dataprep/pgvector/langchain/requirements.txt @@ -20,6 +20,7 @@ prometheus-fastapi-instrumentator psycopg2-binary pymupdf pyspark +pytesseract python-docx python-multipart python-pptx @@ -28,4 +29,3 @@ shortuuid tiktoken unstructured[all-docs]==0.15.7 uvicorn - diff --git a/comps/dataprep/pinecone/langchain/requirements.txt b/comps/dataprep/pinecone/langchain/requirements.txt index a48c6343f..80f81bd5e 100644 --- a/comps/dataprep/pinecone/langchain/requirements.txt +++ b/comps/dataprep/pinecone/langchain/requirements.txt @@ -21,6 +21,7 @@ pinecone-client prometheus-fastapi-instrumentator pymupdf pyspark +pytesseract python-bidi==0.4.2 python-docx python-pptx diff --git a/comps/dataprep/qdrant/langchain/requirements.txt b/comps/dataprep/qdrant/langchain/requirements.txt index 0c371dd15..f505af163 100644 --- a/comps/dataprep/qdrant/langchain/requirements.txt +++ b/comps/dataprep/qdrant/langchain/requirements.txt @@ -18,6 +18,7 @@ pandas 
Pillow prometheus-fastapi-instrumentator pymupdf +pytesseract python-docx python-pptx qdrant-client diff --git a/comps/dataprep/redis/langchain/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt index 2bea55be8..8c3b116fa 100644 --- a/comps/dataprep/redis/langchain/requirements.txt +++ b/comps/dataprep/redis/langchain/requirements.txt @@ -19,6 +19,7 @@ Pillow prometheus-fastapi-instrumentator pymupdf pyspark +pytesseract python-bidi python-docx python-pptx diff --git a/comps/dataprep/redis/langchain_ray/requirements.txt b/comps/dataprep/redis/langchain_ray/requirements.txt index 2ac816fc2..0237109e7 100644 --- a/comps/dataprep/redis/langchain_ray/requirements.txt +++ b/comps/dataprep/redis/langchain_ray/requirements.txt @@ -16,6 +16,7 @@ Pillow prometheus-fastapi-instrumentator pyarrow pymupdf +pytesseract python-bidi==0.4.2 python-docx python-multipart diff --git a/comps/dataprep/redis/llama_index/requirements.txt b/comps/dataprep/redis/llama_index/requirements.txt index 2f808e534..46640180e 100644 --- a/comps/dataprep/redis/llama_index/requirements.txt +++ b/comps/dataprep/redis/llama_index/requirements.txt @@ -10,6 +10,7 @@ opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk prometheus-fastapi-instrumentator +pytesseract python-bidi==0.4.2 python-multipart redis diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py index f48d97157..9285b893c 100644 --- a/comps/dataprep/utils.py +++ b/comps/dataprep/utils.py @@ -16,22 +16,23 @@ import timeit import unicodedata import urllib.parse +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Dict, List, Union from urllib.parse import urlparse, urlunparse import cairosvg +import cv2 import docx import docx2txt -import easyocr import fitz import numpy as np import pandas as pd import pptx +import pytesseract import requests import yaml from bs4 import BeautifulSoup -from docx import Document as DDocument from langchain import 
_SENTENCE_ENDINGS = ("!", "?", ".")
_OCR_WORKERS = 4


def _terminate(text: str) -> str:
    """Strip ``text`` and make sure it ends with sentence punctuation.

    Empty (or whitespace-only) input is returned as ``""`` so that blank pages
    and images without recognizable text do not add a stray ``"."``.
    """
    text = text.strip()
    if not text or text.endswith(_SENTENCE_ENDINGS):
        return text
    return text + "."


def _ocr_image(img_bytes: bytes) -> str:
    """OCR one encoded image and return its punctuation-terminated text.

    This helper touches only OpenCV and Tesseract (never the PyMuPDF document),
    which is what allows ``load_pdf`` to run it on worker threads.
    """
    img_array = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
    if img_array is None:
        # imdecode reports undecodable/unsupported data by returning None;
        # skip the image rather than handing None to pytesseract.
        return ""
    return _terminate(pytesseract.image_to_string(img_array, lang="eng", config="--psm 6"))


def _read_page(doc, idx: int):
    """Return ``(page_text, [image_bytes, ...])`` for page ``idx``.

    All PyMuPDF access for a page is concentrated here so callers can keep it
    on a single thread.
    """
    text = _terminate(doc.load_page(idx).get_text())
    # img[0] is the image xref, as used by the original extraction code.
    images = [doc.extract_image(img[0])["image"] for img in doc.get_page_images(idx)]
    return text, images


def process_page(doc, idx: int) -> str:
    """Extract the text of page ``idx`` followed by OCR text of its images.

    Args:
        doc: An open PyMuPDF document.
        idx: Zero-based page index.

    Returns:
        The stripped page text and the OCR text of every image on the page,
        concatenated, each non-empty piece ending in ``!``, ``?`` or ``.``.
        Returns ``""`` when the page has neither text nor readable images.
    """
    text, images = _read_page(doc, idx)
    return text + "".join(_ocr_image(data) for data in images)


def load_pdf(pdf_path):
    """Load the pdf file.

    Page text is read sequentially on the calling thread, because PyMuPDF
    documents that it does not support multithreaded use. Only the OCR of
    embedded images is dispatched to a small thread pool. Results are joined
    in document order (page text first, then that page's images), and the
    document is closed on exit, including on error.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages and embedded images as a single string.
    """
    parts = []
    # Exit order: the executor is shut down first, then the document is closed.
    with fitz.open(pdf_path) as doc, ThreadPoolExecutor(max_workers=_OCR_WORKERS) as executor:
        pending = []
        for idx in range(doc.page_count):
            text, images = _read_page(doc, idx)
            pending.append((text, [executor.submit(_ocr_image, data) for data in images]))

        # Consume futures in submission order, not completion order, so the
        # output follows the page order of the document.
        for text, futures in pending:
            parts.append(text)
            parts.extend(future.result() for future in futures)
    return "".join(parts)