diff --git a/comps/dataprep/milvus/langchain/requirements.txt b/comps/dataprep/milvus/langchain/requirements.txt
index 53e2e73ca..85ba3e972 100644
--- a/comps/dataprep/milvus/langchain/requirements.txt
+++ b/comps/dataprep/milvus/langchain/requirements.txt
@@ -20,6 +20,7 @@ Pillow
 prometheus-fastapi-instrumentator
 pymupdf
 pyspark
+pytesseract
 python-docx
 python-pptx
 sentence_transformers
diff --git a/comps/dataprep/neo4j/langchain/requirements.txt b/comps/dataprep/neo4j/langchain/requirements.txt
index b8326a623..f0f825b31 100644
--- a/comps/dataprep/neo4j/langchain/requirements.txt
+++ b/comps/dataprep/neo4j/langchain/requirements.txt
@@ -22,10 +22,10 @@ pandas
 Pillow
 prometheus-fastapi-instrumentator
 pymupdf
+pytesseract
 python-docx
 python-pptx
 sentence_transformers
 shortuuid
 unstructured[all-docs]==0.15.7
 uvicorn
-
diff --git a/comps/dataprep/pgvector/langchain/requirements.txt b/comps/dataprep/pgvector/langchain/requirements.txt
index e89c3947a..5235cd5ff 100644
--- a/comps/dataprep/pgvector/langchain/requirements.txt
+++ b/comps/dataprep/pgvector/langchain/requirements.txt
@@ -20,6 +20,7 @@ prometheus-fastapi-instrumentator
 psycopg2-binary
 pymupdf
 pyspark
+pytesseract
 python-docx
 python-multipart
 python-pptx
@@ -28,4 +29,3 @@ shortuuid
 tiktoken
 unstructured[all-docs]==0.15.7
 uvicorn
-
diff --git a/comps/dataprep/pinecone/langchain/requirements.txt b/comps/dataprep/pinecone/langchain/requirements.txt
index a48c6343f..80f81bd5e 100644
--- a/comps/dataprep/pinecone/langchain/requirements.txt
+++ b/comps/dataprep/pinecone/langchain/requirements.txt
@@ -21,6 +21,7 @@ pinecone-client
 prometheus-fastapi-instrumentator
 pymupdf
 pyspark
+pytesseract
 python-bidi==0.4.2
 python-docx
 python-pptx
diff --git a/comps/dataprep/qdrant/langchain/requirements.txt b/comps/dataprep/qdrant/langchain/requirements.txt
index 0c371dd15..f505af163 100644
--- a/comps/dataprep/qdrant/langchain/requirements.txt
+++ b/comps/dataprep/qdrant/langchain/requirements.txt
@@ -18,6 +18,7 @@ pandas
 Pillow
 prometheus-fastapi-instrumentator
 pymupdf
+pytesseract
 python-docx
 python-pptx
 qdrant-client
diff --git a/comps/dataprep/redis/langchain/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt
index 2bea55be8..8c3b116fa 100644
--- a/comps/dataprep/redis/langchain/requirements.txt
+++ b/comps/dataprep/redis/langchain/requirements.txt
@@ -19,6 +19,7 @@ Pillow
 prometheus-fastapi-instrumentator
 pymupdf
 pyspark
+pytesseract
 python-bidi
 python-docx
 python-pptx
diff --git a/comps/dataprep/redis/langchain_ray/requirements.txt b/comps/dataprep/redis/langchain_ray/requirements.txt
index 2ac816fc2..0237109e7 100644
--- a/comps/dataprep/redis/langchain_ray/requirements.txt
+++ b/comps/dataprep/redis/langchain_ray/requirements.txt
@@ -16,6 +16,7 @@ Pillow
 prometheus-fastapi-instrumentator
 pyarrow
 pymupdf
+pytesseract
 python-bidi==0.4.2
 python-docx
 python-multipart
diff --git a/comps/dataprep/redis/llama_index/requirements.txt b/comps/dataprep/redis/llama_index/requirements.txt
index 2f808e534..46640180e 100644
--- a/comps/dataprep/redis/llama_index/requirements.txt
+++ b/comps/dataprep/redis/llama_index/requirements.txt
@@ -10,6 +10,7 @@ opentelemetry-api
 opentelemetry-exporter-otlp
 opentelemetry-sdk
 prometheus-fastapi-instrumentator
+pytesseract
 python-bidi==0.4.2
 python-multipart
 redis
diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index f48d97157..9285b893c 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -16,22 +16,23 @@
 import timeit
 import unicodedata
 import urllib.parse
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Dict, List, Union
 from urllib.parse import urlparse, urlunparse
 
 import cairosvg
+import cv2
 import docx
 import docx2txt
-import easyocr
 import fitz
 import numpy as np
 import pandas as pd
 import pptx
+import pytesseract
 import requests
 import yaml
 from bs4 import BeautifulSoup
-from docx import Document as DDocument
 from langchain import LLMChain, PromptTemplate
 from langchain_community.document_loaders import (
     UnstructuredHTMLLoader,
@@ -40,7 +41,6 @@
     UnstructuredXMLLoader,
 )
 from langchain_community.llms import HuggingFaceEndpoint
-from PIL import Image
 
 from comps import CustomLogger
 
@@ -112,36 +112,61 @@ def get_separators():
     return separators
 
 
+def _end_sentence(text):
+    """Return ``text`` ending in ".", "!" or "?"; empty text is returned unchanged."""
+    if not text or text.endswith(("!", "?", ".")):
+        return text
+    return text + "."
+
+
+def _ocr_image(img_bytes):
+    """Run Tesseract OCR on one encoded image and return the recognized text.
+
+    Only OpenCV and pytesseract are used here (no PyMuPDF objects), which is
+    what lets ``load_pdf`` run this function on worker threads.
+    """
+    img_array = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
+    if img_array is None:
+        # cv2.imdecode returns None for data it cannot decode; skip that image
+        # rather than failing the whole document.
+        return ""
+    img_result = pytesseract.image_to_string(img_array, lang="eng", config="--psm 6")
+    return _end_sentence(img_result.strip())
+
+
+def _extract_page(doc, idx):
+    """Return ``(text, images)`` for page ``idx``: its text and the raw bytes of its images."""
+    pagetext = _end_sentence(doc.load_page(idx).get_text().strip())
+    images = [doc.extract_image(img[0])["image"] for img in doc.get_page_images(idx)]
+    return pagetext, images
+
+
+def process_page(doc, idx):
+    """Return the text of page ``idx`` followed by the OCR text of each of its images.
+
+    All work, including OCR, happens on the calling thread.
+    """
+    pagetext, images = _extract_page(doc, idx)
+    return pagetext + "".join(_ocr_image(img_bytes) for img_bytes in images)
+
+
 def load_pdf(pdf_path):
-    """Load the pdf file."""
-    doc = fitz.open(pdf_path)
-    reader = easyocr.Reader(["en"], gpu=False)
-    result = ""
-    for i in range(doc.page_count):
-        page = doc.load_page(i)
-        pagetext = page.get_text().strip()
-        if pagetext:
-            if pagetext.endswith("!") or pagetext.endswith("?") or pagetext.endswith("."):
-                result = result + pagetext
-            else:
-                result = result + pagetext + "."
-        if len(doc.get_page_images(i)) > 0:
-            for img in doc.get_page_images(i):
-                if img:
-                    pageimg = ""
-                    xref = img[0]
-                    img_data = doc.extract_image(xref)
-                    img_bytes = img_data["image"]
-                    pil_image = Image.open(io.BytesIO(img_bytes))
-                    img = np.array(pil_image)
-                    img_result = reader.readtext(img, paragraph=True, detail=0)
-                    pageimg = pageimg + ", ".join(img_result).strip()
-                    if pageimg.endswith("!") or pageimg.endswith("?") or pageimg.endswith("."):
-                        pass
-                    else:
-                        pageimg = pageimg + "."
-                result = result + pageimg
-    return result
+    """Load a PDF and return its text in page order, with OCR text of embedded images.
+
+    For every page the extracted text comes first, followed by the OCR text of
+    each image on that page. PyMuPDF is only called from this thread, because it
+    does not support use from several threads; only the Tesseract OCR calls are
+    handed to a pool of 4 worker threads.
+    """
+    pieces = []  # page text (str) and OCR futures, kept in document order
+    with fitz.open(pdf_path) as doc, ThreadPoolExecutor(max_workers=4) as executor:
+        for idx in range(doc.page_count):
+            pagetext, images = _extract_page(doc, idx)
+            pieces.append(pagetext)
+            pieces.extend(executor.submit(_ocr_image, img_bytes) for img_bytes in images)
+    # The executor has shut down by now, so every future is finished and
+    # result() only returns the text or re-raises the OCR error.
+    return "".join(piece if isinstance(piece, str) else piece.result() for piece in pieces)
 
 
 def load_html(html_path):
diff --git a/comps/dataprep/vdms/langchain/requirements.txt b/comps/dataprep/vdms/langchain/requirements.txt
index f6044266c..88b2c033a 100644
--- a/comps/dataprep/vdms/langchain/requirements.txt
+++ b/comps/dataprep/vdms/langchain/requirements.txt
@@ -23,6 +23,7 @@ Pillow
 prometheus-fastapi-instrumentator
 pymupdf
 pyspark
+pytesseract
 python-bidi==0.4.2
 python-docx
 python-pptx