diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index 462e20d357904..50c9fde29d918 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -1,4 +1,3 @@ -import re from pathlib import Path from typing import Sequence, Union @@ -11,7 +10,6 @@ PDFMinerPDFasHTMLLoader, PyMuPDFLoader, PyPDFium2Loader, - PyPDFLoader, UnstructuredPDFLoader, ) @@ -86,37 +84,6 @@ def test_pdfminer_pdf_as_html_loader() -> None: assert len(docs) == 1 -def test_pypdf_loader() -> None: - """Test PyPDFLoader.""" - file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PyPDFLoader(str(file_path)) - docs = loader.load() - - assert len(docs) == 1 - - file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyPDFLoader(str(file_path)) - - docs = loader.load() - assert len(docs) == 16 - - -def test_pypdf_loader_with_layout() -> None: - """Test PyPDFLoader with layout mode.""" - file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyPDFLoader(str(file_path), extraction_mode="layout") - - docs = loader.load() - first_page = docs[0].page_content - - expected = ( - Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt" - ).read_text(encoding="utf-8") - cleaned_first_page = re.sub(r"\x00", "", first_page) - cleaned_expected = re.sub(r"\x00", "", expected) - assert cleaned_first_page == cleaned_expected - - def test_pypdfium2_loader() -> None: """Test PyPDFium2Loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py new file mode 100644 index 0000000000000..ae7356ea4952e --- /dev/null +++ 
b/libs/community/tests/unit_tests/document_loaders/test_pdf.py
@@ -0,0 +1,70 @@
+import re
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from langchain_community.document_loaders import PyPDFLoader
+
+# PDF that is expected to load as a single document.
+path_to_simple_pdf = (
+    Path(__file__).parent.parent.parent / "integration_tests/examples/hello.pdf"
+)
+# Paper that is expected to load as 16 documents, one per page.
+path_to_layout_pdf = (
+    Path(__file__).parent.parent
+    / "document_loaders/sample_documents/layout-parser-paper.pdf"
+)
+# Reference text compared against page one of the paper in layout mode.
+path_to_layout_pdf_txt = (
+    Path(__file__).parent.parent.parent
+    / "integration_tests/examples/layout-parser-paper-page-1.txt"
+)
+
+
+def _load_layout_paper_first_page(**loader_kwargs: Any) -> str:
+    """Load the layout-parser paper and return the text of its first page.
+
+    Every check that does not depend on the extraction mode lives here so the
+    tests below cannot drift apart: page count, per-page metadata, non-trivial
+    page content and the title fragments on page one.
+
+    Args:
+        **loader_kwargs: Extra keyword arguments forwarded to ``PyPDFLoader``.
+
+    Returns:
+        The ``page_content`` of the first loaded document.
+    """
+    docs = PyPDFLoader(str(path_to_layout_pdf), **loader_kwargs).load()
+
+    assert len(docs) == 16
+    for page, doc in enumerate(docs):
+        assert doc.metadata["page"] == page
+        assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
+        assert len(doc.page_content) > 10
+
+    first_page = docs[0].page_content
+    for snippet in ("LayoutParser", "A Unified Toolkit"):
+        assert snippet in first_page
+    return first_page
+
+
+@pytest.mark.requires("pypdf")
+def test_pypdf_loader() -> None:
+    """Test PyPDFLoader without extra loader arguments."""
+    docs = PyPDFLoader(str(path_to_simple_pdf)).load()
+    assert len(docs) == 1
+
+    _load_layout_paper_first_page()
+
+
+@pytest.mark.requires("pypdf")
+def test_pypdf_loader_with_layout() -> None:
+    """Test PyPDFLoader with layout mode."""
+    first_page = _load_layout_paper_first_page(extraction_mode="layout")
+
+    expected_text = path_to_layout_pdf_txt.read_text(encoding="utf-8")
+    # NUL characters are stripped from both sides before the exact comparison.
+    cleaned_first_page = re.sub(r"\x00", "", first_page)
+    cleaned_expected = re.sub(r"\x00", "", expected_text)
+    assert cleaned_first_page == cleaned_expected