From fc05f1fb17965736c202a012c8e4edf64fc6befb Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 17:50:23 -0400 Subject: [PATCH 1/4] move some pdf integration tests to extended tests --- .../document_loaders/test_pdf.py | 33 ------------- .../unit_tests/document_loaders/test_pdf.py | 46 +++++++++++++++++++ 2 files changed, 46 insertions(+), 33 deletions(-) create mode 100644 libs/community/tests/unit_tests/document_loaders/test_pdf.py diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index 462e20d357904..50c9fde29d918 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -1,4 +1,3 @@ -import re from pathlib import Path from typing import Sequence, Union @@ -11,7 +10,6 @@ PDFMinerPDFasHTMLLoader, PyMuPDFLoader, PyPDFium2Loader, - PyPDFLoader, UnstructuredPDFLoader, ) @@ -86,37 +84,6 @@ def test_pdfminer_pdf_as_html_loader() -> None: assert len(docs) == 1 -def test_pypdf_loader() -> None: - """Test PyPDFLoader.""" - file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PyPDFLoader(str(file_path)) - docs = loader.load() - - assert len(docs) == 1 - - file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyPDFLoader(str(file_path)) - - docs = loader.load() - assert len(docs) == 16 - - -def test_pypdf_loader_with_layout() -> None: - """Test PyPDFLoader with layout mode.""" - file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyPDFLoader(str(file_path), extraction_mode="layout") - - docs = loader.load() - first_page = docs[0].page_content - - expected = ( - Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt" - ).read_text(encoding="utf-8") - cleaned_first_page = re.sub(r"\x00", "", first_page) - cleaned_expected = re.sub(r"\x00", "", expected) - assert cleaned_first_page == cleaned_expected - - def test_pypdfium2_loader() -> None: """Test PyPDFium2Loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py new file mode 100644 index 0000000000000..d62363723bd60 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/test_pdf.py @@ -0,0 +1,46 @@ +import re +from pathlib import Path + +import pytest + +from langchain_community.document_loaders import PyPDFLoader + +path_to_simple_pdf = ( + Path(__file__).parent.parent.parent / "integration_tests/examples/hello.pdf" +) +path_to_layout_pdf = ( + Path(__file__).parent.parent + / "document_loaders/sample_documents/layout-parser-paper.pdf" +) +path_to_layout_pdf_txt = ( + Path(__file__).parent.parent.parent + / "integration_tests/examples/layout-parser-paper-page-1.txt" +) + + +@pytest.mark.requires("pypdf") +def test_pypdf_loader() -> None: + """Test PyPDFLoader.""" + loader = PyPDFLoader(str(path_to_simple_pdf)) + docs = loader.load() + + assert len(docs) == 1 + + loader = PyPDFLoader(str(path_to_layout_pdf)) + + docs = loader.load() + assert len(docs) == 16 + + +@pytest.mark.requires("pypdf") +def test_pypdf_loader_with_layout() -> None: + """Test PyPDFLoader with layout mode.""" + loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout") + + docs = loader.load() + first_page = docs[0].page_content + + expected = path_to_layout_pdf_txt.read_text(encoding="utf-8") + cleaned_first_page = re.sub(r"\x00", "", first_page) + cleaned_expected = re.sub(r"\x00", "", expected) + assert cleaned_first_page == cleaned_expected From f7df5634460ffe7d568b0a354b5d73ccdf3394d9 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 19 Sep 2024 10:36:55 -0400 Subject: [PATCH 2/4] add to pypdf tests --- .../unit_tests/document_loaders/test_pdf.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py index d62363723bd60..ae7356ea4952e 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/unit_tests/document_loaders/test_pdf.py @@ -30,6 +30,14 @@ def test_pypdf_loader() -> None: docs = loader.load() assert len(docs) == 16 + for page, doc in enumerate(docs): + assert doc.metadata["page"] == page + assert doc.metadata["source"].endswith("layout-parser-paper.pdf") + assert len(doc.page_content) > 10 + + first_page = docs[0].page_content + for expected in ["LayoutParser", "A Unified Toolkit"]: + assert expected in first_page @pytest.mark.requires("pypdf") @@ -38,7 +46,15 @@ def test_pypdf_loader_with_layout() -> None: loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout") docs = loader.load() + assert len(docs) == 16 + for page, doc in enumerate(docs): + assert doc.metadata["page"] == page + assert doc.metadata["source"].endswith("layout-parser-paper.pdf") + assert len(doc.page_content) > 10 + first_page = docs[0].page_content + for expected in ["LayoutParser", "A Unified Toolkit"]: + assert expected in first_page expected = path_to_layout_pdf_txt.read_text(encoding="utf-8") cleaned_first_page = re.sub(r"\x00", "", first_page) From d7efdd469993ba96a11ee071950e4bc0417ef6c8 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 19 Sep 2024 10:37:33 -0400 Subject: [PATCH 3/4] temporarily raise error to check that tests run in extended tests --- libs/community/tests/unit_tests/document_loaders/test_pdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py index ae7356ea4952e..c66dd4441a82e 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/unit_tests/document_loaders/test_pdf.py @@ -21,6 +21,7 @@ @pytest.mark.requires("pypdf") def test_pypdf_loader() -> None: """Test PyPDFLoader.""" + raise AssertionError("Testing!") loader = PyPDFLoader(str(path_to_simple_pdf)) docs = loader.load() From 74c58a7a6973718e381af8881851b7a0533207a7 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 19 Sep 2024 10:42:43 -0400 Subject: [PATCH 4/4] Revert "temporarily raise error to check that tests run in extended tests" This reverts commit d7efdd469993ba96a11ee071950e4bc0417ef6c8. --- libs/community/tests/unit_tests/document_loaders/test_pdf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py index c66dd4441a82e..ae7356ea4952e 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/unit_tests/document_loaders/test_pdf.py @@ -21,7 +21,6 @@ @pytest.mark.requires("pypdf") def test_pypdf_loader() -> None: """Test PyPDFLoader.""" - raise AssertionError("Testing!") loader = PyPDFLoader(str(path_to_simple_pdf)) docs = loader.load()