From 7d49ee9741416c7be94f7eb85a788d17e0ecdd80 Mon Sep 17 00:00:00 2001 From: ccurme Date: Thu, 19 Sep 2024 13:43:34 -0400 Subject: [PATCH] unstructured[patch]: add to integration tests (#26666) - Add to tests on parsed content; - Add tests for async + lazy loading; - Add a test for `strategy="hi_res"`. --- .../test_document_loaders.py | 108 ++++++++++++++++-- 1 file changed, 99 insertions(+), 9 deletions(-) diff --git a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py index f4fa4e7f9aab8..f27ddf718670e 100644 --- a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py +++ b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py @@ -1,8 +1,9 @@ import os from pathlib import Path -from typing import Callable +from typing import Callable, List import pytest +from langchain_core.documents import Document from langchain_unstructured import UnstructuredLoader @@ -13,6 +14,51 @@ UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") +def _check_docs_content(docs: List[Document]) -> None: + assert all( + doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs + ) + assert ( + sum(doc.metadata.get("category") == "PageBreak" for doc in docs) == 16 + ) # 16 page doc + + expected_metadata_keys = [ + "source", + "languages", + "page_number", + "category", + "coordinates", + "element_id", + ] + for doc in docs: + if doc.page_content: + for key in expected_metadata_keys: + assert key in doc.metadata + else: + assert doc.metadata.get("category") == "PageBreak" + + page_numbers = [] + for doc in docs: + if page_number := doc.metadata.get("page_number"): + page_numbers.append(page_number) + + assert set(page_numbers) == set(range(1, 17)) + assert len(docs) >= 32 # (16 pages * (>=1 element per page) + 16 page breaks) + + page_1_content = "" + for doc in docs: + if doc.metadata.get("page_number") == 1: + page_1_content 
+= f" {doc.page_content}" + assert ( + "LayoutParser: A Uniﬁed Toolkit for Deep Learning " + "Based Document Image Analysis" + ) in page_1_content + + categories = set(doc.metadata.get("category") for doc in docs) + assert "NarrativeText" in categories + assert "Title" in categories + + # -- Local partition -- @@ -27,10 +73,24 @@ def test_loader_partitions_locally() -> None: include_page_breaks=True, ).load() - assert all( - doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs + _check_docs_content(docs) + + +@pytest.mark.local +async def test_loader_partitions_locally_async_lazy() -> None: + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + + loader = UnstructuredLoader( + file_path=file_path, + # Unstructured kwargs + strategy="fast", + include_page_breaks=True, ) - assert any(doc.metadata.get("category") == "PageBreak" for doc in docs) + docs = [] + async for doc in loader.alazy_load(): + docs.append(doc) + + _check_docs_content(docs) @pytest.mark.local @@ -79,16 +139,30 @@ def test_loader_partitions_via_api() -> None: # Unstructured kwargs strategy="fast", include_page_breaks=True, + coordinates=True, ) docs = loader.load() - assert len(docs) > 1 - assert any(doc.metadata.get("category") == "PageBreak" for doc in docs) - assert all( - doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs + _check_docs_content(docs) + + +async def test_loader_partitions_via_api_async_lazy() -> None: + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + loader = UnstructuredLoader( + file_path=file_path, + partition_via_api=True, + # Unstructured kwargs + strategy="fast", + include_page_breaks=True, + coordinates=True, ) - assert docs[0].metadata.get("element_id") is not None + + docs = [] + async for doc in loader.alazy_load(): + docs.append(doc) + + _check_docs_content(docs) def test_loader_partitions_multiple_via_api() -> None: @@ -124,6 +198,22 @@ def 
test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None: loader.load() +def test_loader_partitions_via_api_hi_res() -> None: + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + loader = UnstructuredLoader( + file_path=file_path, + partition_via_api=True, + # Unstructured kwargs + strategy="hi_res", + ) + + docs = loader.load() + + categories = set(doc.metadata.get("category") for doc in docs) + assert "Table" in categories + assert "Image" in categories + + # -- fixtures ---