Skip to content

Commit

Permalink
unstructured[patch]: add to integration tests (#26666)
Browse files Browse the repository at this point in the history
- Add to tests on parsed content;
- Add tests for async + lazy loading;
- Add a test for `strategy="hi_res"`.
  • Loading branch information
ccurme committed Sep 19, 2024
1 parent 28dd656 commit 7d49ee9
Showing 1 changed file with 99 additions and 9 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os
from pathlib import Path
from typing import Callable
from typing import Callable, List

import pytest
from langchain_core.documents import Document

from langchain_unstructured import UnstructuredLoader

Expand All @@ -13,6 +14,51 @@
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")


def _check_docs_content(docs: List[Document]) -> None:
    """Assert that ``docs`` match the expected parse of the sample PDF.

    The checks, in order: every document reports the sample filename, there
    are exactly 16 page-break elements, every document with content carries
    the expected metadata keys (content-less documents must be page breaks),
    page numbers 1 through 16 are all present, at least 32 documents were
    produced, the paper title appears in the page-1 text, and both
    ``NarrativeText`` and ``Title`` categories occur.

    Args:
        docs: Documents produced by the loader under test.

    Raises:
        AssertionError: If any of the checks above fails.
    """
    for doc in docs:
        assert doc.metadata.get("filename") == "layout-parser-paper.pdf"

    page_breaks = [
        doc for doc in docs if doc.metadata.get("category") == "PageBreak"
    ]
    assert len(page_breaks) == 16  # 16 page doc

    required_keys = (
        "source",
        "languages",
        "page_number",
        "category",
        "coordinates",
        "element_id",
    )
    for doc in docs:
        if not doc.page_content:
            # Only page breaks are allowed to be empty.
            assert doc.metadata.get("category") == "PageBreak"
            continue
        for required in required_keys:
            assert required in doc.metadata

    # Falsy page numbers (missing / None) are ignored.
    seen_pages = {
        number for doc in docs if (number := doc.metadata.get("page_number"))
    }
    assert seen_pages == set(range(1, 17))
    assert len(docs) >= 32  # (16 pages * (>=1 element per page) + 16 page breaks)

    first_page_text = "".join(
        f" {doc.page_content}"
        for doc in docs
        if doc.metadata.get("page_number") == 1
    )
    title = (
        "LayoutParser: A Unified Toolkit for Deep Learning "
        "Based Document Image Analysis"
    )
    assert title in first_page_text

    found_categories = {doc.metadata.get("category") for doc in docs}
    assert "NarrativeText" in found_categories
    assert "Title" in found_categories


# -- Local partition --


Expand All @@ -27,10 +73,24 @@ def test_loader_partitions_locally() -> None:
include_page_breaks=True,
).load()

assert all(
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
_check_docs_content(docs)


@pytest.mark.local
async def test_loader_partitions_locally_async_lazy() -> None:
    """Check the content yielded by ``alazy_load`` for the sample PDF.

    Builds an ``UnstructuredLoader`` for ``layout-parser-paper.pdf`` with
    ``strategy="fast"`` and page breaks enabled, collects every document
    yielded by ``alazy_load()``, and validates them with
    ``_check_docs_content``.
    """
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    loader = UnstructuredLoader(
        file_path=file_path,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    )
    # Drain the async iterator before asserting; nothing may touch `docs`
    # before this point.
    docs = [doc async for doc in loader.alazy_load()]

    _check_docs_content(docs)


@pytest.mark.local
Expand Down Expand Up @@ -79,16 +139,30 @@ def test_loader_partitions_via_api() -> None:
# Unstructured kwargs
strategy="fast",
include_page_breaks=True,
coordinates=True,
)

docs = loader.load()

assert len(docs) > 1
assert any(doc.metadata.get("category") == "PageBreak" for doc in docs)
assert all(
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
_check_docs_content(docs)


async def test_loader_partitions_via_api_async_lazy() -> None:
    """Check the content yielded by ``alazy_load`` when partitioning via API.

    Builds an ``UnstructuredLoader`` for ``layout-parser-paper.pdf`` with
    ``partition_via_api=True``, ``strategy="fast"``, page breaks and
    coordinates enabled, collects every document yielded by
    ``alazy_load()``, and validates them with ``_check_docs_content``.
    """
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=file_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
        coordinates=True,
    )

    # Drain the async iterator before asserting; nothing may touch `docs`
    # before this point.
    docs = [doc async for doc in loader.alazy_load()]

    _check_docs_content(docs)


def test_loader_partitions_multiple_via_api() -> None:
Expand Down Expand Up @@ -124,6 +198,22 @@ def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
loader.load()


def test_loader_partitions_via_api_hi_res() -> None:
    """Check that ``strategy="hi_res"`` via the API yields tables and images.

    Loads ``layout-parser-paper.pdf`` with ``partition_via_api=True`` and
    asserts that both the ``Table`` and ``Image`` categories appear in the
    resulting documents' metadata.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    docs = UnstructuredLoader(
        file_path=pdf_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="hi_res",
    ).load()

    found_categories = {doc.metadata.get("category") for doc in docs}
    for expected in ("Table", "Image"):
        assert expected in found_categories


# -- fixtures ---


Expand Down

0 comments on commit 7d49ee9

Please sign in to comment.