Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

unstructured[patch]: add to integration tests #26666

Merged
merged 4 commits into from
Sep 19, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os
from pathlib import Path
from typing import Callable
from typing import Callable, List

import pytest
from langchain_core.documents import Document

from langchain_unstructured import UnstructuredLoader

Expand All @@ -13,6 +14,51 @@
# Value of the UNSTRUCTURED_API_KEY environment variable; None when it is unset.
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")


def _check_docs_content(docs: List[Document]) -> None:
    """Assert that ``docs`` look like the partitioned ``layout-parser-paper.pdf``.

    The checks run in this order:

    1. every document's ``filename`` metadata is ``layout-parser-paper.pdf``;
    2. exactly 16 documents have category ``PageBreak`` (16 page doc);
    3. every document with content carries the expected metadata keys, and
       every document without content is a ``PageBreak``;
    4. the truthy ``page_number`` values cover exactly pages 1 through 16;
    5. there are at least 32 documents;
    6. the text collected from page 1 contains the paper title;
    7. both ``NarrativeText`` and ``Title`` categories are present.

    Args:
        docs: Documents produced by the loader under test.

    Raises:
        AssertionError: If any of the checks above fails.
    """
    # Every element must come from the sample PDF.
    for doc in docs:
        assert doc.metadata.get("filename") == "layout-parser-paper.pdf"

    # 16 page doc
    page_breaks = [
        doc for doc in docs if doc.metadata.get("category") == "PageBreak"
    ]
    assert len(page_breaks) == 16

    required_keys = (
        "source",
        "languages",
        "page_number",
        "category",
        "coordinates",
        "element_id",
    )
    for doc in docs:
        if not doc.page_content:
            # Only page breaks are allowed to have empty content.
            assert doc.metadata.get("category") == "PageBreak"
            continue
        for key in required_keys:
            assert key in doc.metadata

    # Missing or falsy page numbers are skipped.
    seen_pages = {
        number for doc in docs if (number := doc.metadata.get("page_number"))
    }
    assert seen_pages == set(range(1, 17))
    assert len(docs) >= 32  # (16 pages * (>=1 element per page) + 16 page breaks)

    # Concatenate page-1 content, each piece preceded by a single space.
    first_page_text = "".join(
        f" {doc.page_content}"
        for doc in docs
        if doc.metadata.get("page_number") == 1
    )
    assert (
        "LayoutParser: A Unified Toolkit for Deep Learning "
        "Based Document Image Analysis"
    ) in first_page_text

    found_categories = {doc.metadata.get("category") for doc in docs}
    assert "NarrativeText" in found_categories
    assert "Title" in found_categories


# -- Local partition --


Expand All @@ -27,10 +73,24 @@ def test_loader_partitions_locally() -> None:
include_page_breaks=True,
).load()

assert all(
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
_check_docs_content(docs)


@pytest.mark.local
async def test_loader_partitions_locally_async_lazy() -> None:
    """Load the sample PDF through ``alazy_load`` and validate the documents.

    The loader is built without ``partition_via_api``, using the ``fast``
    strategy with page breaks included. All documents yielded by the async
    iterator are collected and then checked by ``_check_docs_content``.
    """
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    loader = UnstructuredLoader(
        file_path=file_path,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    )

    # Collect before asserting anything: no check may touch `docs` until the
    # async iterator has been drained.
    docs = [doc async for doc in loader.alazy_load()]

    _check_docs_content(docs)


@pytest.mark.local
Expand Down Expand Up @@ -79,16 +139,30 @@ def test_loader_partitions_via_api() -> None:
# Unstructured kwargs
strategy="fast",
include_page_breaks=True,
coordinates=True,
)

docs = loader.load()

assert len(docs) > 1
assert any(doc.metadata.get("category") == "PageBreak" for doc in docs)
assert all(
doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs
_check_docs_content(docs)


async def test_loader_partitions_via_api_async_lazy() -> None:
    """Load the sample PDF with ``partition_via_api=True`` through ``alazy_load``.

    The loader uses the ``fast`` strategy with page breaks and coordinates
    requested. All documents yielded by the async iterator are collected and
    then checked by ``_check_docs_content``.
    """
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=file_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
        coordinates=True,
    )

    # Collect before asserting anything: no check may touch `docs` until the
    # async iterator has been drained.
    docs = [doc async for doc in loader.alazy_load()]

    _check_docs_content(docs)


def test_loader_partitions_multiple_via_api() -> None:
Expand Down Expand Up @@ -124,6 +198,22 @@ def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
loader.load()


def test_loader_partitions_via_api_hi_res() -> None:
    """Load the sample PDF with ``partition_via_api=True`` and ``hi_res`` strategy.

    Passes when the categories found in the returned documents' metadata
    include both ``Table`` and ``Image``.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    docs = UnstructuredLoader(
        file_path=pdf_path,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="hi_res",
    ).load()

    found_categories = {doc.metadata.get("category") for doc in docs}
    for expected in ("Table", "Image"):
        assert expected in found_categories


# -- fixtures ---


Expand Down
Loading