Skip to content

Commit

Permalink
[community] Added PebbloTextLoader for loading text data in PebbloSafeLoader (#26582)
Browse files Browse the repository at this point in the history

- **Description:** Added PebbloTextLoader for loading text in
PebbloSafeLoader.
- Since PebbloSafeLoader wraps document loaders, this new loader enables
direct loading of text into Documents using PebbloSafeLoader.
- **Issue:** NA
- **Dependencies:** NA
- [x] **Tests**: Added/Updated tests
  • Loading branch information
Raj725 committed Sep 19, 2024
1 parent 55b641b commit 60dc19d
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@
)
from langchain_community.document_loaders.pebblo import (
PebbloSafeLoader,
PebbloTextLoader,
)
from langchain_community.document_loaders.polars_dataframe import (
PolarsDataFrameLoader,
Expand Down Expand Up @@ -650,6 +651,7 @@
"PDFPlumberLoader": "langchain_community.document_loaders.pdf",
"PagedPDFSplitter": "langchain_community.document_loaders.pdf",
"PebbloSafeLoader": "langchain_community.document_loaders.pebblo",
"PebbloTextLoader": "langchain_community.document_loaders.pebblo",
"PlaywrightURLLoader": "langchain_community.document_loaders.url_playwright",
"PolarsDataFrameLoader": "langchain_community.document_loaders.polars_dataframe",
"PsychicLoader": "langchain_community.document_loaders.psychic",
Expand Down Expand Up @@ -855,6 +857,7 @@ def __getattr__(name: str) -> Any:
"PDFPlumberLoader",
"PagedPDFSplitter",
"PebbloSafeLoader",
"PebbloTextLoader",
"PlaywrightURLLoader",
"PolarsDataFrameLoader",
"PsychicLoader",
Expand Down
66 changes: 65 additions & 1 deletion libs/community/langchain_community/document_loaders/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import uuid
from importlib.metadata import version
from typing import Dict, Iterator, List, Optional
from typing import Any, Dict, Iterable, Iterator, List, Optional

from langchain_core.documents import Document

Expand Down Expand Up @@ -271,3 +271,67 @@ def _add_pebblo_specific_metadata(self, classified_docs: dict) -> None:
doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
"pb_checksum", None
)


class PebbloTextLoader(BaseLoader):
    """Loader that turns in-memory text into Documents.

    PebbloSafeLoader is a wrapper around document loaders, so this loader
    exists to let text data be loaded directly into Documents.
    """

    def __init__(
        self,
        texts: Iterable[str],
        *,
        source: Optional[str] = None,
        ids: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        metadatas: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """Store the texts and their optional per-document attributes.

        Args:
            texts: Iterable of text data; one Document is produced per item.
            source: Source of the text data. Stored on the instance only;
                ``lazy_load`` does not read it. Defaults to None.
            ids: Identifiers matched to texts by position. Texts beyond the
                end of this list get ``id=None``. Defaults to None.
            metadata: Metadata applied to every text. Defaults to None.
            metadatas: Per-text metadata matched by position and merged on
                top of ``metadata`` (per-text keys win). Defaults to None.
        """
        self.texts = texts
        self.source = source
        self.ids = ids
        self.metadata = metadata
        self.metadatas = metadatas

    def lazy_load(self) -> Iterator[Document]:
        """Lazily convert each text into a Document.

        Yields:
            One Document per text, carrying the shared metadata merged with
            that text's own entry from ``metadatas`` (if any) and the
            positional id from ``ids`` (if any).
        """
        for i, text in enumerate(self.texts):
            # Copy the shared metadata for every document: updating
            # ``self.metadata`` in place would leak one text's entries into
            # all later documents and mutate the caller's dict.
            metadata: Dict[str, Any] = dict(self.metadata) if self.metadata else {}
            if self.metadatas and i < len(self.metadatas) and self.metadatas[i]:
                metadata.update(self.metadatas[i])
            # ``ids`` may be shorter than ``texts``; missing ids stay None.
            _id = self.ids[i] if self.ids and i < len(self.ids) else None
            yield Document(id=_id, page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Load all texts into Documents.

        Returns:
            List of the Documents produced by ``lazy_load``.
        """
        return list(self.lazy_load())
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"DedocFileLoader",
"DedocPDFLoader",
"PebbloSafeLoader",
"PebbloTextLoader",
"DiffbotLoader",
"DirectoryLoader",
"DiscordChatLoader",
Expand Down
44 changes: 44 additions & 0 deletions libs/community/tests/unit_tests/document_loaders/test_pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ def test_pebblo_import() -> None:
from langchain_community.document_loaders import PebbloSafeLoader # noqa: F401


def test_pebblo_text_loader_import() -> None:
    """Smoke test: PebbloTextLoader is importable from document_loaders."""
    from langchain_community.document_loaders import PebbloTextLoader  # noqa: F401


def test_empty_filebased_loader(mocker: MockerFixture) -> None:
"""Test basic file based csv loader."""
# Setup
Expand Down Expand Up @@ -146,3 +151,42 @@ def test_pebblo_safe_loader_api_key() -> None:
# Assert
assert loader.pb_client.api_key == api_key
assert loader.pb_client.classifier_location == "local"


def test_pebblo_text_loader(mocker: MockerFixture) -> None:
    """
    Check that in-memory text wrapped in a PebbloTextLoader is loaded
    through PebbloSafeLoader into the expected Document.
    """
    from langchain_community.document_loaders import PebbloSafeLoader, PebbloTextLoader

    # Arrange: replace requests.get / requests.post with canned 200 responses.
    mocker.patch.multiple(
        "requests",
        get=MockResponse(json_data={"data": ""}, status_code=200),
        post=MockResponse(json_data={"data": ""}, status_code=200),
    )
    sample_text = "This is a test text."
    fake_source = "fake_source"
    text_loader = PebbloTextLoader([sample_text], source=fake_source)
    safe_loader = PebbloSafeLoader(
        text_loader,
        "dummy_app_name",
        "dummy_owner",
        "dummy_description",
    )

    # Act
    loaded = safe_loader.load()

    # Assert: exactly one document, with the metadata expected from the
    # safe loader.
    assert loaded == [
        Document(
            metadata={
                "full_path": fake_source,
                "pb_checksum": None,
            },
            page_content=sample_text,
        ),
    ]

0 comments on commit 60dc19d

Please sign in to comment.