docs: review DocumentLanguageClassifier docstrings #7210

Merged 3 commits on Feb 27, 2024
docs/pydoc/config/classifiers_api.yml (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@ processors:
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
excerpt: Detects the language of the Documents and routes them appropriately.
excerpt: Detects the language of the Documents and adds it to the metadata.
category_slug: haystack-api
title: Classifiers
slug: classifiers-api
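As context for the updated excerpt, the snippet below is a minimal standalone sketch (an editorial illustration, not part of this diff) of the behavior it describes, using the component and `languages` parameter shown in the file that follows:

```python
from haystack import Document
from haystack.components.classifiers import DocumentLanguageClassifier

# The classifier writes the detected language into each document's metadata.
classifier = DocumentLanguageClassifier(languages=["en"])
result = classifier.run(documents=[Document(content="This is an English document")])
assert result["documents"][0].meta["language"] == "en"
```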
haystack/components/classifiers/document_language_classifier.py (49 changes: 34 additions & 15 deletions)
@@ -14,32 +14,46 @@
class DocumentLanguageClassifier:
"""
Classify the language of documents and add the detected language to their metadata.
A MetadataRouter can then route them onto different output connections depending on their language.
This is useful to route documents to different models in a pipeline depending on their language.

A `MetadataRouter` can then route them onto different output connections depending on their language.
The set of supported languages can be specified.
For routing plain text using the same logic, use the related TextLanguageRouter component instead.
For routing plain text using the same logic, use the related `TextLanguageRouter` component instead.

Example usage within an indexing pipeline, storing in a Document Store
Usage example within an indexing pipeline, storing in a Document Store
only documents written in English:

```python
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.routers import MetadataRouter
from haystack.components.writers import DocumentWriter

docs = [Document(id="1", content="This is an English document"),
Document(id="2", content="Este es un documento en español")]

document_store = InMemoryDocumentStore()

p = Pipeline()
p.add_component(instance=TextFileToDocument(), name="text_file_converter")
p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
p.add_component(instance=DocumentLanguageClassifier(languages=["en"]), name="language_classifier")
p.add_component(instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router")
p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
p.connect("text_file_converter.documents", "language_classifier.documents")
p.connect("language_classifier.documents", "router.documents")
p.connect("router.en", "writer.documents")

p.run({"language_classifier": {"documents": docs}})

written_docs = document_store.filter_documents()
assert len(written_docs) == 1
assert written_docs[0] == Document(id="1", content="This is an English document", meta={"language": "en"})
```
"""

def __init__(self, languages: Optional[List[str]] = None):
"""
:param languages: A list of languages in ISO code, each corresponding to a different output connection
(see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
By default, only ["en"] is supported and Documents of any other language are routed to "unmatched".
:param languages: A list of languages in ISO code, each corresponding to a different output connection.
For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages).
If not specified, the default is ["en"].
"""
langdetect_import.check()
if not languages:
@@ -49,11 +63,16 @@ def __init__(self, languages: Optional[List[str]] = None):
@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
"""
Run the DocumentLanguageClassifier. This method classifies the documents' language and adds it to their metadata.
If a Document's text does not match any of the languages specified at initialization, the metadata value "unmatched" will be stored.
This method classifies the documents' language and adds it to their metadata.
If a Document's text does not match any of the languages specified at initialization,
the metadata value "unmatched" will be stored.

:param documents: A list of documents to classify their language.
:return: List of Documents with an added metadata field called language.

:returns: A dictionary with the following key:
- `documents`: List of Documents with an added metadata field called `language`.

:raises TypeError: if the input is not a list of Documents.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
@@ -65,15 +84,15 @@ def run(self, documents: List[Document]):
output["unmatched"] = []

for document in documents:
detected_language = self.detect_language(document)
detected_language = self._detect_language(document)
if detected_language in self.languages:
document.meta["language"] = detected_language
else:
document.meta["language"] = "unmatched"

return {"documents": documents}

def detect_language(self, document: Document) -> Optional[str]:
def _detect_language(self, document: Document) -> Optional[str]:
try:
language = langdetect.detect(document.content)
except langdetect.LangDetectException:
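The `__init__` and `run()` docstrings above describe the supported-language set, the `unmatched` fallback, and the dictionary returned by `run()`. The sketch below (an editorial illustration, not part of this diff) shows that contract end to end, assuming `langdetect` identifies the Spanish sentence as a non-supported language:

```python
from haystack import Document
from haystack.components.classifiers import DocumentLanguageClassifier

docs = [
    Document(content="This is an English document"),
    Document(content="Este es un documento en español"),
]

# Only "en" is in the supported set, so the Spanish document falls back to "unmatched".
classifier = DocumentLanguageClassifier(languages=["en"])
result = classifier.run(documents=docs)

# run() returns a dictionary with a single "documents" key, as documented above.
assert [doc.meta["language"] for doc in result["documents"]] == ["en", "unmatched"]
```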
@@ -27,7 +27,7 @@ def test_empty_list(self):

def test_detect_language(self):
classifier = DocumentLanguageClassifier()
detected_language = classifier.detect_language(Document(content="This is an english sentence."))
detected_language = classifier._detect_language(Document(content="This is an english sentence."))
assert detected_language == "en"

def test_classify_as_en_and_unmatched(self):
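The collapsed test bodies are not shown in this view. As a hypothetical companion, the sketch below exercises the `TypeError` documented in the `run()` docstring above; the test name and body are illustrative and not taken from the actual test module:

```python
import pytest

from haystack import Document
from haystack.components.classifiers import DocumentLanguageClassifier


def test_run_raises_type_error_for_non_list_input():
    # Hypothetical sketch; run() requires a list of Documents, so a bare Document raises TypeError.
    classifier = DocumentLanguageClassifier()
    with pytest.raises(TypeError):
        classifier.run(documents=Document(content="Not wrapped in a list"))
```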