docs: review DocumentLanguageClassifier docstrings #7210

Merged 3 commits on Feb 27, 2024
docs/pydoc/config/classifiers_api.yml (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@ processors:
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
excerpt: Detects the language of the Documents and routes them appropriately.
excerpt: Detects the language of the Documents and adds it to the metadata.
category_slug: haystack-api
title: Classifiers
slug: classifiers-api
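As context for the updated excerpt, the snippet below is a minimal standalone sketch (an editorial illustration, not part of this diff) of the behavior it describes, using the component and `languages` parameter shown in the file that follows:

```python
from haystack import Document
from haystack.components.classifiers import DocumentLanguageClassifier

# The classifier writes the detected language into each document's metadata.
classifier = DocumentLanguageClassifier(languages=["en"])
result = classifier.run(documents=[Document(content="This is an English document")])
assert result["documents"][0].meta["language"] == "en"
```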
haystack/components/classifiers/document_language_classifier.py (49 changes: 34 additions & 15 deletions)
@@ -14,32 +14,46 @@
class DocumentLanguageClassifier:
"""
Classify the language of documents and add the detected language to their metadata.
A MetadataRouter can then route them onto different output connections depending on their language.
This is useful to route documents to different models in a pipeline depending on their language.

A `MetadataRouter` can then route them onto different output connections depending on their language.
The set of supported languages can be specified.
For routing plain text using the same logic, use the related TextLanguageRouter component instead.
For routing plain text using the same logic, use the related `TextLanguageRouter` component instead.

Example usage within an indexing pipeline, storing in a Document Store
Usage example within an indexing pipeline, storing in a Document Store
only documents written in English:

```python
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.routers import MetadataRouter
from haystack.components.writers import DocumentWriter

docs = [Document(id="1", content="This is an English document"),
Document(id="2", content="Este es un documento en español")]

document_store = InMemoryDocumentStore()

p = Pipeline()
p.add_component(instance=TextFileToDocument(), name="text_file_converter")
p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
p.add_component(instance=DocumentLanguageClassifier(languages=["en"]), name="language_classifier")
p.add_component(instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router")
p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
p.connect("text_file_converter.documents", "language_classifier.documents")
p.connect("language_classifier.documents", "router.documents")
p.connect("router.en", "writer.documents")

p.run({"language_classifier": {"documents": docs}})

written_docs = document_store.filter_documents()
assert len(written_docs) == 1
assert written_docs[0] == Document(id="1", content="This is an English document", meta={"language": "en"})
```
"""

def __init__(self, languages: Optional[List[str]] = None):
"""
:param languages: A list of languages in ISO code, each corresponding to a different output connection
(see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
By default, only ["en"] is supported and Documents of any other language are routed to "unmatched".
:param languages: A list of languages in ISO code, each corresponding to a different output connection.
For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages).
If not specified, the default is ["en"].
"""
langdetect_import.check()
if not languages:
@@ -49,11 +63,16 @@ def __init__(self, languages: Optional[List[str]] = None):
@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
"""
Run the DocumentLanguageClassifier. This method classifies the documents' language and adds it to their metadata.
If a Document's text does not match any of the languages specified at initialization, the metadata value "unmatched" will be stored.
This method classifies the documents' language and adds it to their metadata.
If a Document's text does not match any of the languages specified at initialization,
the metadata value "unmatched" will be stored.

:param documents: A list of documents to classify their language.
:return: List of Documents with an added metadata field called language.

:returns: A dictionary with the following key:
- `documents`: List of Documents with an added metadata field called `language`.

:raises TypeError: if the input is not a list of Documents.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
@@ -65,15 +84,15 @@ def run(self, documents: List[Document]):
output["unmatched"] = []

for document in documents:
detected_language = self.detect_language(document)
detected_language = self._detect_language(document)
if detected_language in self.languages:
document.meta["language"] = detected_language
else:
document.meta["language"] = "unmatched"

return {"documents": documents}

def detect_language(self, document: Document) -> Optional[str]:
def _detect_language(self, document: Document) -> Optional[str]:
try:
language = langdetect.detect(document.content)
except langdetect.LangDetectException:
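The `__init__` and `run()` docstrings above describe the supported-language set, the `unmatched` fallback, and the dictionary returned by `run()`. The sketch below (an editorial illustration, not part of this diff) shows that contract end to end, assuming `langdetect` identifies the Spanish sentence as a non-supported language:

```python
from haystack import Document
from haystack.components.classifiers import DocumentLanguageClassifier

docs = [
    Document(content="This is an English document"),
    Document(content="Este es un documento en español"),
]

# Only "en" is in the supported set, so the Spanish document falls back to "unmatched".
classifier = DocumentLanguageClassifier(languages=["en"])
result = classifier.run(documents=docs)

# run() returns a dictionary with a single "documents" key, as documented above.
assert [doc.meta["language"] for doc in result["documents"]] == ["en", "unmatched"]
```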
@@ -27,7 +27,7 @@ def test_empty_list(self):

def test_detect_language(self):
classifier = DocumentLanguageClassifier()
detected_language = classifier.detect_language(Document(content="This is an english sentence."))
detected_language = classifier._detect_language(Document(content="This is an english sentence."))
assert detected_language == "en"

def test_classify_as_en_and_unmatched(self):
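The collapsed test bodies are not shown in this view. As a hypothetical companion, the sketch below exercises the `TypeError` documented in the `run()` docstring above; the test name and body are illustrative and not taken from the actual test module:

```python
import pytest

from haystack import Document
from haystack.components.classifiers import DocumentLanguageClassifier


def test_run_raises_type_error_for_non_list_input():
    # Hypothetical sketch; run() requires a list of Documents, so a bare Document raises TypeError.
    classifier = DocumentLanguageClassifier()
    with pytest.raises(TypeError):
        classifier.run(documents=Document(content="Not wrapped in a list"))
```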