docs: Update docstrings and API doc config for `haystack.components.extractors`
shadeMe committed Feb 28, 2024
1 parent d6ef2b5 commit 533d6c7
Showing 2 changed files with 52 additions and 26 deletions.
2 changes: 2 additions & 0 deletions docs/pydoc/config/extractors_api.yml
@@ -9,6 +9,8 @@ processors:
documented_only: true
do_not_filter_modules: false
skip_empty_modules: true
- type: filter
expression: "name not in ['_BackendEnumMeta', '_NerBackend', '_HfBackend', '_SpacyBackend']"
- type: smart
- type: crossref
renderer:
76 changes: 50 additions & 26 deletions haystack/components/extractors/named_entity_extractor.py
@@ -44,13 +44,9 @@ class NamedEntityExtractorBackend(Enum, metaclass=_BackendEnumMeta):
NLP backend to use for Named Entity Recognition.
"""

#: Hugging Face.
#:
#: Uses an Hugging Face model and pipeline.
HUGGING_FACE = "hugging_face"

#: spaCy.
#:
#: Uses a spaCy model and pipeline.
SPACY = "spacy"

@@ -78,6 +74,33 @@ class NamedEntityAnnotation:

@component
class NamedEntityExtractor:
"""
Annotates named entities in a collection of documents.
The component supports two backends: Hugging Face and spaCy. The
former can be used with any sequence classification model from the
[Hugging Face model hub](https://huggingface.co/models), while the
latter can be used with any [spaCy model](https://spacy.io/models)
that contains an NER component. Annotations are stored as metadata
in the documents.
Usage example:
```python
from haystack import Document
from haystack.components.extractors.named_entity_extractor import NamedEntityExtractor
documents = [
Document(content="I'm Merlin, the happy pig!"),
Document(content="My name is Clara and I live in Berkeley, California."),
]
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
extractor.warm_up()
results = extractor.run(documents=documents)["documents"]
annotations = [NamedEntityExtractor.get_stored_annotations(doc) for doc in results]
print(annotations)
```
"""

_METADATA_KEY = "named_entities"

def __init__(
@@ -89,27 +112,22 @@ def __init__(
device: Optional[ComponentDevice] = None,
) -> None:
"""
Construct a Named Entity extractor component.
Create a Named Entity extractor component.
:param backend:
Backend to use for NER.
:param model:
Name of the model or a path to the model on
the local disk.
Dependent on the backend.
the local disk. Dependent on the backend.
:param pipeline_kwargs:
Keyword arguments passed to the pipeline. The
pipeline can override these arguments.
Dependent on the backend.
pipeline can override these arguments. Dependent on the backend.
:param device:
The device on which the model is loaded. If `None`,
the default device is automatically selected.
If a device/device map is specified in `pipeline_kwargs`,
it overrides this parameter (only applicable to the HuggingFace
backend).
the default device is automatically selected. If a
device/device map is specified in `pipeline_kwargs`,
it overrides this parameter (only applicable to the
HuggingFace backend).
"""

if isinstance(backend, str):
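To make the revised `__init__` docstring concrete, here is a minimal construction sketch; the model name and device string are illustrative assumptions, not part of this commit:

```python
from haystack.components.extractors.named_entity_extractor import (
    NamedEntityExtractor,
    NamedEntityExtractorBackend,
)
from haystack.utils import ComponentDevice

# The backend accepts either the enum member or its string value ("hugging_face" / "spacy").
# The model and device below are illustrative assumptions.
extractor = NamedEntityExtractor(
    backend=NamedEntityExtractorBackend.HUGGING_FACE,
    model="dslim/bert-base-NER",
    device=ComponentDevice.from_str("cpu"),
)
extractor.warm_up()  # loads the underlying model/pipeline before the first run() call
```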
@@ -127,7 +145,7 @@ def __init__(

def warm_up(self):
"""
Initialize the named entity extractor backend.
Initialize the component.
"""
try:
self._backend.initialize()
@@ -139,14 +157,15 @@ def warm_up(self):
@component.output_types(documents=List[Document])
def run(self, documents: List[Document], batch_size: int = 1) -> Dict[str, Any]:
"""
Run the named-entity extractor.
Annotate named entities in each document and store
the annotations in the document's metadata.
:param documents:
Documents to process.
Documents to process.
:param batch_size:
Batch size used for processing the documents.
Batch size used for processing the documents.
:returns:
The processed documents.
Processed documents.
"""
texts = [doc.content if doc.content is not None else "" for doc in documents]
annotations = self._backend.annotate(texts, batch_size=batch_size)
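A brief usage sketch matching the reworded `run` docstring, reusing the `extractor` from the sketch above; the batch size is arbitrary:

```python
from haystack import Document

docs = [Document(content="My name is Clara and I live in Berkeley, California.")]
result = extractor.run(documents=docs, batch_size=8)

# Annotations are stored in each document's metadata under the "named_entities" key
# (see _METADATA_KEY above); get_stored_annotations() reads them back.
for doc in result["documents"]:
    print(NamedEntityExtractor.get_stored_annotations(doc))
```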
@@ -164,7 +183,10 @@ def run(self, documents: List[Document], batch_size: int = 1) -> Dict[str, Any]:

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
@@ -177,10 +199,12 @@ def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "NamedEntityExtractor":
"""
Deserialize the component from a dictionary.
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
try:
init_params = data["init_parameters"]
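To ground the reworded serialization docstrings, a short round-trip sketch (purely illustrative):

```python
# Serialize the component's init parameters to a dict, then rebuild it.
data = extractor.to_dict()
restored = NamedEntityExtractor.from_dict(data)
assert isinstance(restored, NamedEntityExtractor)
```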
@@ -267,8 +291,8 @@ def model_name(self) -> str:
@property
def device(self) -> ComponentDevice:
"""
Returns the identifier of the device on which
the backend's model is loaded.
:returns:
The device on which the backend's model is loaded.
"""
return self._device
