docs: Update docstrings and API doc config for `haystack.components.extractors`
shadeMe committed Feb 28, 2024
1 parent d6ef2b5 commit 533d6c7
Showing 2 changed files with 52 additions and 26 deletions.
2 changes: 2 additions & 0 deletions docs/pydoc/config/extractors_api.yml
@@ -9,6 +9,8 @@ processors:
documented_only: true
do_not_filter_modules: false
skip_empty_modules: true
- type: filter
expression: "name not in ['_BackendEnumMeta', '_NerBackend', '_HfBackend', '_SpacyBackend']"
- type: smart
- type: crossref
renderer:
76 changes: 50 additions & 26 deletions haystack/components/extractors/named_entity_extractor.py
@@ -44,13 +44,9 @@ class NamedEntityExtractorBackend(Enum, metaclass=_BackendEnumMeta):
NLP backend to use for Named Entity Recognition.
"""

#: Hugging Face.
#:
#: Uses an Hugging Face model and pipeline.
HUGGING_FACE = "hugging_face"

#: spaCy.
#:
#: Uses a spaCy model and pipeline.
SPACY = "spacy"

@@ -78,6 +74,33 @@ class NamedEntityAnnotation:

@component
class NamedEntityExtractor:
"""
Annotates named entities in a collection of documents.
The component supports two backends: Hugging Face and spaCy. The
former can be used with any sequence classification model from the
[Hugging Face model hub](https://huggingface.co/models), while the
latter can be used with any [spaCy model](https://spacy.io/models)
that contains an NER component. Annotations are stored as metadata
in the documents.
Usage example:
```python
from haystack import Document
from haystack.components.extractors.named_entity_extractor import NamedEntityExtractor
documents = [
Document(content="I'm Merlin, the happy pig!"),
Document(content="My name is Clara and I live in Berkeley, California."),
]
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
extractor.warm_up()
results = extractor.run(documents=documents)["documents"]
annotations = [NamedEntityExtractor.get_stored_annotations(doc) for doc in results]
print(annotations)
```
"""

_METADATA_KEY = "named_entities"

def __init__(
@@ -89,27 +112,22 @@ def __init__(
device: Optional[ComponentDevice] = None,
) -> None:
"""
Construct a Named Entity extractor component.
Create a Named Entity extractor component.
:param backend:
Backend to use for NER.
:param model:
Name of the model or a path to the model on
the local disk.
Dependent on the backend.
the local disk. Dependent on the backend.
:param pipeline_kwargs:
Keyword arguments passed to the pipeline. The
pipeline can override these arguments.
Dependent on the backend.
pipeline can override these arguments. Dependent on the backend.
:param device:
The device on which the model is loaded. If `None`,
the default device is automatically selected.
If a device/device map is specified in `pipeline_kwargs`,
it overrides this parameter (only applicable to the HuggingFace
backend).
the default device is automatically selected. If a
device/device map is specified in `pipeline_kwargs`,
it overrides this parameter (only applicable to the
HuggingFace backend).
"""

if isinstance(backend, str):
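To make the revised `__init__` docstring concrete, here is a minimal construction sketch; the model name and device string are illustrative assumptions, not part of this commit:

```python
from haystack.components.extractors.named_entity_extractor import (
    NamedEntityExtractor,
    NamedEntityExtractorBackend,
)
from haystack.utils import ComponentDevice

# The backend accepts either the enum member or its string value ("hugging_face" / "spacy").
# The model and device below are illustrative assumptions.
extractor = NamedEntityExtractor(
    backend=NamedEntityExtractorBackend.HUGGING_FACE,
    model="dslim/bert-base-NER",
    device=ComponentDevice.from_str("cpu"),
)
extractor.warm_up()  # loads the underlying model/pipeline before the first run() call
```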
@@ -127,7 +145,7 @@ def __init__(

def warm_up(self):
"""
Initialize the named entity extractor backend.
Initialize the component.
"""
try:
self._backend.initialize()
@@ -139,14 +157,15 @@ def warm_up(self):
@component.output_types(documents=List[Document])
def run(self, documents: List[Document], batch_size: int = 1) -> Dict[str, Any]:
"""
Run the named-entity extractor.
Annotate named entities in each document and store
the annotations in the document's metadata.
:param documents:
Documents to process.
Documents to process.
:param batch_size:
Batch size used for processing the documents.
Batch size used for processing the documents.
:returns:
The processed documents.
Processed documents.
"""
texts = [doc.content if doc.content is not None else "" for doc in documents]
annotations = self._backend.annotate(texts, batch_size=batch_size)
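A brief usage sketch matching the reworded `run` docstring, reusing the `extractor` from the sketch above; the batch size is arbitrary:

```python
from haystack import Document

docs = [Document(content="My name is Clara and I live in Berkeley, California.")]
result = extractor.run(documents=docs, batch_size=8)

# Annotations are stored in each document's metadata under the "named_entities" key
# (see _METADATA_KEY above); get_stored_annotations() reads them back.
for doc in result["documents"]:
    print(NamedEntityExtractor.get_stored_annotations(doc))
```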
@@ -164,7 +183,10 @@ def run(self, documents: List[Document], batch_size: int = 1) -> Dict[str, Any]:

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
@@ -177,10 +199,12 @@ def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "NamedEntityExtractor":
"""
Deserialize the component from a dictionary.
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
try:
init_params = data["init_parameters"]
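To ground the reworded serialization docstrings, a short round-trip sketch (purely illustrative):

```python
# Serialize the component's init parameters to a dict, then rebuild it.
data = extractor.to_dict()
restored = NamedEntityExtractor.from_dict(data)
assert isinstance(restored, NamedEntityExtractor)
```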
@@ -267,8 +291,8 @@ def model_name(self) -> str:
@property
def device(self) -> ComponentDevice:
"""
Returns the identifier of the device on which
the backend's model is loaded.
:returns:
The device on which the backend's model is loaded.
"""
return self._device
