feat: add ArgillaSpaCyTransformersTrainer & improve ArgillaSpaCyTrainer #3256

Merged · 17 commits · Jun 29, 2023
11 changes: 6 additions & 5 deletions src/argilla/client/datasets.py
@@ -460,11 +460,12 @@ def prepare_for_training(
Framework.PEFT,
]:
return self._prepare_for_training_with_transformers(train_size=train_size, test_size=test_size, seed=seed)
elif framework is Framework.SPACY and lang is None:
elif framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS] and lang is None:
raise ValueError(
"Please provide a spacy language model to prepare the dataset for training with the spacy framework."
"Please provide a `spaCy` language model to prepare the dataset for"
" training with the `spaCy`/`spaCy-transformers` framework."
)
elif framework in [Framework.SPACY, Framework.SPARK_NLP, Framework.OPENAI]:
elif framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS, Framework.SPARK_NLP, Framework.OPENAI]:
if train_size and test_size:
require_version("scikit-learn")
from sklearn.model_selection import train_test_split
@@ -475,7 +476,7 @@ def prepare_for_training(
shuffle=False,
random_state=seed,
)
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
train_docbin = self._prepare_for_training_with_spacy(nlp=lang, records=records_train)
test_docbin = self._prepare_for_training_with_spacy(nlp=lang, records=records_test)
return train_docbin, test_docbin
@@ -488,7 +489,7 @@ def prepare_for_training(
test_jsonl = self._prepare_for_training_with_openai(records=records_test)
return train_jsonl, test_jsonl
else:
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
return self._prepare_for_training_with_spacy(nlp=lang, records=shuffled_records)
elif framework is Framework.SPARK_NLP:
return self._prepare_for_training_with_spark_nlp(records=shuffled_records)
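As a usage sketch of the updated `prepare_for_training` (the dataset name is hypothetical; `lang` is now required for both the "spacy" and "spacy-transformers" frameworks):

import argilla as rg
import spacy

dataset = rg.load("my-ner-dataset")  # hypothetical dataset name

# A blank pipeline is enough; omitting `lang` raises the ValueError added above.
nlp = spacy.blank("en")

# With both `train_size` and `test_size` set, a (train, test) pair of
# `spacy.tokens.DocBin` objects is returned.
train_docbin, test_docbin = dataset.prepare_for_training(
    framework="spacy-transformers",
    lang=nlp,
    train_size=0.8,
    test_size=0.2,
)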
1 change: 1 addition & 0 deletions src/argilla/client/models.py
@@ -53,6 +53,7 @@ class Framework(Enum):
PEFT = "peft"
SETFIT = "setfit"
SPACY = "spacy"
SPACY_TRANSFORMERS = "spacy-transformers"
SPAN_MARKER = "span_marker"
SPARK_NLP = "spark-nlp"
OPENAI = "openai"
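A quick sketch of how the new enum value resolves (the string form is what callers pass):

from argilla.client.models import Framework

# The Enum constructor accepts the string value, which is how the
# `framework` argument is coerced throughout the client.
framework = Framework("spacy-transformers")
assert framework is Framework.SPACY_TRANSFORMERS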
19 changes: 18 additions & 1 deletion src/argilla/training/base.py
@@ -55,6 +55,7 @@ def __init__(
train_size: Optional[float] = None,
seed: Optional[int] = None,
gpu_id: Optional[int] = -1,
framework_kwargs: Optional[dict] = {},
**load_kwargs: Optional[dict],
) -> None:
"""
@@ -115,7 +116,7 @@ def __init__(
self._settings = self.dataset_full._infer_settings_from_records()

framework = Framework(framework)
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
import spacy

self.dataset_full_prepared = self.dataset_full.prepare_for_training(
@@ -185,6 +186,22 @@ def __init__(
settings=self._settings,
seed=self._seed,
gpu_id=gpu_id,
**framework_kwargs, # freeze_tok2vec
)
elif framework is Framework.SPACY_TRANSFORMERS:
from argilla.training.spacy import ArgillaSpaCyTransformersTrainer

self._trainer = ArgillaSpaCyTransformersTrainer(
name=self._name,
workspace=self._workspace,
record_class=self._rg_dataset_type._RECORD_TYPE,
dataset=self.dataset_full_prepared,
model=self.model,
multi_label=self._multi_label,
settings=self._settings,
seed=self._seed,
gpu_id=gpu_id,
**framework_kwargs, # update_transformer
)
elif framework is Framework.OPENAI:
from argilla.training.openai import ArgillaOpenAITrainer
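A hedged sketch of the new `framework_kwargs` routing (dataset and workspace names are hypothetical; the remaining `ArgillaTrainer` arguments follow the signature above):

from argilla.training import ArgillaTrainer

# `framework_kwargs` is forwarded verbatim to the framework-specific trainer,
# e.g. `freeze_tok2vec` for "spacy" or `update_transformer` for
# "spacy-transformers".
trainer = ArgillaTrainer(
    name="my-ner-dataset",      # hypothetical
    workspace="my-workspace",   # hypothetical
    framework="spacy",
    train_size=0.8,
    gpu_id=-1,
    framework_kwargs={"freeze_tok2vec": True},
)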
178 changes: 134 additions & 44 deletions src/argilla/training/spacy.py
@@ -15,16 +15,18 @@
import logging
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel

from argilla.client.models import TextClassificationRecord, TokenClassificationRecord
from argilla.training.base import ArgillaTrainerSkeleton
from argilla.utils.dependency import require_version

__all__ = ["ArgillaSpaCyTrainer", "ArgillaSpaCyTransformersTrainer"]

class ArgillaSpaCyTrainer(ArgillaTrainerSkeleton):

class _ArgillaSpaCyTrainerBase(ArgillaTrainerSkeleton):
_logger = logging.getLogger("ArgillaSpaCyTrainer")
_logger.setLevel(logging.INFO)

@@ -35,18 +37,17 @@ def __init__(
language: Optional[str] = None,
gpu_id: Optional[int] = -1,
model: Optional[str] = None,
optimize: Literal["efficiency", "accuracy"] = "efficiency",
*args,
**kwargs,
) -> None:
"""Initialize the `ArgillaSpaCyTrainer` class.
"""Initialize the `_ArgillaSpaCyTrainerBase` class.

Args:
dataset: A `spacy.tokens.DocBin` object or a tuple of `spacy.tokens.DocBin` objects.
record_class:
A `TextClassificationRecord`, `TokenClassificationRecord`, or `Text2TextRecord`
object. Defaults to None.
model:
A `str` with either the `spaCy` model name if using the CPU e.g. "en_core_web_sm". Defaults to None.
seed: A `int` with the seed for the random number generator. Defaults to None.
multi_label: A `bool` indicating whether the task is multi-label or not. Defaults to False.
language:
@@ -55,25 +56,29 @@
gpu_id:
the GPU ID to use. Defaults to -1, which means that the CPU will be used by default.
GPU IDs start at 0, which stands for the default GPU in the system, if available.
model:
A `str` with the `spaCy` model name to use. If the model contains static
vectors, those can also be used for training/fine-tuning, e.g. "en_core_web_lg"
contains vectors, while "en_core_web_sm" doesn't. Defaults to None.
optimize:
A `str` with the optimization strategy to use, either "efficiency" or
"accuracy". With "efficiency" the model will be smaller, faster, and use less
memory, but be less accurate; with "accuracy" it will be larger, slower, and
use more memory, but be more accurate. Defaults to "efficiency".

Raises:
NotImplementedError: If `record_class` is `rg.Text2TextRecord`.

Example:
>>> from argilla import TokenClassificationRecord
>>> from argilla.training import ArgillaSpaCyTrainer
>>> dataset = ... # Load the dataset
>>> trainer = ArgillaSpaCyTrainer(dataset, record_class=TokenClassificationRecord)
>>> trainer.update_config(max_epochs=10)
>>> trainer.train()
>>> trainer.save("./model")
NotImplementedError: If the `record_class` is not supported or if the
`init_training_args` method has not been implemented.
"""
super().__init__(*args, **kwargs)
import spacy

self._nlp = None
self._model = model

self.config = {}

if self._record_class == TokenClassificationRecord:
self._column_mapping = {
"text": "text",
@@ -95,9 +100,10 @@ def __init__(
self._dataset if isinstance(self._dataset, tuple) and len(self._dataset) > 1 else (self._dataset, None)
)
self._train_dataset_path = "./train.spacy"
self._eval_dataset_path = "./dev.spacy" if self._eval_dataset else None
self._eval_dataset_path = "./dev.spacy" if self._eval_dataset else "./train.spacy"

self.language = language or "en"
self.optimize = optimize

self.gpu_id = gpu_id
self.use_gpu = False
@@ -128,34 +134,6 @@ def __init__(

self.init_training_args()

def init_training_args(self):
from spacy.cli.init_config import init_config

self.config = init_config(
lang=self.language,
pipeline=self._pipeline,
optimize="efficiency",
)

self.config["paths"]["train"] = self._train_dataset_path
self.config["paths"]["dev"] = self._eval_dataset_path or self._train_dataset_path
self.config["system"]["seed"] = self._seed or 42
if not self._model:
self._logger.warning(
"`model` is not specified and it's recommended to specify the"
" `spaCy` model to use. Using `en_core_web_sm` as the default model"
" instead."
)
self._model = "en_core_web_sm"
self.config["paths"]["vectors"] = self._model
if self.use_gpu:
self.config["system"]["gpu_allocator"] = (
"pytorch" if self.has_torch else "tensorflow" if self.has_tensorflow else None
)
self.config["nlp"]["batch_size"] = 128

self._nlp = None

def init_model(self):
import spacy

@@ -213,8 +191,13 @@ def train(self, output_dir: Optional[str] = None) -> None:
self._logger.info(f"Dumping the dev dataset to {self._eval_dataset_path}")
self._eval_dataset.to_disk(self._eval_dataset_path)

# Both `init_nlp` and `train_nlp` must be executed in the same Jupyter Notebook
# cell when using the GPU: since `thinc` stores the `Config` object in
# `ContextVars`, the `Config` would otherwise be lost between cells and the
# training would fail.
self._nlp = init_nlp(self.config, use_gpu=self.gpu_id)
self._nlp, _ = train_nlp(self._nlp, use_gpu=self.gpu_id, stdout=sys.stdout, stderr=sys.stderr)

if output_dir:
self.save(output_dir)

@@ -280,3 +263,110 @@ def predict(
if str_input:
formatted_prediction = list(formatted_prediction)[0]
return formatted_prediction


class ArgillaSpaCyTrainer(_ArgillaSpaCyTrainerBase):
def __init__(self, freeze_tok2vec: bool = False, **kwargs) -> None:
"""Initialize the `ArgillaSpaCyTrainer` class.

Args:
freeze_tok2vec: A `bool` indicating whether to freeze the `tok2vec` weights
during the training. Defaults to False.
**kwargs: The `_ArgillaSpaCyTrainerBase` arguments.

Examples:
>>> from argilla.training import ArgillaSpaCyTrainer
>>> trainer = ArgillaSpaCyTrainer(dataset=dataset, freeze_tok2vec=True)
"""
super().__init__(**kwargs)
self.freeze_tok2vec = freeze_tok2vec

def init_training_args(self) -> None:
from spacy.cli.init_config import init_config

# Generate the config with `gpu=False`: GPU-flavored configs are only meant
# for `spacy-transformers`, and would mess up the defaults for plain `spacy`.
self.config = init_config(
lang=self.language,
pipeline=self._pipeline,
optimize=self.optimize,
gpu=False,
)

self.config["paths"]["train"] = self._train_dataset_path
self.config["paths"]["dev"] = self._eval_dataset_path
self.config["system"]["seed"] = self._seed or 42

# Now set the GPU-related properties, covering both training/fine-tuning a
# `spacy` model on the GPU and a `spacy-transformers` model on the CPU.
self.config["system"]["gpu_allocator"] = (
("pytorch" if self.has_torch else "tensorflow" if self.has_tensorflow else None) if self.use_gpu else None
)
self.config["nlp"]["batch_size"] = 128 if self.use_gpu else 1000

if "tok2vec" in self.config["nlp"]["pipeline"]:
# To fine-tune the `tok2vec` component, point `init_tok2vec` at the model to
# fine-tune and set `include_static_vectors` to `True`.
if self.freeze_tok2vec is False:
self.config["paths"]["init_tok2vec"] = self._model
self.config["components"]["tok2vec"]["include_static_vectors"] = True
else:
# Otherwise, if we don't want to fine-tune the `tok2vec` component, then we
# need to set the `frozen_components` and `annotating_components` to
# `["tok2vec"]`.
self.config["training"]["frozen_components"] = ["tok2vec"]
self.config["training"]["annotating_components"] = ["tok2vec"]


class ArgillaSpaCyTransformersTrainer(_ArgillaSpaCyTrainerBase):
require_version("spacy-transformers")

def __init__(self, update_transformer: bool = True, **kwargs) -> None:
"""Initialize the `ArgillaSpaCyTransformersTrainer` class.

Args:
update_transformer: A `bool` indicating whether to update the transformer
weights during the training. Defaults to True.
**kwargs: The `_ArgillaSpaCyTrainerBase` arguments.
"""
super().__init__(**kwargs)
self.update_transformer = update_transformer

def init_training_args(self) -> None:
from spacy.cli.init_config import init_config

# Generate the config with `gpu=True`: GPU-flavored configs are what
# `spacy-transformers` expects, while they would mess up the defaults for
# plain `spacy`.
self.config = init_config(
lang=self.language,
pipeline=self._pipeline,
optimize=self.optimize,
gpu=True,
)

self.config["paths"]["train"] = self._train_dataset_path
self.config["paths"]["dev"] = self._eval_dataset_path
self.config["system"]["seed"] = self._seed or 42

# Now set the GPU-related properties, covering both training/fine-tuning a
# `spacy` model on the GPU and a `spacy-transformers` model on the CPU.
self.config["system"]["gpu_allocator"] = (
("pytorch" if self.has_torch else "tensorflow" if self.has_tensorflow else None) if self.use_gpu else None
)
self.config["nlp"]["batch_size"] = 128 if self.use_gpu else 16

# With `spacy-transformers`, the `transformer` component must be added to the
# pipeline and its `name` set to the model to load.
self.config["components"]["transformer"]["name"] = self._model
self.config["nlp"]["pipeline"] = ["transformer"] + self._pipeline

if "transformer" in self.config["nlp"]["pipeline"]:
# The `transformer` component cannot be frozen, but setting `grad_factor` to
# 0.0 keeps its weights from being updated: the forward pass still runs, while
# the gradients flowing back into the component are multiplied by 0.0.
# self.config["training"]["frozen_components"] = ["transformer"]
# self.config["training"]["annotating_components"] = ["transformer"]
if not self.update_transformer:
self.config["components"]["transformer"]["grad_factor"] = 0.0