feat: add ArgillaSpaCyTransformersTrainer & improve ArgillaSpaCyTrainer #3256

Merged · 17 commits · Jun 29, 2023
11 changes: 6 additions & 5 deletions src/argilla/client/datasets.py
@@ -460,11 +460,12 @@ def prepare_for_training(
Framework.PEFT,
]:
return self._prepare_for_training_with_transformers(train_size=train_size, test_size=test_size, seed=seed)
elif framework is Framework.SPACY and lang is None:
elif framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS] and lang is None:
raise ValueError(
"Please provide a spacy language model to prepare the dataset for training with the spacy framework."
"Please provide a `spaCy` language model to prepare the dataset for"
" training with the `spaCy`/`spaCy-transformers` framework."
)
elif framework in [Framework.SPACY, Framework.SPARK_NLP, Framework.OPENAI]:
elif framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS, Framework.SPARK_NLP, Framework.OPENAI]:
if train_size and test_size:
require_version("scikit-learn")
from sklearn.model_selection import train_test_split
@@ -475,7 +476,7 @@ def prepare_for_training(
shuffle=False,
random_state=seed,
)
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
train_docbin = self._prepare_for_training_with_spacy(nlp=lang, records=records_train)
test_docbin = self._prepare_for_training_with_spacy(nlp=lang, records=records_test)
return train_docbin, test_docbin
@@ -488,7 +489,7 @@ def prepare_for_training(
test_jsonl = self._prepare_for_training_with_openai(records=records_test)
return train_jsonl, test_jsonl
else:
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
return self._prepare_for_training_with_spacy(nlp=lang, records=shuffled_records)
elif framework is Framework.SPARK_NLP:
return self._prepare_for_training_with_spark_nlp(records=shuffled_records)
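As a usage sketch of the updated `prepare_for_training` (the dataset name is hypothetical; `lang` is now required for both the "spacy" and "spacy-transformers" frameworks):

import argilla as rg
import spacy

dataset = rg.load("my-ner-dataset")  # hypothetical dataset name

# A blank pipeline is enough; omitting `lang` raises the ValueError added above.
nlp = spacy.blank("en")

# With both `train_size` and `test_size` set, a (train, test) pair of
# `spacy.tokens.DocBin` objects is returned.
train_docbin, test_docbin = dataset.prepare_for_training(
    framework="spacy-transformers",
    lang=nlp,
    train_size=0.8,
    test_size=0.2,
)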
1 change: 1 addition & 0 deletions src/argilla/client/models.py
@@ -53,6 +53,7 @@ class Framework(Enum):
PEFT = "peft"
SETFIT = "setfit"
SPACY = "spacy"
SPACY_TRANSFORMERS = "spacy-transformers"
SPAN_MARKER = "span_marker"
SPARK_NLP = "spark-nlp"
OPENAI = "openai"
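A quick sketch of how the new enum value resolves (the string form is what callers pass):

from argilla.client.models import Framework

# The Enum constructor accepts the string value, which is how the
# `framework` argument is coerced throughout the client.
framework = Framework("spacy-transformers")
assert framework is Framework.SPACY_TRANSFORMERS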
19 changes: 18 additions & 1 deletion src/argilla/training/base.py
@@ -55,6 +55,7 @@ def __init__(
train_size: Optional[float] = None,
seed: Optional[int] = None,
gpu_id: Optional[int] = -1,
framework_kwargs: Optional[dict] = {},
**load_kwargs: Optional[dict],
) -> None:
"""
@@ -115,7 +116,7 @@ def __init__(
self._settings = self.dataset_full._infer_settings_from_records()

framework = Framework(framework)
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
import spacy

self.dataset_full_prepared = self.dataset_full.prepare_for_training(
@@ -185,6 +186,22 @@ def __init__(
settings=self._settings,
seed=self._seed,
gpu_id=gpu_id,
**framework_kwargs, # freeze_tok2vec
)
elif framework is Framework.SPACY_TRANSFORMERS:
from argilla.training.spacy import ArgillaSpaCyTransformersTrainer

self._trainer = ArgillaSpaCyTransformersTrainer(
name=self._name,
workspace=self._workspace,
record_class=self._rg_dataset_type._RECORD_TYPE,
dataset=self.dataset_full_prepared,
model=self.model,
multi_label=self._multi_label,
settings=self._settings,
seed=self._seed,
gpu_id=gpu_id,
**framework_kwargs, # update_transformer
)
elif framework is Framework.OPENAI:
from argilla.training.openai import ArgillaOpenAITrainer
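A hedged sketch of the new `framework_kwargs` routing (dataset and workspace names are hypothetical; the remaining `ArgillaTrainer` arguments follow the signature above):

from argilla.training import ArgillaTrainer

# `framework_kwargs` is forwarded verbatim to the framework-specific trainer,
# e.g. `freeze_tok2vec` for "spacy" or `update_transformer` for
# "spacy-transformers".
trainer = ArgillaTrainer(
    name="my-ner-dataset",      # hypothetical
    workspace="my-workspace",   # hypothetical
    framework="spacy",
    train_size=0.8,
    gpu_id=-1,
    framework_kwargs={"freeze_tok2vec": True},
)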
178 changes: 134 additions & 44 deletions src/argilla/training/spacy.py
@@ -15,16 +15,18 @@
import logging
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel

from argilla.client.models import TextClassificationRecord, TokenClassificationRecord
from argilla.training.base import ArgillaTrainerSkeleton
from argilla.utils.dependency import require_version

__all__ = ["ArgillaSpaCyTrainer", "ArgillaSpaCyTransformersTrainer"]

class ArgillaSpaCyTrainer(ArgillaTrainerSkeleton):

class _ArgillaSpaCyTrainerBase(ArgillaTrainerSkeleton):
_logger = logging.getLogger("ArgillaSpaCyTrainer")
_logger.setLevel(logging.INFO)

@@ -35,18 +37,17 @@ def __init__(
language: Optional[str] = None,
gpu_id: Optional[int] = -1,
model: Optional[str] = None,
optimize: Literal["efficiency", "accuracy"] = "efficiency",
*args,
**kwargs,
) -> None:
"""Initialize the `ArgillaSpaCyTrainer` class.
"""Initialize the `_ArgillaSpaCyTrainerBase` class.

Args:
dataset: A `spacy.tokens.DocBin` object or a tuple of `spacy.tokens.DocBin` objects.
record_class:
A `TextClassificationRecord`, `TokenClassificationRecord`, or `Text2TextRecord`
object. Defaults to None.
model:
A `str` with either the `spaCy` model name if using the CPU e.g. "en_core_web_sm". Defaults to None.
seed: A `int` with the seed for the random number generator. Defaults to None.
multi_label: A `bool` indicating whether the task is multi-label or not. Defaults to False.
language:
@@ -55,25 +56,29 @@
gpu_id:
the GPU ID to use. Defaults to -1, which means that the CPU will be used by default.
GPU IDs start at 0, which stands for the default GPU in the system, if available.
model:
A `str` with the `spaCy` model name to use. If the model contains static
vectors, those can also be used for training/fine-tuning, e.g. "en_core_web_lg"
contains vectors, while "en_core_web_sm" doesn't. Defaults to None.
optimize:
A `str` with the optimization strategy to use, either "efficiency" or
"accuracy". With "efficiency" the model will be smaller, faster, and use less
memory, but be less accurate; with "accuracy" it will be larger, slower, and
use more memory, but be more accurate. Defaults to "efficiency".

Raises:
NotImplementedError: If `record_class` is `rg.Text2TextRecord`.

Example:
>>> from argilla import TokenClassificationRecord
>>> from argilla.training import ArgillaSpaCyTrainer
>>> dataset = ... # Load the dataset
>>> trainer = ArgillaSpaCyTrainer(dataset, record_class=TokenClassificationRecord)
>>> trainer.update_config(max_epochs=10)
>>> trainer.train()
>>> trainer.save("./model")
NotImplementedError: If the `record_class` is not supported or if the
`init_training_args` method has not been implemented.
"""
super().__init__(*args, **kwargs)
import spacy

self._nlp = None
self._model = model

self.config = {}

if self._record_class == TokenClassificationRecord:
self._column_mapping = {
"text": "text",
@@ -95,9 +100,10 @@ def __init__(
self._dataset if isinstance(self._dataset, tuple) and len(self._dataset) > 1 else (self._dataset, None)
)
self._train_dataset_path = "./train.spacy"
self._eval_dataset_path = "./dev.spacy" if self._eval_dataset else None
self._eval_dataset_path = "./dev.spacy" if self._eval_dataset else "./train.spacy"

self.language = language or "en"
self.optimize = optimize

self.gpu_id = gpu_id
self.use_gpu = False
@@ -128,34 +134,6 @@ def __init__(

self.init_training_args()

def init_training_args(self):
from spacy.cli.init_config import init_config

self.config = init_config(
lang=self.language,
pipeline=self._pipeline,
optimize="efficiency",
)

self.config["paths"]["train"] = self._train_dataset_path
self.config["paths"]["dev"] = self._eval_dataset_path or self._train_dataset_path
self.config["system"]["seed"] = self._seed or 42
if not self._model:
self._logger.warning(
"`model` is not specified and it's recommended to specify the"
" `spaCy` model to use. Using `en_core_web_sm` as the default model"
" instead."
)
self._model = "en_core_web_sm"
self.config["paths"]["vectors"] = self._model
if self.use_gpu:
self.config["system"]["gpu_allocator"] = (
"pytorch" if self.has_torch else "tensorflow" if self.has_tensorflow else None
)
self.config["nlp"]["batch_size"] = 128

self._nlp = None

def init_model(self):
import spacy

@@ -213,8 +191,13 @@ def train(self, output_dir: Optional[str] = None) -> None:
self._logger.info(f"Dumping the dev dataset to {self._eval_dataset_path}")
self._eval_dataset.to_disk(self._eval_dataset_path)

# Both `init_nlp` and `train_nlp` must be executed in the same Jupyter Notebook
# cell when using the GPU: since `thinc` stores the `Config` object in
# `ContextVars`, the `Config` would otherwise be lost between cells and the
# training would fail.
self._nlp = init_nlp(self.config, use_gpu=self.gpu_id)
self._nlp, _ = train_nlp(self._nlp, use_gpu=self.gpu_id, stdout=sys.stdout, stderr=sys.stderr)

if output_dir:
self.save(output_dir)

@@ -280,3 +263,110 @@ def predict(
if str_input:
formatted_prediction = list(formatted_prediction)[0]
return formatted_prediction


class ArgillaSpaCyTrainer(_ArgillaSpaCyTrainerBase):
def __init__(self, freeze_tok2vec: bool = False, **kwargs) -> None:
"""Initialize the `ArgillaSpaCyTrainer` class.

Args:
freeze_tok2vec: A `bool` indicating whether to freeze the `tok2vec` weights
during the training. Defaults to False.
**kwargs: The `_ArgillaSpaCyTrainerBase` arguments.

Examples:
>>> from argilla.training import ArgillaSpaCyTrainer
>>> trainer = ArgillaSpaCyTrainer(dataset=dataset, freeze_tok2vec=True)
"""
super().__init__(**kwargs)
self.freeze_tok2vec = freeze_tok2vec

def init_training_args(self) -> None:
from spacy.cli.init_config import init_config

# Generate the config with `gpu=False`: GPU-flavored configs are only meant
# for `spacy-transformers`, and would mess up the defaults for plain `spacy`.
self.config = init_config(
lang=self.language,
pipeline=self._pipeline,
optimize=self.optimize,
gpu=False,
)

self.config["paths"]["train"] = self._train_dataset_path
self.config["paths"]["dev"] = self._eval_dataset_path
self.config["system"]["seed"] = self._seed or 42

# Now set the GPU-related properties, covering both training/fine-tuning a
# `spacy` model on the GPU and a `spacy-transformers` model on the CPU.
self.config["system"]["gpu_allocator"] = (
("pytorch" if self.has_torch else "tensorflow" if self.has_tensorflow else None) if self.use_gpu else None
)
self.config["nlp"]["batch_size"] = 128 if self.use_gpu else 1000

if "tok2vec" in self.config["nlp"]["pipeline"]:
# To fine-tune the `tok2vec` component, point `init_tok2vec` at the model to
# fine-tune and set `include_static_vectors` to `True`.
if self.freeze_tok2vec is False:
self.config["paths"]["init_tok2vec"] = self._model
self.config["components"]["tok2vec"]["include_static_vectors"] = True
else:
# Otherwise, if we don't want to fine-tune the `tok2vec` component, then we
# need to set the `frozen_components` and `annotating_components` to
# `["tok2vec"]`.
self.config["training"]["frozen_components"] = ["tok2vec"]
self.config["training"]["annotating_components"] = ["tok2vec"]


class ArgillaSpaCyTransformersTrainer(_ArgillaSpaCyTrainerBase):
require_version("spacy-transformers")

def __init__(self, update_transformer: bool = True, **kwargs) -> None:
"""Initialize the `ArgillaSpaCyTransformersTrainer` class.

Args:
update_transformer: A `bool` indicating whether to update the transformer
weights during the training. Defaults to True.
**kwargs: The `_ArgillaSpaCyTrainerBase` arguments.
"""
super().__init__(**kwargs)
self.update_transformer = update_transformer

def init_training_args(self) -> None:
from spacy.cli.init_config import init_config

# Generate the config with `gpu=True`: GPU-flavored configs are what
# `spacy-transformers` expects, while they would mess up the defaults for
# plain `spacy`.
self.config = init_config(
lang=self.language,
pipeline=self._pipeline,
optimize=self.optimize,
gpu=True,
)

self.config["paths"]["train"] = self._train_dataset_path
self.config["paths"]["dev"] = self._eval_dataset_path
self.config["system"]["seed"] = self._seed or 42

# Now set the GPU-related properties, covering both training/fine-tuning a
# `spacy` model on the GPU and a `spacy-transformers` model on the CPU.
self.config["system"]["gpu_allocator"] = (
("pytorch" if self.has_torch else "tensorflow" if self.has_tensorflow else None) if self.use_gpu else None
)
self.config["nlp"]["batch_size"] = 128 if self.use_gpu else 16

# With `spacy-transformers`, the `transformer` component must be added to the
# pipeline and its `name` set to the model to load.
self.config["components"]["transformer"]["name"] = self._model
self.config["nlp"]["pipeline"] = ["transformer"] + self._pipeline

if "transformer" in self.config["nlp"]["pipeline"]:
# The `transformer` component cannot be frozen, but setting `grad_factor` to
# 0.0 keeps its weights from being updated: the forward pass still runs, while
# the gradients flowing back into the component are multiplied by 0.0.
# self.config["training"]["frozen_components"] = ["transformer"]
# self.config["training"]["annotating_components"] = ["transformer"]
if not self.update_transformer:
self.config["components"]["transformer"]["grad_factor"] = 0.0