feat: add ArgillaSpaCyTransformersTrainer & improve ArgillaSpaCyTrainer #3256

Merged
17 commits merged on Jun 29, 2023
Changes from all commits
3 changes: 2 additions & 1 deletion environment_dev.yml
@@ -43,8 +43,9 @@ dependencies:
- pgmpy
- plotly>=4.1.0
- snorkel>=0.9.7
- spacy==3.5.0
- spacy==3.5.3
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz
- spacy-transformers>=1.2.5
- transformers[torch]>=4.19.0
- evaluate
- seqeval
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -93,7 +93,8 @@ integrations = [
"pgmpy",
"plotly >= 4.1.0",
"snorkel >= 0.9.7",
"spacy == 3.5.0",
"spacy == 3.5.3",
"spacy-transformers >= 1.2.5",
"transformers[torch] >= 4.19.0",
"evaluate",
"seqeval",
11 changes: 6 additions & 5 deletions src/argilla/client/datasets.py
@@ -460,11 +460,12 @@ def prepare_for_training(
Framework.PEFT,
]:
return self._prepare_for_training_with_transformers(train_size=train_size, test_size=test_size, seed=seed)
elif framework is Framework.SPACY and lang is None:
elif framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS] and lang is None:
raise ValueError(
"Please provide a spacy language model to prepare the dataset for training with the spacy framework."
"Please provide a `spaCy` language model to prepare the dataset for"
" training with the `spaCy`/`spaCy-transformers` framework."
)
elif framework in [Framework.SPACY, Framework.SPARK_NLP, Framework.OPENAI]:
elif framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS, Framework.SPARK_NLP, Framework.OPENAI]:
if train_size and test_size:
require_version("scikit-learn")
from sklearn.model_selection import train_test_split
@@ -475,7 +476,7 @@
shuffle=False,
random_state=seed,
)
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
train_docbin = self._prepare_for_training_with_spacy(nlp=lang, records=records_train)
test_docbin = self._prepare_for_training_with_spacy(nlp=lang, records=records_test)
return train_docbin, test_docbin
@@ -488,7 +489,7 @@
test_jsonl = self._prepare_for_training_with_openai(records=records_test)
return train_jsonl, test_jsonl
else:
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
return self._prepare_for_training_with_spacy(nlp=lang, records=shuffled_records)
elif framework is Framework.SPARK_NLP:
return self._prepare_for_training_with_spark_nlp(records=shuffled_records)
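
Taken together, these branches route `Framework.SPACY_TRANSFORMERS` through the same `DocBin` export path as `Framework.SPACY`, and both now require a `lang` model. Below is a minimal usage sketch, not part of the PR: the dataset name is hypothetical, and it assumes `prepare_for_training` accepts the framework name as a string, as elsewhere in Argilla.

```python
import argilla as rg
import spacy

# Hypothetical dataset name; any Argilla token/text classification dataset works here.
dataset = rg.load("my-token-classification-dataset")

# `lang` is now mandatory for both "spacy" and "spacy-transformers";
# omitting it raises the ValueError added above.
train_docbin, test_docbin = dataset.prepare_for_training(
    framework="spacy-transformers",
    lang=spacy.blank("en"),
    train_size=0.8,
    test_size=0.2,
    seed=42,
)

# Both splits are `spacy.tokens.DocBin` objects, ready to be written out for `spacy train`.
train_docbin.to_disk("./train.spacy")
test_docbin.to_disk("./dev.spacy")
```
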
2 changes: 1 addition & 1 deletion src/argilla/client/feedback/dataset.py
@@ -1062,7 +1062,7 @@ def prepare_for_training(
return task_mapping._prepare_for_training_with_transformers(
data=data, train_size=train_size, seed=seed, framework=framework
)
elif framework is Framework.SPACY:
elif framework is Framework.SPACY or framework is Framework.SPACY_TRANSFORMERS:
require_version("spacy")
import spacy

17 changes: 17 additions & 0 deletions src/argilla/client/feedback/training/base.py
@@ -40,6 +40,7 @@ def __init__(
train_size: Optional[float] = None,
seed: Optional[int] = None,
gpu_id: Optional[int] = -1,
framework_kwargs: Optional[dict] = {},
fetch_records: bool = True,
) -> None:
"""
@@ -65,6 +66,7 @@ def __init__(
the GPU ID to use when training a SpaCy model. Defaults to -1, which means that the CPU
will be used by default. GPU IDs start at 0, which stands for the default GPU in the system,
if available.
framework_kwargs (dict): arguments for the framework's trainer.
**load_kwargs: arguments for the rg.load() function.
"""
self._dataset = dataset
@@ -135,6 +137,21 @@ def __init__(
seed=self._seed,
model=self._model,
gpu_id=gpu_id,
framework_kwargs=framework_kwargs, # freeze_tok2vec
)
elif framework is Framework.SPACY_TRANSFORMERS:
from argilla.client.feedback.training.frameworks.spacy import (
ArgillaSpaCyTransformersTrainer,
)

self._trainer = ArgillaSpaCyTransformersTrainer(
feedback_dataset=self._dataset,
task_mapping=self._task_mapping,
prepared_data=self._prepared_data,
seed=self._seed,
model=self._model,
gpu_id=gpu_id,
framework_kwargs=framework_kwargs, # update_transformer
)
elif framework is Framework.OPENAI:
from argilla.client.feedback.training.frameworks.openai import (
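
For context, here is a hedged sketch of how the new `framework_kwargs` argument could reach these trainers from the feedback `ArgillaTrainer`. The import paths, dataset name, and task-mapping helper are assumptions rather than content of this diff.

```python
import argilla as rg
from argilla.client.feedback.training import ArgillaTrainer  # import path assumed
from argilla.client.feedback.training.schemas import TrainingTaskMapping  # assumed helper

dataset = rg.FeedbackDataset.from_argilla(name="my-feedback-dataset")  # hypothetical dataset
task_mapping = TrainingTaskMapping.for_text_classification(  # assumed factory method
    text=dataset.field_by_name("text"),
    label=dataset.question_by_name("label"),
)

trainer = ArgillaTrainer(
    dataset=dataset,
    task_mapping=task_mapping,
    framework="spacy-transformers",
    train_size=0.8,
    seed=42,
    gpu_id=0,
    framework_kwargs={"update_transformer": True},  # forwarded as shown in the hunk above
)
trainer.train(output_dir="./spacy-transformers-textcat")
```
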
80 changes: 64 additions & 16 deletions src/argilla/client/feedback/training/frameworks/spacy.py
@@ -12,32 +12,45 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Optional

from typing_extensions import Literal

from argilla.client.feedback.training.base import ArgillaTrainerSkeleton
from argilla.client.models import TextClassificationRecord, TokenClassificationRecord
from argilla.training.spacy import ArgillaSpaCyTrainer as ArgillaSpaCyTrainerV1
from argilla.training.spacy import (
ArgillaSpaCyTransformersTrainer as ArgillaSpaCyTransformersTrainerV1,
)
from argilla.training.spacy import (
_ArgillaSpaCyTrainerBase as _ArgillaSpaCyTrainerBaseV1,
)
from argilla.utils.dependency import require_version


class ArgillaSpaCyTrainer(ArgillaSpaCyTrainerV1, ArgillaTrainerSkeleton):
class _ArgillaSpaCyTrainerBase(_ArgillaSpaCyTrainerBaseV1, ArgillaTrainerSkeleton):
_logger = logging.getLogger("ArgillaSpaCyTrainer")
_logger.setLevel(logging.INFO)

require_version("spacy")

def __init__(
self,
language: Optional[str] = None,
gpu_id: Optional[int] = -1,
model: Optional[str] = None,
optimize: Literal["efficiency", "accuracy"] = "efficiency",
*args,
**kwargs,
) -> None:
"""Initialize the `ArgillaSpaCyTrainer` class.
"""Initialize the `_ArgillaSpaCyTrainerBase` class.

Args:
dataset: A `spacy.tokens.DocBin` object or a tuple of `spacy.tokens.DocBin` objects.
record_class:
A `TextClassificationRecord`, `TokenClassificationRecord`, or `Text2TextRecord`
object. Defaults to None.
model:
A `str` with either the `spaCy` model name if using the CPU e.g. "en_core_web_sm". Defaults to None.
seed: A `int` with the seed for the random number generator. Defaults to None.
multi_label: A `bool` indicating whether the task is multi-label or not. Defaults to False.
language:
@@ -46,25 +59,29 @@ def __init__(
gpu_id:
the GPU ID to use. Defaults to -1, which means that the CPU will be used by default.
GPU IDs start at 0, which stands for the default GPU in the system, if available.
model:
A `str` with the `spaCy` model name to use. If it contains vectors it
can also be used for training/fine-tuning, e.g. "en_core_web_lg"
contains vectors, while "en_core_web_sm" doesn't. Defaults to None.
optimize:
A `str` with the optimization strategy to use. Either "efficiency" or "accuracy".
Defaults to "efficiency", which means that the model will be smaller, faster,
and use less memory, but it will be less accurate. If "accuracy" is used, the model
will be larger, slower, and use more memory, but it will be more accurate.
Defaults to "efficiency".

Raises:
NotImplementedError: If `record_class` is `Text2TextRecord`.

Example:
>>> from argilla import TokenClassificationRecord
>>> from argilla.training import ArgillaSpaCyTrainer
>>> dataset = ... # Load the dataset
>>> trainer = ArgillaSpaCyTrainer(dataset, record_class=TokenClassificationRecord)
>>> trainer.update_config(max_epochs=10)
>>> trainer.train()
>>> trainer.save("./model")
NotImplementedError: If the `record_class` is not supported or if the
`init_training_args` method has not been implemented.
"""
ArgillaTrainerSkeleton.__init__(self, *args, **kwargs)
import spacy

self._nlp = None
self._model = model

self.config = {}

if self._record_class == TokenClassificationRecord:
self._column_mapping = {
"text": "text",
@@ -80,15 +97,16 @@ def __init__(
self._column_mapping = {"text": "text", "label": "label"}
self._pipeline = ["textcat"]
else:
raise NotImplementedError("`Text2TextRecord` is not supported yet.")
raise NotImplementedError("`rg.Text2TextRecord` is not supported yet.")

self._train_dataset, self._eval_dataset = (
self._dataset if isinstance(self._dataset, tuple) and len(self._dataset) > 1 else (self._dataset, None)
)
self._train_dataset_path = "./train.spacy"
self._eval_dataset_path = "./dev.spacy" if self._eval_dataset else None
self._eval_dataset_path = "./dev.spacy" if self._eval_dataset else "./train.spacy"

self.language = language or "en"
self.optimize = optimize

self.gpu_id = gpu_id
self.use_gpu = False
@@ -118,3 +136,33 @@ def __init__(
self.gpu_id = -1

self.init_training_args()


class ArgillaSpaCyTrainer(ArgillaSpaCyTrainerV1, _ArgillaSpaCyTrainerBase):
def __init__(self, freeze_tok2vec: bool = False, **kwargs) -> None:
"""Initialize the `ArgillaSpaCyTrainer` class.

Args:
freeze_tok2vec: A `bool` indicating whether to freeze the `tok2vec` weights
during the training. Defaults to False.
**kwargs: The `_ArgillaSpaCyTrainerBase` arguments.

Examples:
>>> from argilla import ArgillaSpaCyTrainer
>>> trainer = ArgillaSpaCyTrainer(
"""
self.freeze_tok2vec = freeze_tok2vec
_ArgillaSpaCyTrainerBase.__init__(self, **kwargs)


class ArgillaSpaCyTransformersTrainer(ArgillaSpaCyTransformersTrainerV1, _ArgillaSpaCyTrainerBase):
def __init__(self, update_transformer: bool = True, **kwargs) -> None:
"""Initialize the `ArgillaSpaCyTransformersTrainer` class.

Args:
update_transformer: A `bool` indicating whether to update the transformer
weights during the training. Defaults to True.
**kwargs: The `_ArgillaSpaCyTrainerBase` arguments.
"""
self.update_transformer = update_transformer
_ArgillaSpaCyTrainerBase.__init__(self, **kwargs)
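
The truncated `Examples` block above leaves direct usage unfinished, so here is a hedged sketch adapted from the docstring example this diff removes. The `dataset`, `task_mapping`, and `prepared_data` values are placeholders that `ArgillaTrainer` would normally wire up, passing the flag below via `framework_kwargs={"update_transformer": False}` (or `{"freeze_tok2vec": True}` for the non-transformer trainer).

```python
from argilla.client.feedback.training.frameworks.spacy import ArgillaSpaCyTransformersTrainer

# Placeholders: in practice these come from a FeedbackDataset, a task mapping,
# and the DocBin(s) returned by `prepare_for_training`.
dataset = task_mapping = prepared_data = ...

trainer = ArgillaSpaCyTransformersTrainer(
    feedback_dataset=dataset,
    task_mapping=task_mapping,
    prepared_data=prepared_data,   # a DocBin, or a (train, dev) tuple of DocBins
    seed=42,
    gpu_id=0,
    update_transformer=False,      # keep the transformer weights frozen during training
)

# Lifecycle adapted from the removed docstring example.
trainer.update_config(max_epochs=10)   # tweak the generated spaCy training config
trainer.train()
trainer.save("./spacy-transformers-model")
```
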
2 changes: 1 addition & 1 deletion src/argilla/client/feedback/training/schemas.py
@@ -190,7 +190,7 @@ class TrainingTaskMappingForTextClassification(BaseModel, TrainingData):

@property
def supported_frameworks(self):
names = ["transformers", "spacy", "openai", "setfit", "peft", "spark-nlp"]
names = ["transformers", "spacy", "openai", "setfit", "peft", "spark-nlp", "spacy-transformers"]
return [Framework(name) for name in names]

@property
2 changes: 2 additions & 0 deletions src/argilla/client/models.py
@@ -44,6 +44,7 @@ class Framework(Enum):
peft: PEFT Transformers library
setfit: SetFit Transformers library
spacy: Spacy Explosion
spacy-transformers: Spacy Transformers Explosion library
span_marker: SpanMarker Tom Aarsen library
spark-nlp: Spark NLP John Snow Labs library
openai: OpenAI LLMs
Expand All @@ -53,6 +54,7 @@ class Framework(Enum):
PEFT = "peft"
SETFIT = "setfit"
SPACY = "spacy"
SPACY_TRANSFORMERS = "spacy-transformers"
SPAN_MARKER = "span_marker"
SPARK_NLP = "spark-nlp"
OPENAI = "openai"
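
A tiny sketch of how the new member resolves from the string users pass around; this is standard `Enum` behaviour rather than PR-specific logic.

```python
from argilla.client.models import Framework

# The string form used in `prepare_for_training(framework=...)` round-trips to the new member.
assert Framework("spacy-transformers") is Framework.SPACY_TRANSFORMERS
assert Framework.SPACY_TRANSFORMERS.value == "spacy-transformers"
```
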
20 changes: 19 additions & 1 deletion src/argilla/training/base.py
@@ -55,6 +55,7 @@ def __init__(
train_size: Optional[float] = None,
seed: Optional[int] = None,
gpu_id: Optional[int] = -1,
framework_kwargs: Optional[dict] = {},
**load_kwargs: Optional[dict],
) -> None:
"""
@@ -79,6 +80,7 @@ def __init__(
the GPU ID to use when training a SpaCy model. Defaults to -1, which means that the CPU
will be used by default. GPU IDs start at 0, which stands for the default GPU in the system,
if available.
framework_kwargs (dict): additional arguments for the framework.
**load_kwargs: arguments for the rg.load() function.
"""
self._name = name
@@ -115,7 +117,7 @@ def __init__(
self._settings = self.dataset_full._infer_settings_from_records()

framework = Framework(framework)
if framework is Framework.SPACY:
if framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
import spacy

self.dataset_full_prepared = self.dataset_full.prepare_for_training(
@@ -185,6 +187,22 @@ def __init__(
settings=self._settings,
seed=self._seed,
gpu_id=gpu_id,
**framework_kwargs, # freeze_tok2vec
)
elif framework is Framework.SPACY_TRANSFORMERS:
from argilla.training.spacy import ArgillaSpaCyTransformersTrainer

self._trainer = ArgillaSpaCyTransformersTrainer(
name=self._name,
workspace=self._workspace,
record_class=self._rg_dataset_type._RECORD_TYPE,
dataset=self.dataset_full_prepared,
model=self.model,
multi_label=self._multi_label,
settings=self._settings,
seed=self._seed,
gpu_id=gpu_id,
**framework_kwargs, # update_transformer
)
elif framework is Framework.OPENAI:
from argilla.training.openai import ArgillaOpenAITrainer
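
For the pre-feedback `ArgillaTrainer` in `argilla.training`, the same flags travel through `framework_kwargs` and are unpacked into the trainer, as the hunk above shows. A hedged usage sketch follows; the dataset and workspace names are hypothetical.

```python
from argilla.training import ArgillaTrainer

trainer = ArgillaTrainer(
    name="my-token-classification-dataset",   # hypothetical dataset name
    workspace="my-workspace",                  # hypothetical workspace
    framework="spacy-transformers",
    train_size=0.8,
    gpu_id=0,
    framework_kwargs={"update_transformer": False},  # unpacked into ArgillaSpaCyTransformersTrainer
)
# The analogous flag for the plain spaCy trainer would be framework_kwargs={"freeze_tok2vec": True}.

trainer.train(output_dir="./spacy-transformers-model")
```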