chore: Merge branch from develop #3969
Conversation
The URL of the deployed environment for this PR is https://argilla-quickstart-pr-3969-ki24f765kq-no.a.run.app
" dataset via the `FeedbackDataset.add_records` method first." | ||
) | ||
@abstractmethod | ||
def pull(self): |
Maybe would be a nice chance to rename `pull` to `pull_from_argilla`?
```diff
-from argilla.client.feedback.dataset.mixins import ArgillaMixin, UnificationMixin
 from argilla.client.feedback.schemas.enums import RecordSortField, ResponseStatusFilter, SortOrder
 from argilla.client.feedback.schemas.metadata import MetadataFilters
+from argilla.client.feedback.dataset.local.mixins import ArgillaMixin
```
Maybe also a nice chance to rename the `ArgillaMixin` to something more intuitive? e.g. `ArgillaPushAndPullMixin` or something more readable; otherwise we can probably split those into two separate mixins.
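If the split were preferred, a rough sketch of what the two mixins could look like; the class names below are hypothetical, following the `ArgillaPushAndPullMixin` idea above (`UnificationMixin` is the one already imported in the diff):

```python
from typing import Optional


class ArgillaPushMixin:
    """Hypothetical mixin holding only the push-side behaviour."""

    def push_to_argilla(self, name: str, workspace: Optional[str] = None) -> None:
        ...  # upload fields, questions, and records to the Argilla server


class ArgillaPullMixin:
    """Hypothetical mixin holding only the pull-side behaviour."""

    def pull_from_argilla(self) -> "FeedbackDataset":
        ...  # download the remote dataset into a local FeedbackDataset


class FeedbackDataset(ArgillaPushMixin, ArgillaPullMixin, UnificationMixin):
    ...
```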
```python
        lang: Optional[str] = None,
    ) -> Any:
        """
        Prepares the dataset for training for a specific training framework and NLP task by splitting the dataset into train and test sets.

        Args:
            framework: the framework to use for training. Currently supported frameworks are: `transformers`, `peft`,
                `setfit`, `spacy`, `spacy-transformers`, `span_marker`, `spark-nlp`, `openai`, `trl`, `sentence-transformers`.
            task: the NLP task to use for training. Currently supported tasks are: `TrainingTaskForTextClassification`,
                `TrainingTaskForSFT`, `TrainingTaskForRM`, `TrainingTaskForPPO`, `TrainingTaskForDPO`, `TrainingTaskForSentenceSimilarity`.
            train_size: the size of the train set. If `None`, the whole dataset will be used for training.
            test_size: the size of the test set. If `None`, the complement of `train_size` will be used.
            seed: the seed to use for splitting the dataset into train and test sets.
            lang: the spaCy language to use for training. If `None`, the language of the dataset will be used.
        """
        if isinstance(framework, str):
            framework = Framework(framework)

        # validate train and test sizes
        if train_size is None:
            train_size = 1
        if test_size is None:
            test_size = 1 - train_size

        # check if all numbers are larger than 0
        if not [abs(train_size), abs(test_size)] == [train_size, test_size]:
            raise ValueError("`train_size` and `test_size` must be larger than 0.")
        # check if train sizes sum up to 1
        if not (train_size + test_size) == 1:
            raise ValueError("`train_size` and `test_size` must sum to 1.")

        if test_size == 0:
            test_size = None

        if len(self.records) < 1:
            raise ValueError(
                "No records found in the dataset. Make sure you add records to the"
                " dataset via the `FeedbackDataset.add_records()` method first."
            )

        local_dataset = self.pull()
        if isinstance(task, (TrainingTaskForTextClassification, TrainingTaskForSentenceSimilarity)):
            if task.formatting_func is None:
                # in sentence-transformers models we can train without labels
                if task.label:
                    local_dataset = local_dataset.unify_responses(
                        question=task.label.question, strategy=task.label.strategy
                    )
        elif isinstance(task, TrainingTaskForQuestionAnswering):
            if task.formatting_func is None:
                local_dataset = self.unify_responses(question=task.answer.name, strategy="disagreement")
        elif not isinstance(
            task,
            (
                TrainingTaskForSFT,
                TrainingTaskForRM,
                TrainingTaskForPPO,
                TrainingTaskForDPO,
                TrainingTaskForChatCompletion,
            ),
        ):
            raise ValueError(f"Training data {type(task)} is not supported yet")

        data = task._format_data(local_dataset)
        if framework in [
            Framework.TRANSFORMERS,
            Framework.SETFIT,
            Framework.SPAN_MARKER,
            Framework.PEFT,
        ]:
            return task._prepare_for_training_with_transformers(
                data=data, train_size=train_size, seed=seed, framework=framework
            )
        elif framework in [Framework.SPACY, Framework.SPACY_TRANSFORMERS]:
            require_dependencies("spacy")
            import spacy

            if lang is None:
                _LOGGER.warning("spaCy `lang` is not provided. Using `en` (English) as default language.")
                lang = spacy.blank("en")
            elif isinstance(lang, str):
                if len(lang) == 2:
                    lang = spacy.blank(lang)
                else:
                    lang = spacy.load(lang)
            return task._prepare_for_training_with_spacy(data=data, train_size=train_size, seed=seed, lang=lang)
        elif framework is Framework.SPARK_NLP:
            return task._prepare_for_training_with_spark_nlp(data=data, train_size=train_size, seed=seed)
        elif framework is Framework.OPENAI:
            return task._prepare_for_training_with_openai(data=data, train_size=train_size, seed=seed)
        elif framework is Framework.TRL:
            return task._prepare_for_training_with_trl(data=data, train_size=train_size, seed=seed)
        elif framework is Framework.TRLX:
            return task._prepare_for_training_with_trlx(data=data, train_size=train_size, seed=seed)
        elif framework is Framework.SENTENCE_TRANSFORMERS:
            return task._prepare_for_training_with_sentence_transformers(data=data, train_size=train_size, seed=seed)
        else:
            raise NotImplementedError(
                f"Framework {framework} is not supported. Choose from: {[e.value for e in Framework]}"
            )
```
Shouldn't all that go into different mixins?
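For context, a minimal usage sketch of the `prepare_for_training` API shown in the diff above; the dataset, workspace, field, and question names here are hypothetical, and the exact return type depends on the chosen framework:

```python
from argilla.feedback import FeedbackDataset, TrainingTask

# Pull the dataset from Argilla; "my-dataset" and "my-workspace" are hypothetical names.
dataset = FeedbackDataset.from_argilla(name="my-dataset", workspace="my-workspace")

# Build a text-classification task from one field and one question of the dataset;
# the "text" field and "label" question are hypothetical.
task = TrainingTask.for_text_classification(
    text=dataset.field_by_name("text"),
    label=dataset.question_by_name("label"),
)

# Format and split the records for the chosen framework (80/20 split here);
# the return type varies per framework, e.g. datasets objects for setfit.
formatted = dataset.prepare_for_training(
    framework="setfit", task=task, train_size=0.8, seed=42
)
```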
```python
        )
        return self

    def pull(self) -> "FeedbackDataset":
```
Ditto (plus a `DeprecationWarning` on `pull`).
Suggested change:

```diff
-    def pull(self) -> "FeedbackDataset":
+    def pull_from_argilla(self) -> "FeedbackDataset":
```
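A minimal sketch of how the deprecation could be handled so existing callers keep working; the method body is illustrative only:

```python
import warnings


def pull(self) -> "FeedbackDataset":
    # Kept as a thin alias so existing code keeps working during the deprecation period.
    warnings.warn(
        "`pull` is deprecated and will be removed in a future release, "
        "use `pull_from_argilla` instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return self.pull_from_argilla()
```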
```python
__all__ = [
    "ArgillaDatasetCard",
    "size_categories_parser",
]
```
Why this change here?
When I added the model card, the classes currently in `model_card/__init__.py` were here. Then I moved the model card to its own module but forgot to restore this import to what it was originally.
@plaguss could you add a small PR to fix the comments from @alvarobartt?
@davidberenstein1957 here it is: #3983
```python
# See the License for the specific language governing permissions and
# limitations under the License.

from .model_card import (
```
Maybe we could use the absolute import here instead of the relative one.
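That is, assuming the module lives at `argilla.client.feedback.integrations.huggingface.model_card`, something like:

```python
# relative (current)
from .model_card import ArgillaDatasetCard, size_categories_parser

# absolute (suggested)
from argilla.client.feedback.integrations.huggingface.model_card import (
    ArgillaDatasetCard,
    size_categories_parser,
)
```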
Has this file been included in the `pyproject.toml` as a package file?
No, but it should. I will tackle these changes.
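For reference, one common way to ship non-Python files with a package is via `package-data` in `pyproject.toml`. This is only a hedged sketch assuming a setuptools build backend and a hypothetical template path; the actual build configuration in the repository may differ:

```toml
[tool.setuptools.package-data]
# hypothetical package path and glob; adjust to the real template location
"argilla.client.feedback.integrations.huggingface.model_card" = ["*.md"]
```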
…io/argilla into chore/merge-branch-from-develop
Co-authored-by: Alvaro Bartolome <[email protected]>
…returns key not found error with custom dataset #3970
Merged commit d913337 into feature/support-for-metadata-filtering-and-sorting
# Description

This PR addresses some pending fixes as a follow-up of #3969 (as pointed out by @alvarobartt), which were preventing the `ModelCard` from being generated once the package is uploaded to PyPI, since the file was kept locally and not included as part of the `package-files`. Besides that, this PR also applies some minor style improvements to keep consistency with the rest of the codebase.

**Type of change**

- [x] Bug fix (non-breaking change which fixes an issue)
Description

This is a manual merge for the latest changes in the `develop` branch.