
[ENHANCEMENT] [REFACTOR] optimise and refactor SDK ingestion methods #5107

Merged
merged 37 commits on Jul 4, 2024

Commits
10965d3
test: update tests for refactored mapping method
burtenshaw Jun 24, 2024
a416a2f
refactor: introduce independent mapping method and move logic to befo…
burtenshaw Jun 24, 2024
35db9f6
docs: update all doc strings in dataset records
burtenshaw Jun 25, 2024
eae088b
chore: improve typing and docs on type
burtenshaw Jun 25, 2024
4490d11
docs: wrong method in records api reference
burtenshaw Jun 25, 2024
b5b3396
feat: add exception for record ingestion
burtenshaw Jun 26, 2024
ffeb0b0
refactor: improve explainabilitity and readability in ingestion code …
burtenshaw Jun 26, 2024
594283e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 26, 2024
16f14d1
enhancement: move mapping out of record loop
burtenshaw Jun 26, 2024
db08548
[REFACTOR] Avoid autofetch when accessing settings (#5112)
frascuchon Jun 27, 2024
5f06e20
Merge branch 'spike/mapping-to-tuple' of https://github.com/argilla-i…
burtenshaw Jun 27, 2024
05df51a
enhancement: use just one progress bar
burtenshaw Jun 27, 2024
863dde2
chore: update typing of mapping
burtenshaw Jun 27, 2024
bf9e864
fix: move render mapping into infer record method
burtenshaw Jun 27, 2024
07aa249
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 27, 2024
8a6d484
fix: align add records parameters with render function
burtenshaw Jun 27, 2024
0b623fd
feat: implement ingestion mapping as class
burtenshaw Jul 2, 2024
14faccf
feat: use ingestion mapping class in dataset records not dataset records
burtenshaw Jul 2, 2024
e2bfc88
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 2, 2024
8889c0a
chore: tidy imports
burtenshaw Jul 2, 2024
7ad5075
Merge branch 'spike/mapping-to-tuple' of https://github.com/argilla-i…
burtenshaw Jul 2, 2024
63e0f7b
docs: update mapping parameters in how to guides
burtenshaw Jul 2, 2024
ecbdd4e
test: broaden suggestion mapping in test
burtenshaw Jul 2, 2024
99235b2
feat: extract dot notation with regex not string splitting
burtenshaw Jul 3, 2024
3ca8932
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 3, 2024
3eb8c0d
docs: typo in docs
burtenshaw Jul 3, 2024
b10fbe8
feat: improve record switch in ingest method
burtenshaw Jul 3, 2024
db27e1b
feat: refactor id mapping away from dict to type
burtenshaw Jul 3, 2024
716dfa8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 3, 2024
6a7e9eb
feat: use class methods for type and parameter values
burtenshaw Jul 3, 2024
ca1a394
Merge branch 'spike/mapping-to-tuple' of https://github.com/argilla-i…
burtenshaw Jul 3, 2024
40ff5b6
[REFACTOR] generate default mapping and extends it with custom mappin…
frascuchon Jul 4, 2024
667ad54
refactor: migrate mapper into module from file
burtenshaw Jul 4, 2024
94988eb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 4, 2024
2769793
fix: raise error not warn when mapped attribute is unknown
burtenshaw Jul 4, 2024
d83cf7a
Merge branch 'spike/mapping-to-tuple' of https://github.com/argilla-i…
burtenshaw Jul 4, 2024
0724166
Merge branch 'develop' into spike/mapping-to-tuple
frascuchon Jul 4, 2024
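Commit 99235b2 replaces string splitting with a regex when extracting dot-notation routes such as `my_label.suggestion.score`. A minimal, self-contained sketch of that idea — the pattern, function name, and return shape are illustrative assumptions, not the PR's exact code:

```python
import re

# Parse a mapped key like "my_label.suggestion.score" into up to three parts:
# the attribute (question/field name), the type ("suggestion"/"response"),
# and a sub-attribute ("score"/"agent"). Keys with more than three dot-separated
# parts fail to match and are rejected.
ROUTE_PATTERN = re.compile(r"^(?P<attribute>[^.]+)(?:\.(?P<type>[^.]+))?(?:\.(?P<sub>[^.]+))?$")


def parse_route(key: str):
    match = ROUTE_PATTERN.match(key)
    if match is None:
        raise ValueError(f"Invalid mapping key: {key!r}")
    return match.group("attribute"), match.group("type"), match.group("sub")
```

Compared with `key.split(".")`, the anchored regex rejects malformed keys outright instead of silently producing extra parts.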
35 changes: 21 additions & 14 deletions argilla/docs/how_to_guides/record.md
@@ -318,24 +318,31 @@ Suggestions refer to suggested responses (e.g. model predictions) that you can a
You can add suggestions as a dictionary, where the keys correspond to the `name`s of the labels that were configured for your dataset. Remember that you can also use the `mapping` parameter to specify the data structure.

```diff
 # Add records to the dataset with the label 'my_label'
 data = [
     {
         "question": "Do you need oxygen to breathe?",
         "answer": "Yes",
-        "my_label.suggestion": "positive",
-        "my_label.suggestion.score": 0.9,
-        "my_label.suggestion.agent": "model_name"
+        "label": "positive",
+        "score": 0.9,
+        "agent": "model_name",
     },
     {
         "question": "What is the boiling point of water?",
         "answer": "100 degrees Celsius",
-        "my_label.suggestion": "negative",
-        "my_label.suggestion.score": 0.9,
-        "my_label.suggestion.agent": "model_name"
+        "label": "negative",
+        "score": 0.9,
+        "agent": "model_name",
     },
 ]
-dataset.records.log(data)
+dataset.records.log(
+    data=data,
+    mapping={
+        "label": "my_label",
+        "score": "my_label.suggestion.score",
+        "agent": "my_label.suggestion.agent",
+    },
+)
```
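Conceptually, the `mapping` argument renames incoming keys to dataset attribute routes before ingestion. A self-contained sketch of that renaming step using plain dictionaries — this is an illustration of the idea, not Argilla's actual implementation:

```python
# Rename each row's source keys to their mapped destination routes;
# keys absent from the mapping pass through unchanged.
def apply_mapping(rows, mapping):
    return [{mapping.get(key, key): value for key, value in row.items()} for row in rows]


rows = [{"question": "Do you need oxygen to breathe?", "label": "positive", "score": 0.9}]
mapped = apply_mapping(rows, {"label": "my_label", "score": "my_label.suggestion.score"})
```

After this step, the routed keys can be matched against the dataset schema as fields, suggestions, or responses.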

### Responses
@@ -385,15 +392,15 @@ If your dataset includes some annotations, you can add those to the records as y

```diff
     {
         "question": "Do you need oxygen to breathe?",
         "answer": "Yes",
-        "my_label.response": "positive",
+        "label": "positive",
     },
     {
         "question": "What is the boiling point of water?",
         "answer": "100 degrees Celsius",
-        "my_label.response": "negative",
+        "label": "negative",
     },
 ]
-dataset.records.log(data, user_id=user.id)
+dataset.records.log(data, user_id=user.id, mapping={"label": "my_label.response"})
```

## List records
@@ -415,7 +422,7 @@ for record in dataset.records(

```diff
 # Access the responses of the record
 for response in record.responses:
-    print(record.["<question_name>"].value)
+    print(record["<question_name>"].value)
```

## Update records
Expand Down Expand Up @@ -460,8 +467,8 @@ dataset.records.log(records=updated_data)

for record in dataset.records():

record.vectors["new_vector"] = [...]
record.vector["v"] = [...]
record.vectors["new_vector"] = [ 0, 1, 2, 3, 4, 5 ]
record.vector["v"] = [ 0.1, 0.2, 0.3 ]

updated_records.append(record)

Expand Down
2 changes: 1 addition & 1 deletion argilla/docs/reference/argilla/records/records.md
@@ -12,7 +12,7 @@ The `Record` object is used to represent a single record in Argilla. It contains
To create records, you can use the `Record` class and pass it to the `Dataset.records.log` method. The `Record` class requires a `fields` parameter, which is a dictionary of field names and values. The field names must match the field names in the dataset's `Settings` object to be accepted.

```diff
-dataset.records.add(
+dataset.records.log(
     records=[
         rg.Record(
             fields={"text": "Hello World, how are you?"},
```
1 change: 1 addition & 0 deletions argilla/src/argilla/_exceptions/__init__.py
@@ -16,3 +16,4 @@

```diff
 from argilla._exceptions._metadata import * # noqa: F403
 from argilla._exceptions._serialization import * # noqa: F403
 from argilla._exceptions._settings import * # noqa: F403
+from argilla._exceptions._records import * # noqa: F403
```
19 changes: 19 additions & 0 deletions argilla/src/argilla/_exceptions/_records.py
@@ -0,0 +1,19 @@

```python
# Copyright 2024-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from argilla._exceptions._base import ArgillaErrorBase


class RecordsIngestionError(ArgillaErrorBase):
    pass
```
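The new `RecordsIngestionError` lets callers catch ingestion failures specifically rather than trapping bare exceptions. A self-contained sketch of the intended error path — the class names come from the PR, but the mapper below is a stand-in, not Argilla's `IngestedRecordMapper`:

```python
class ArgillaErrorBase(Exception):
    pass


class RecordsIngestionError(ArgillaErrorBase):
    pass


def ingest(records, mapper):
    # Wrap any per-record failure in RecordsIngestionError, mirroring what
    # DatasetRecords._ingest_records does after this PR.
    ingested = []
    for record in records:
        try:
            ingested.append(mapper(record))
        except Exception as e:
            raise RecordsIngestionError(f"Failed to ingest record from dict {record}: {e}") from e
    return ingested


def strict_mapper(record):
    # Stand-in mapper: raises KeyError when the expected key is missing.
    return {"question": record["question"]}
```

Callers can then write `except RecordsIngestionError:` around `dataset.records.log(...)` and still see the offending record in the message.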
2 changes: 1 addition & 1 deletion argilla/src/argilla/client.py
@@ -271,7 +271,7 @@ def __call__(self, name: str, workspace: Optional[Union["Workspace", str]] = Non

```diff
     for dataset in workspace.datasets:
         if dataset.name == name:
-            return dataset
+            return dataset.get()
     warnings.warn(f"Dataset {name} not found. Creating a new dataset. Do `dataset.create()` to create the dataset.")
     return Dataset(name=name, workspace=workspace, client=self._client, **kwargs)
```

9 changes: 6 additions & 3 deletions argilla/src/argilla/datasets/_resource.py
@@ -101,8 +101,6 @@ def records(self) -> "DatasetRecords":

```diff
     @property
     def settings(self) -> Settings:
-        if self._is_published() and self._settings.is_outdated:
-            self._settings.get()
         return self._settings

     @settings.setter
```
@@ -142,6 +140,11 @@ def schema(self) -> dict:

```diff
     # Core methods #
     #####################

+    def get(self) -> "Dataset":
+        super().get()
+        self.settings.get()
+        return self
+
     def exists(self) -> bool:
         """Checks if the dataset exists on the server
```
@@ -185,7 +188,7 @@ def _publish(self) -> "Dataset":

```diff
         self._settings.create()
         self._api.publish(dataset_id=self._model.id)

-        return self.get()  # type: ignore
+        return self.get()

     def _workspace_id_from_name(self, workspace: Optional[Union["Workspace", str]]) -> UUID:
         if workspace is None:
```
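Together with #5112, this changes `dataset.settings` from an auto-fetching property into a plain accessor, with `Dataset.get()` now refreshing settings explicitly. A toy model of the before/after behavior — the classes below are illustrative stand-ins, not the SDK's:

```python
class Settings:
    def __init__(self):
        self.fetch_count = 0

    def get(self):
        self.fetch_count += 1  # stands in for a network round-trip
        return self


class Dataset:
    def __init__(self):
        self._settings = Settings()

    @property
    def settings(self):
        # After the refactor: no hidden fetch on attribute access.
        return self._settings

    def get(self):
        # Explicit refresh pulls settings exactly once.
        self.settings.get()
        return self


dataset = Dataset()
_ = dataset.settings  # no fetch triggered
dataset.get()         # one explicit fetch
```

The design choice moves the network cost to one well-known call site instead of scattering it across every property read.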
161 changes: 37 additions & 124 deletions argilla/src/argilla/records/_dataset_records.py
@@ -11,8 +11,7 @@

```diff
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import warnings
-from collections import defaultdict
+
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
 from uuid import UUID
```
@@ -21,16 +20,13 @@

```diff
 from argilla._api import RecordsAPI
 from argilla._helpers import LoggingMixin
-from argilla._models import RecordModel, MetadataValue, VectorValue, FieldValue
+from argilla._models import RecordModel
+from argilla._exceptions import RecordsIngestionError
 from argilla.client import Argilla
 from argilla.records._io import GenericIO, HFDataset, HFDatasetsIO, JsonIO
+from argilla.records._mapping import IngestedRecordMapper
 from argilla.records._resource import Record
 from argilla.records._search import Query
-from argilla.responses import Response
-from argilla.settings import TextField, VectorField
-from argilla.settings._metadata import MetadataPropertyBase
-from argilla.settings._question import QuestionPropertyBase
-from argilla.suggestions import Suggestion

 if TYPE_CHECKING:
     from argilla.datasets import Dataset
```
@@ -188,8 +184,8 @@ def __call__(

```diff
         self._validate_vector_names(vector_names=with_vectors)

         return DatasetRecordsIterator(
-            self.__dataset,
-            self.__client,
+            dataset=self.__dataset,
+            client=self.__client,
             query=query,
             batch_size=batch_size,
             start_offset=start_offset,
```
@@ -208,7 +204,7 @@ def __repr__(self) -> str:

```diff
     def log(
         self,
         records: Union[List[dict], List[Record], HFDataset],
-        mapping: Optional[Dict[str, str]] = None,
+        mapping: Optional[Dict[str, Union[str, Sequence[str]]]] = None,
         user_id: Optional[UUID] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
     ) -> "DatasetRecords":
```
@@ -222,12 +218,12 @@ def log(

```diff
                 If records are defined as a dictionaries or a dataset, the keys/ column names should correspond to the
                 fields in the Argilla dataset's fields and questions. `id` should be provided to identify the records when updating.
             mapping: A dictionary that maps the keys/ column names in the records to the fields or questions in the Argilla dataset.
+                To assign an incoming key or column to multiple fields or questions, provide a list or tuple of field or question names.
             user_id: The user id to be associated with the records' response. If not provided, the current user id is used.
             batch_size: The number of records to send in each batch. The default is 256.

         Returns:
             A list of Record objects representing the updated records.
-
         """
         record_models = self._ingest_records(records=records, mapping=mapping, user_id=user_id or self.__client.me.id)
         batch_size = self._normalize_batch_size(
```
@@ -238,8 +234,8 @@

```diff
         created_or_updated = []
         records_updated = 0
+
         for batch in tqdm(
-            iterable=range(0, len(records), batch_size), desc="Adding and updating records", unit="batch"
+            iterable=range(0, len(records), batch_size),
+            desc="Sending records...",
+            total=len(records) // batch_size,
+            unit="batch",
         ):
             self._log_message(message=f"Sending records from {batch} to {batch + batch_size}.")
             batch_records = record_models[batch : batch + batch_size]
```
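The progress-bar change above keeps the same batching arithmetic: `range(0, len(records), batch_size)` paired with slicing. A self-contained sketch of that slicing pattern, without tqdm or the API call:

```python
def iter_batches(items, batch_size):
    # Yield successive slices of at most batch_size items, mirroring the
    # record_models[batch : batch + batch_size] slicing in log().
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


batches = list(iter_batches(list(range(10)), batch_size=4))
# → [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```

Note that `total=len(records) // batch_size` in the diff rounds down, so a trailing partial batch is not counted in the bar's total; the slices themselves still cover every record.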
@@ -357,26 +357,36 @@ def to_datasets(self) -> HFDataset:

```diff
     def _ingest_records(
         self,
-        records: Union[List[Dict[str, Any]], Dict[str, Any], List[Record], Record, HFDataset],
-        mapping: Optional[Dict[str, str]] = None,
+        records: Union[List[Dict[str, Any]], List[Record], HFDataset],
+        mapping: Optional[Dict[str, Union[str, Sequence[str]]]] = None,
         user_id: Optional[UUID] = None,
     ) -> List[RecordModel]:
         """Ingests records from a list of dictionaries, a Hugging Face Dataset, or a list of Record objects."""
+
+        if len(records) == 0:
+            raise ValueError("No records provided to ingest.")
+
         if HFDatasetsIO._is_hf_dataset(dataset=records):
             records = HFDatasetsIO._record_dicts_from_datasets(dataset=records)
-        if all(map(lambda r: isinstance(r, dict), records)):
-            # Records as flat dicts of values to be matched to questions as suggestion or response
-            records = [self._infer_record_from_mapping(data=r, mapping=mapping, user_id=user_id) for r in records]  # type: ignore
-        elif all(map(lambda r: isinstance(r, Record), records)):
-            for record in records:
-                record.dataset = self.__dataset
-        else:
-            raise ValueError(
-                "Records should be a a list Record instances, "
-                "a Hugging Face Dataset, or a list of dictionaries representing the records."
-            )
-        return [record.api_model() for record in records]
+
+        ingested_records = []
+        record_mapper = IngestedRecordMapper(mapping=mapping, dataset=self.__dataset, user_id=user_id)
+        for record in records:
+            try:
+                if not isinstance(record, Record):
+                    record = record_mapper(data=record)
+                elif isinstance(record, Record):
+                    record.dataset = self.__dataset
+                else:
+                    raise ValueError(
+                        "Records should be a a list Record instances, "
+                        "a Hugging Face Dataset, or a list of dictionaries representing the records."
+                        f"Found a record of type {type(record)}: {record}."
+                    )
+            except Exception as e:
+                raise RecordsIngestionError(f"Failed to ingest record from dict {record}: {e}")
+            ingested_records.append(record.api_model())
+        return ingested_records

     def _normalize_batch_size(self, batch_size: int, records_length, max_value: int):
         norm_batch_size = min(batch_size, records_length, max_value)
```
@@ -397,100 +407,3 @@ def _validate_vector_names(self, vector_names: Union[List[str], str]) -> None:

```diff
                 continue
             if vector_name not in self.__dataset.schema:
                 raise ValueError(f"Vector field {vector_name} not found in dataset schema.")
-
-    def _infer_record_from_mapping(
-        self,
-        data: dict,
-        mapping: Optional[Dict[str, str]] = None,
-        user_id: Optional[UUID] = None,
-    ) -> "Record":
-        """Converts a mapped record dictionary to a Record object for use by the add or update methods.
-        Args:
-            dataset: The dataset object to which the record belongs.
-            data: A dictionary representing the record.
-            mapping: A dictionary mapping source data keys to Argilla fields, questions, and ids.
-            user_id: The user id to associate with the record responses.
-        Returns:
-            A Record object.
-        """
-        record_id: Optional[str] = None
-
-        fields: Dict[str, FieldValue] = {}
-        vectors: Dict[str, VectorValue] = {}
-        metadata: Dict[str, MetadataValue] = {}
-
-        responses: List[Response] = []
-        suggestion_values: Dict[str, dict] = defaultdict(dict)
-
-        schema = self.__dataset.schema
-
-        for attribute, value in data.items():
-            schema_item = schema.get(attribute)
-            attribute_type = None
-            sub_attribute = None
-
-            # Map source data keys using the mapping
-            if mapping and attribute in mapping:
-                attribute_mapping = mapping.get(attribute)
-                attribute_mapping = attribute_mapping.split(".")
-                attribute = attribute_mapping[0]
-                schema_item = schema.get(attribute)
-                if len(attribute_mapping) > 1:
-                    attribute_type = attribute_mapping[1]
-                if len(attribute_mapping) > 2:
-                    sub_attribute = attribute_mapping[2]
-            elif schema_item is mapping is None and attribute != "id":
-                warnings.warn(
-                    message=f"""Record attribute {attribute} is not in the schema so skipping.
-                    Define a mapping to map source data fields to Argilla Fields, Questions, and ids
-                    """
-                )
-                continue
-
-            if attribute == "id":
-                record_id = value
-                continue
-
-            # Add suggestion values to the suggestions
-            if attribute_type == "suggestion":
-                if sub_attribute in ["score", "agent"]:
-                    suggestion_values[attribute][sub_attribute] = value
-
-                elif sub_attribute is None:
-                    suggestion_values[attribute].update(
-                        {"value": value, "question_name": attribute, "question_id": schema_item.id}
-                    )
-                else:
-                    warnings.warn(
-                        message=f"Record attribute {sub_attribute} is not a valid suggestion sub_attribute so skipping."
-                    )
-                continue
-
-            # Assign the value to question, field, or response based on schema item
-            if isinstance(schema_item, TextField):
-                fields[attribute] = value
-            elif isinstance(schema_item, QuestionPropertyBase) and attribute_type == "response":
-                responses.append(Response(question_name=attribute, value=value, user_id=user_id))
-            elif isinstance(schema_item, QuestionPropertyBase) and attribute_type is None:
-                suggestion_values[attribute].update(
-                    {"value": value, "question_name": attribute, "question_id": schema_item.id}
-                )
-            elif isinstance(schema_item, VectorField):
-                vectors[attribute] = value
-            elif isinstance(schema_item, MetadataPropertyBase):
-                metadata[attribute] = value
-            else:
-                warnings.warn(message=f"Record attribute {attribute} is not in the schema or mapping so skipping.")
-                continue
-
-        suggestions = [Suggestion(**suggestion_dict) for suggestion_dict in suggestion_values.values()]
-
-        return Record(
-            id=record_id,
-            fields=fields,
-            vectors=vectors,
-            metadata=metadata,
-            suggestions=suggestions,
-            responses=responses,
-            _dataset=self.__dataset,
-        )
```
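With the widened `mapping: Optional[Dict[str, Union[str, Sequence[str]]]]` signature, one incoming column can now feed several fields or questions. A sketch of how such a mapping might be normalized into (source, destination) pairs — this is illustrative, not the `IngestedRecordMapper` internals:

```python
from typing import Dict, List, Sequence, Tuple, Union


def expand_mapping(mapping: Dict[str, Union[str, Sequence[str]]]) -> List[Tuple[str, str]]:
    # Normalize each entry to one (source, destination) pair per destination,
    # treating a bare string as a single-destination sequence.
    pairs = []
    for source, destinations in mapping.items():
        if isinstance(destinations, str):
            destinations = [destinations]
        pairs.extend((source, destination) for destination in destinations)
    return pairs


pairs = expand_mapping({"text": ("question", "context"), "label": "my_label"})
# → [("text", "question"), ("text", "context"), ("label", "my_label")]
```

Checking `isinstance(destinations, str)` before iterating matters here: strings are themselves sequences, so without it each character would become a destination.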