fix: format_as("datasets") when no responses (#3224)

# Description As just reported by @dvsrepo, the following flow: `FeedbackDataset.from_argilla() -> .format_as("datasets")` was failing under some scenarios where either the optional responses had no value or there were no responses at all, as either we were trying to retrieve those when any, or we were setting it's value to `None` if not found, instead of `{"user_id": None, "values": None, "status": None}` which is what 🤗 Datasets is producing based on the features. So on, this PR solves that bug and now the flow mentioned above is possible and works as expected! **Type of change** - [X] Bug fix (non-breaking change which fixes an issue) **How Has This Been Tested** - [X] To also cover the scenario when there are no responses, the fixture `feedback_dataset_records` has been modified to contain a record with and another without responses **Checklist** - [X] I have merged the original branch into my forked branch - [X] follows the style guidelines of this project - [X] I did a self-review of my code - [X] My changes generate no new warnings - [X] I have added tests that prove my fix is effective or that my feature works - [X] I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/)
argilla-io · Jun 20, 2023 · 36819ec · 36819ec
1 parent af18560
commit 36819ec
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,8 @@ These are the section headers that we use:
 ### Fixed
 
 - Replaced `np.float` alias by `float` to avoid `AttributeError` when using `find_label_errors` function with `numpy>=1.24.0` ([#3214](https://github.com/argilla-io/argilla/pull/3214)).
+- Fixed `format_as("datasets")` when no responses or optional respones in `FeedbackRecord`, to set their value to what 🤗 Datasets expects instead of just `None` ([#3224](https://github.com/argilla-io/argilla/pull/3224)).
+- Fixed `push_to_huggingface()` when `generate_card=True` (default behaviour), as we were passing a sample record to the `ArgillaDatasetCard` class, and `UUID`s introduced in 1.10.0 ([#3192](https://github.com/argilla-io/argilla/pull/3192)), are not JSON-serializable ([#3231](https://github.com/argilla-io/argilla/pull/3231)).
 
 ### Added
 

diff --git a/src/argilla/client/feedback/dataset.py b/src/argilla/client/feedback/dataset.py
@@ -11,6 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+
 import json
 import logging
 import tempfile
@@ -752,15 +753,16 @@ def format_as(self, format: Literal["datasets"]) -> "Dataset":
                     dataset[field.name].append(record.fields[field.name])
                 for question in self.questions:
                     dataset[question.name].append(
-                        [
-                            {
-                                "user_id": r.user_id,
-                                "value": r.values[question.name].value,
-                                "status": r.status,
-                            }
-                            for r in record.responses
-                        ]
-                        or None
+                        {
+                            "user_id": [r.user_id for r in record.responses],
+                            "value": [
+                                r.values[question.name].value if question.name in r.values else None
+                                for r in record.responses
+                            ],
+                            "status": [r.status for r in record.responses],
+                        }
+                        if record.responses
+                        else None
                     )
                 dataset["metadata"].append(json.dumps(record.metadata) if record.metadata else None)
                 dataset["external_id"].append(record.external_id or None)
@@ -838,7 +840,7 @@ def push_to_huggingface(self, repo_id: str, generate_card: Optional[bool] = True
                 argilla_fields=self.fields,
                 argilla_questions=self.questions,
                 argilla_guidelines=self.guidelines,
-                argilla_record=self.records[0].dict(),
+                argilla_record=json.loads(self.records[0].json()),
                 huggingface_record=hfds[0],
             )
             card.push_to_hub(repo_id, repo_type="dataset", token=kwargs.get("token"))
@@ -887,7 +889,7 @@ def from_huggingface(cls, repo_id: str, *args: Any, **kwargs: Any) -> "FeedbackD
             repo_type="dataset",
             **hub_auth,
         )
-        with open(config_path, "rb") as f:
+        with open(config_path, "r") as f:
             config = FeedbackDatasetConfig.parse_raw(f.read())
 
         cls = cls(

diff --git a/src/argilla/client/feedback/utils.py b/src/argilla/client/feedback/utils.py
@@ -12,7 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
 from pydantic import (
     BaseModel,

diff --git a/tests/client/conftest.py b/tests/client/conftest.py
@@ -428,17 +428,6 @@ def feedback_dataset_records() -> List[FeedbackRecord]:
         ),
         FeedbackRecord(
             fields={"text": "This is a negative example", "label": "negative"},
-            responses=[
-                {
-                    "values": {
-                        "question-1": {"value": "This is a response to question 1"},
-                        "question-2": {"value": 1},
-                        "question-3": {"value": "a"},
-                        "question-4": {"value": ["a", "b"]},
-                    },
-                    "status": "submitted",
-                }
-            ],
             metadata={"another unit": "test"},
             external_id="2",
         ),