Support standalone yaml (#6557)
* support standalone yaml

* add test

* fix file name

* move to config.py
lhoestq authored Jan 11, 2024
1 parent 4a5b7d9 commit 9d6d161
Showing 7 changed files with 72 additions and 17 deletions.
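In short: a dataset repository can now keep its YAML metadata (configs, dataset_info, tags, ...) in a standalone .huggingface.yaml file instead of, or in addition to, the README.md front matter, and top-level keys from the standalone file take precedence when both are present. A minimal sketch of the intended usage on a hypothetical local dataset (directory name, files and values are illustrative only, not part of this commit):

from pathlib import Path

from datasets import load_dataset_builder

root = Path("my_dataset")  # hypothetical directory
root.mkdir(exist_ok=True)
(root / "train.csv").write_text("text,label\nhello,positive\nworld,negative\n")

# The card can now be prose-only ...
(root / "README.md").write_text("# My dataset\n\nDescribed in prose, no YAML front matter.\n")

# ... while the YAML metadata lives in a standalone file next to it.
(root / ".huggingface.yaml").write_text(
    "configs:\n"
    "- config_name: default\n"
    "  data_files:\n"
    "  - split: train\n"
    "    path: train.csv\n"
)

builder = load_dataset_builder(str(root))
print(builder.config.data_files)  # the config defined in .huggingface.yaml is picked up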
10 changes: 7 additions & 3 deletions src/datasets/arrow_dataset.py
@@ -5390,7 +5390,7 @@ def push_to_hub(
repo_splits = [] # use a list to keep the order of the splits
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
- if repo_file.rfilename == "README.md":
+ if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
repo_with_dataset_infos = True
@@ -5421,7 +5421,9 @@ def push_to_hub(
)
# get the info from the README to update them
if repo_with_dataset_card:
- dataset_card_path = api.hf_hub_download(repo_id, "README.md", repo_type="dataset", revision=revision)
+ dataset_card_path = api.hf_hub_download(
+ repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision
+ )
dataset_card = DatasetCard.load(Path(dataset_card_path))
dataset_card_data = dataset_card.data
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
@@ -5523,7 +5525,9 @@ def push_to_hub(
DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data)
dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
- additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()))
+ additions.append(
+ CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+ )

commit_message = commit_message if commit_message is not None else "Upload dataset"
if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
6 changes: 4 additions & 2 deletions src/datasets/commands/test.py
@@ -162,7 +162,9 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:
# Let's move it to the original directory of the dataset script, to allow the user to
# upload them on S3 at the same time afterwards.
if self._save_infos:
- dataset_readme_path = os.path.join(builder_cls.get_imported_module_dir(), "README.md")
+ dataset_readme_path = os.path.join(
+ builder_cls.get_imported_module_dir(), datasets.config.REPOCARD_FILENAME
+ )
name = Path(path).name + ".py"
combined_path = os.path.join(path, name)
if os.path.isfile(path):
@@ -177,7 +179,7 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:

# Move dataset_info back to the user
if dataset_dir is not None:
- user_dataset_readme_path = os.path.join(dataset_dir, "README.md")
+ user_dataset_readme_path = os.path.join(dataset_dir, datasets.config.REPOCARD_FILENAME)
copyfile(dataset_readme_path, user_dataset_readme_path)
print(f"Dataset card saved at {user_dataset_readme_path}")

2 changes: 2 additions & 0 deletions src/datasets/config.py
@@ -230,6 +230,8 @@
METRIC_INFO_FILENAME = "metric_info.json"
DATASETDICT_JSON_FILENAME = "dataset_dict.json"
METADATA_CONFIGS_FIELD = "configs"
+ REPOCARD_FILENAME = "README.md"
+ REPOYAML_FILENAME = ".huggingface.yaml"

MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

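Both filenames are now defined once in datasets.config, so call sites reference constants instead of hard-coded strings. A quick illustrative check of the new values (the constants are real, the snippet is only a sketch):

import datasets

print(datasets.config.REPOCARD_FILENAME)  # "README.md"
print(datasets.config.REPOYAML_FILENAME)  # ".huggingface.yaml"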
10 changes: 7 additions & 3 deletions src/datasets/dataset_dict.py
@@ -1729,7 +1729,7 @@ def push_to_hub(
deletions = []
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
- if repo_file.rfilename == "README.md":
+ if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
repo_with_dataset_infos = True
@@ -1750,7 +1750,9 @@ def push_to_hub(

# get the info from the README to update them
if repo_with_dataset_card:
- dataset_card_path = api.hf_hub_download(repo_id, "README.md", repo_type="dataset", revision=revision)
+ dataset_card_path = api.hf_hub_download(
+ repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision
+ )
dataset_card = DatasetCard.load(Path(dataset_card_path))
dataset_card_data = dataset_card.data
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
@@ -1800,7 +1802,9 @@ def push_to_hub(
DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data)
dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
- additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()))
+ additions.append(
+ CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+ )

commit_message = commit_message if commit_message is not None else "Upload dataset"
if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
6 changes: 3 additions & 3 deletions src/datasets/info.py
@@ -397,7 +397,7 @@ class DatasetInfosDict(Dict[str, DatasetInfo]):
def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
total_dataset_infos = {}
dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
- dataset_readme_path = os.path.join(dataset_infos_dir, "README.md")
+ dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
if not overwrite:
total_dataset_infos = self.from_directory(dataset_infos_dir)
total_dataset_infos.update(self)
@@ -426,8 +426,8 @@ def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
# Load the info from the YAML part of README.md
- if os.path.exists(os.path.join(dataset_infos_dir, "README.md")):
- dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / "README.md").data
+ if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
+ dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
if "dataset_info" in dataset_card_data:
return cls.from_dataset_card_data(dataset_card_data)
if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
37 changes: 31 additions & 6 deletions src/datasets/load.py
@@ -32,6 +32,7 @@

import fsspec
import requests
+ import yaml
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem

from . import config
@@ -928,7 +929,7 @@ def get_module(self) -> DatasetModule:
)
# get script and other files
dataset_infos_path = Path(self.path).parent / config.DATASETDICT_INFOS_FILENAME
- dataset_readme_path = Path(self.path).parent / "README.md"
+ dataset_readme_path = Path(self.path).parent / config.REPOCARD_FILENAME
imports = get_imports(self.path)
local_imports = _download_additional_modules(
name=self.name,
@@ -940,7 +941,7 @@ def get_module(self) -> DatasetModule:
if dataset_infos_path.is_file():
additional_files.append((config.DATASETDICT_INFOS_FILENAME, str(dataset_infos_path)))
if dataset_readme_path.is_file():
- additional_files.append(("README.md", dataset_readme_path))
+ additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path))
# copy the script and the files in an importable directory
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
hash = files_to_hash([self.path] + [loc[1] for loc in local_imports])
@@ -1003,8 +1004,16 @@ def __init__(
self.download_mode = download_mode

def get_module(self) -> DatasetModule:
- readme_path = os.path.join(self.path, "README.md")
+ readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
+ standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
+ if os.path.exists(standalone_yaml_path):
+ with open(standalone_yaml_path, "r", encoding="utf-8") as f:
+ standalone_yaml_data = yaml.safe_load(f.read())
+ if standalone_yaml_data:
+ _dataset_card_data_dict = dataset_card_data.to_dict()
+ _dataset_card_data_dict.update(standalone_yaml_data)
+ dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
# we need a set of data files to find which dataset builder to use
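In this local, script-less code path, get_module now overlays the standalone YAML on top of the README front matter: the README is read first, then top-level keys from .huggingface.yaml replace keys of the same name (a plain dict update, not a deep merge). A hypothetical helper, not part of the library, restating that inline logic:

import os

import yaml
from huggingface_hub import DatasetCard, DatasetCardData


def load_card_data(dataset_dir: str) -> DatasetCardData:
    # Read the README.md front matter if the card exists.
    readme_path = os.path.join(dataset_dir, "README.md")
    card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
    # Overlay the standalone YAML: its top-level keys win over the README's.
    yaml_path = os.path.join(dataset_dir, ".huggingface.yaml")
    if os.path.isfile(yaml_path):
        with open(yaml_path, encoding="utf-8") as f:
            standalone = yaml.safe_load(f) or {}
        merged = card_data.to_dict()
        merged.update(standalone)
        card_data = DatasetCardData(**merged)
    return card_data

Keys that exist only in the README (for example license) are preserved, which is exactly what the new test at the bottom of this commit asserts.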
@@ -1190,12 +1199,28 @@ def get_module(self) -> DatasetModule:
download_config.download_desc = "Downloading readme"
try:
dataset_readme_path = cached_path(
- hf_hub_url(self.name, "README.md", revision=revision),
+ hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision),
download_config=download_config,
)
dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
except FileNotFoundError:
dataset_card_data = DatasetCardData()
+ download_config = self.download_config.copy()
+ if download_config.download_desc is None:
+ download_config.download_desc = "Downloading standalone yaml"
+ try:
+ standalone_yaml_path = cached_path(
+ hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision),
+ download_config=download_config,
+ )
+ with open(standalone_yaml_path, "r", encoding="utf-8") as f:
+ standalone_yaml_data = yaml.safe_load(f.read())
+ if standalone_yaml_data:
+ _dataset_card_data_dict = dataset_card_data.to_dict()
+ _dataset_card_data_dict.update(standalone_yaml_data)
+ dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
+ except FileNotFoundError:
+ pass
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
# we need a set of data files to find which dataset builder to use
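The Hub code path mirrors the local one: .huggingface.yaml is fetched from the repo next to README.md when it exists, and a missing file is silently ignored (the FileNotFoundError is swallowed). Nothing changes in the public API; loading the fixture repository used by the new integration test below picks configs and dataset_info up from the standalone file. A short illustrative usage, assuming network access to the Hub:

from datasets import load_dataset_builder

# Fixture repo from the new test; any Hub dataset that ships a .huggingface.yaml
# next to its README.md is handled the same way.
builder = load_dataset_builder("datasets-maintainers/dataset-with-standalone-yaml")
print(builder.info.features["label"])  # ClassLabel defined in the standalone yaml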
@@ -1411,7 +1436,7 @@ def download_dataset_readme_file(self) -> str:
return None

def download_dataset_readme_file(self) -> str:
- readme_url = hf_hub_url(self.name, "README.md", revision=self.revision)
+ readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
# Download the dataset infos file if available
download_config = self.download_config.copy()
if download_config.download_desc is None:
@@ -1448,7 +1473,7 @@ def get_module(self) -> DatasetModule:
if dataset_infos_path:
additional_files.append((config.DATASETDICT_INFOS_FILENAME, dataset_infos_path))
if dataset_readme_path:
- additional_files.append(("README.md", dataset_readme_path))
+ additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path))
# copy the script and the files in an importable directory
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
hash = files_to_hash([local_path] + [loc[1] for loc in local_imports])
18 changes: 18 additions & 0 deletions tests/test_load.py
@@ -1688,3 +1688,21 @@ def test_reload_old_cache_from_2_15(tmp_path: Path):
cache_dir / "polinaeterna___audiofolder_two_configs_in_metadata" / "v2" / "0.0.0" / str(builder.hash)
).as_posix()
) # new cache


+ @pytest.mark.integration
+ def test_update_dataset_card_data_with_standalone_yaml():
+ # Labels defined in .huggingface.yaml because they are too long to be in README.md
+ from datasets.utils.metadata import MetadataConfigs
+
+ with patch(
+ "datasets.utils.metadata.MetadataConfigs.from_dataset_card_data",
+ side_effect=MetadataConfigs.from_dataset_card_data,
+ ) as card_data_read_mock:
+ builder = load_dataset_builder("datasets-maintainers/dataset-with-standalone-yaml")
+ assert card_data_read_mock.call_args.args[0]["license"] is not None  # from README.md
+ assert card_data_read_mock.call_args.args[0]["dataset_info"] is not None  # from standalone yaml
+ assert card_data_read_mock.call_args.args[0]["tags"] == ["test"]  # standalone yaml has precedence
+ assert isinstance(
+ builder.info.features["label"], datasets.ClassLabel
+ )  # correctly loaded from long labels list in standalone yaml
