diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 6ed7de81c12..64a29ec7e59 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -5390,7 +5390,7 @@ def push_to_hub(
         repo_splits = []  # use a list to keep the order of the splits
         repo_files_to_add = [addition.path_in_repo for addition in additions]
         for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
-            if repo_file.rfilename == "README.md":
+            if repo_file.rfilename == config.REPOCARD_FILENAME:
                 repo_with_dataset_card = True
             elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
                 repo_with_dataset_infos = True
@@ -5421,7 +5421,9 @@ def push_to_hub(
             )
         # get the info from the README to update them
        if repo_with_dataset_card:
-            dataset_card_path = api.hf_hub_download(repo_id, "README.md", repo_type="dataset", revision=revision)
+            dataset_card_path = api.hf_hub_download(
+                repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision
+            )
             dataset_card = DatasetCard.load(Path(dataset_card_path))
             dataset_card_data = dataset_card.data
             metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
@@ -5523,7 +5525,9 @@ def push_to_hub(
         DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
         MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data)
         dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
-        additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()))
+        additions.append(
+            CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+        )

         commit_message = commit_message if commit_message is not None else "Upload dataset"
         if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
diff --git a/src/datasets/commands/test.py b/src/datasets/commands/test.py
index 2f194782142..0754158c198 100644
--- a/src/datasets/commands/test.py
+++ b/src/datasets/commands/test.py
@@ -162,7 +162,9 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:
             # Let's move it to the original directory of the dataset script, to allow the user to
             # upload them on S3 at the same time afterwards.
             if self._save_infos:
-                dataset_readme_path = os.path.join(builder_cls.get_imported_module_dir(), "README.md")
+                dataset_readme_path = os.path.join(
+                    builder_cls.get_imported_module_dir(), datasets.config.REPOCARD_FILENAME
+                )
                 name = Path(path).name + ".py"
                 combined_path = os.path.join(path, name)
                 if os.path.isfile(path):
@@ -177,7 +179,7 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:

                 # Move dataset_info back to the user
                 if dataset_dir is not None:
-                    user_dataset_readme_path = os.path.join(dataset_dir, "README.md")
+                    user_dataset_readme_path = os.path.join(dataset_dir, datasets.config.REPOCARD_FILENAME)
                     copyfile(dataset_readme_path, user_dataset_readme_path)
                     print(f"Dataset card saved at {user_dataset_readme_path}")
diff --git a/src/datasets/config.py b/src/datasets/config.py
index cbced9636a2..32127bea7dc 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -230,6 +230,8 @@
 METRIC_INFO_FILENAME = "metric_info.json"
 DATASETDICT_JSON_FILENAME = "dataset_dict.json"
 METADATA_CONFIGS_FIELD = "configs"
+REPOCARD_FILENAME = "README.md"
+REPOYAML_FILENAME = ".huggingface.yaml"

 MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 6ca6ec79a83..06fd124e2c3 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -1729,7 +1729,7 @@ def push_to_hub(
         deletions = []
         repo_files_to_add = [addition.path_in_repo for addition in additions]
         for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
-            if repo_file.rfilename == "README.md":
+            if repo_file.rfilename == config.REPOCARD_FILENAME:
                 repo_with_dataset_card = True
             elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
                 repo_with_dataset_infos = True
@@ -1750,7 +1750,9 @@ def push_to_hub(

         # get the info from the README to update them
         if repo_with_dataset_card:
-            dataset_card_path = api.hf_hub_download(repo_id, "README.md", repo_type="dataset", revision=revision)
+            dataset_card_path = api.hf_hub_download(
+                repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision
+            )
             dataset_card = DatasetCard.load(Path(dataset_card_path))
             dataset_card_data = dataset_card.data
             metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
@@ -1800,7 +1802,9 @@ def push_to_hub(
         DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
         MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data)
         dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
-        additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()))
+        additions.append(
+            CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+        )

         commit_message = commit_message if commit_message is not None else "Upload dataset"
         if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
diff --git a/src/datasets/info.py b/src/datasets/info.py
index e17477d636d..bab49e5deae 100644
--- a/src/datasets/info.py
+++ b/src/datasets/info.py
@@ -397,7 +397,7 @@ class DatasetInfosDict(Dict[str, DatasetInfo]):
     def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
         total_dataset_infos = {}
         dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
-        dataset_readme_path = os.path.join(dataset_infos_dir, "README.md")
+        dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
         if not overwrite:
             total_dataset_infos = self.from_directory(dataset_infos_dir)
         total_dataset_infos.update(self)
@@ -426,8 +426,8 @@ def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=Fa
     def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
         logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
         # Load the info from the YAML part of README.md
-        if os.path.exists(os.path.join(dataset_infos_dir, "README.md")):
-            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / "README.md").data
+        if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
+            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
             if "dataset_info" in dataset_card_data:
                 return cls.from_dataset_card_data(dataset_card_data)
         if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
diff --git a/src/datasets/load.py b/src/datasets/load.py
index 9e2efa9883d..0b8839f0406 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -32,6 +32,7 @@

 import fsspec
 import requests
+import yaml
 from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem

 from . import config
@@ -928,7 +929,7 @@ def get_module(self) -> DatasetModule:
         )
         # get script and other files
         dataset_infos_path = Path(self.path).parent / config.DATASETDICT_INFOS_FILENAME
-        dataset_readme_path = Path(self.path).parent / "README.md"
+        dataset_readme_path = Path(self.path).parent / config.REPOCARD_FILENAME
         imports = get_imports(self.path)
         local_imports = _download_additional_modules(
             name=self.name,
@@ -940,7 +941,7 @@ def get_module(self) -> DatasetModule:
         if dataset_infos_path.is_file():
             additional_files.append((config.DATASETDICT_INFOS_FILENAME, str(dataset_infos_path)))
         if dataset_readme_path.is_file():
-            additional_files.append(("README.md", dataset_readme_path))
+            additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path))
         # copy the script and the files in an importable directory
         dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
         hash = files_to_hash([self.path] + [loc[1] for loc in local_imports])
@@ -1003,8 +1004,16 @@ def __init__(
         self.download_mode = download_mode

     def get_module(self) -> DatasetModule:
-        readme_path = os.path.join(self.path, "README.md")
+        readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
+        standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
         dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
+        if os.path.exists(standalone_yaml_path):
+            with open(standalone_yaml_path, "r", encoding="utf-8") as f:
+                standalone_yaml_data = yaml.safe_load(f.read())
+                if standalone_yaml_data:
+                    _dataset_card_data_dict = dataset_card_data.to_dict()
+                    _dataset_card_data_dict.update(standalone_yaml_data)
+                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
         metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
         dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
         # we need a set of data files to find which dataset builder to use
@@ -1190,12 +1199,28 @@ def get_module(self) -> DatasetModule:
             download_config.download_desc = "Downloading readme"
         try:
             dataset_readme_path = cached_path(
-                hf_hub_url(self.name, "README.md", revision=revision),
+                hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision),
                 download_config=download_config,
             )
             dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
         except FileNotFoundError:
             dataset_card_data = DatasetCardData()
+        download_config = self.download_config.copy()
+        if download_config.download_desc is None:
+            download_config.download_desc = "Downloading standalone yaml"
+        try:
+            standalone_yaml_path = cached_path(
+                hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision),
+                download_config=download_config,
+            )
+            with open(standalone_yaml_path, "r", encoding="utf-8") as f:
+                standalone_yaml_data = yaml.safe_load(f.read())
+                if standalone_yaml_data:
+                    _dataset_card_data_dict = dataset_card_data.to_dict()
+                    _dataset_card_data_dict.update(standalone_yaml_data)
+                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
+        except FileNotFoundError:
+            pass
         metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
         dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
         # we need a set of data files to find which dataset builder to use
@@ -1411,7 +1436,7 @@ def download_dataset_infos_file(self) -> str:
             return None

     def download_dataset_readme_file(self) -> str:
-        readme_url = hf_hub_url(self.name, "README.md", revision=self.revision)
+        readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
         # Download the dataset infos file if available
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
@@ -1448,7 +1473,7 @@ def get_module(self) -> DatasetModule:
         if dataset_infos_path:
             additional_files.append((config.DATASETDICT_INFOS_FILENAME, dataset_infos_path))
         if dataset_readme_path:
-            additional_files.append(("README.md", dataset_readme_path))
+            additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path))
         # copy the script and the files in an importable directory
         dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
         hash = files_to_hash([local_path] + [loc[1] for loc in local_imports])
diff --git a/tests/test_load.py b/tests/test_load.py
index af12d22a6af..58724c49d53 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -1688,3 +1688,21 @@ def test_reload_old_cache_from_2_15(tmp_path: Path):
             cache_dir / "polinaeterna___audiofolder_two_configs_in_metadata" / "v2" / "0.0.0" / str(builder.hash)
         ).as_posix()
     )  # new cache
+
+
+@pytest.mark.integration
+def test_update_dataset_card_data_with_standalone_yaml():
+    # Labels defined in .huggingface.yaml because they are too long to be in README.md
+    from datasets.utils.metadata import MetadataConfigs
+
+    with patch(
+        "datasets.utils.metadata.MetadataConfigs.from_dataset_card_data",
+        side_effect=MetadataConfigs.from_dataset_card_data,
+    ) as card_data_read_mock:
+        builder = load_dataset_builder("datasets-maintainers/dataset-with-standalone-yaml")
+    assert card_data_read_mock.call_args.args[0]["license"] is not None  # from README.md
+    assert card_data_read_mock.call_args.args[0]["dataset_info"] is not None  # from standalone yaml
+    assert card_data_read_mock.call_args.args[0]["tags"] == ["test"]  # standalone yaml has precedence
+    assert isinstance(
+        builder.info.features["label"], datasets.ClassLabel
+    )  # correctly loaded from long labels list in standalone yaml
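Note (outside the patch itself): the two constants added to src/datasets/config.py are plain strings, so downstream code can reference them exactly as the hunks above do instead of hardcoding the filenames. A quick sanity check, assuming a datasets build that includes this change:

from datasets import config

# Both constants come from the config.py hunk above.
assert config.REPOCARD_FILENAME == "README.md"
assert config.REPOYAML_FILENAME == ".huggingface.yaml"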
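The behavioral change is the standalone .huggingface.yaml support in load.py: if such a file sits next to (or stands in for) README.md, its keys are merged on top of the card's YAML keys. Below is a minimal sketch of that merge, mirroring the load.py hunks above; the temporary directory and both file contents are invented for illustration, and the keys follow the usual dataset card schema:

import os
import tempfile

import yaml
from huggingface_hub import DatasetCard, DatasetCardData

dataset_dir = tempfile.mkdtemp()

# Hypothetical dataset card with a YAML header.
with open(os.path.join(dataset_dir, "README.md"), "w", encoding="utf-8") as f:
    f.write("---\nlicense: mit\ntags:\n- from-readme\n---\n")

# Hypothetical standalone metadata file sitting next to the data files.
with open(os.path.join(dataset_dir, ".huggingface.yaml"), "w", encoding="utf-8") as f:
    f.write("tags:\n- test\n")

readme_path = os.path.join(dataset_dir, "README.md")
standalone_yaml_path = os.path.join(dataset_dir, ".huggingface.yaml")

# Start from the README card data (empty card data if there is no README) ...
dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()

# ... then let the standalone YAML override any overlapping keys, as in the patch.
if os.path.exists(standalone_yaml_path):
    with open(standalone_yaml_path, "r", encoding="utf-8") as f:
        standalone_yaml_data = yaml.safe_load(f.read())
    if standalone_yaml_data:
        _dataset_card_data_dict = dataset_card_data.to_dict()
        _dataset_card_data_dict.update(standalone_yaml_data)
        dataset_card_data = DatasetCardData(**_dataset_card_data_dict)

print(dataset_card_data.to_dict())
# license stays from README.md ("mit"), tags are overridden by .huggingface.yaml (["test"])

The dict.update() call is what gives the standalone file precedence over README.md, which is exactly what the new integration test asserts with tags == ["test"].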
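For local datasets, the same mechanism means the loading configuration can live in .huggingface.yaml alone, with no README.md at all. A hedged end-to-end example; the layout, file names and YAML contents are made up, and the configs block uses the same schema normally placed in the README YAML header:

import os
import tempfile

from datasets import load_dataset

dataset_dir = tempfile.mkdtemp()
os.makedirs(os.path.join(dataset_dir, "annotations"))

# Tiny CSV file acting as the dataset's only data file.
with open(os.path.join(dataset_dir, "annotations", "train.csv"), "w", encoding="utf-8") as f:
    f.write("text,label\nhello,0\nworld,1\n")

# Standalone metadata telling the loader where the data files are.
with open(os.path.join(dataset_dir, ".huggingface.yaml"), "w", encoding="utf-8") as f:
    f.write(
        "configs:\n"
        "- config_name: default\n"
        "  data_files:\n"
        "  - split: train\n"
        "    path: annotations/*.csv\n"
    )

ds = load_dataset(dataset_dir)
print(ds["train"][0])  # expected: {'text': 'hello', 'label': 0}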