Support standalone yaml (#6557)
* support standalone yaml

* add test

* fix file name

* move to config.py
lhoestq authored Jan 11, 2024
1 parent 4a5b7d9 commit 9d6d161
Showing 7 changed files with 72 additions and 17 deletions.
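In short: a dataset repository can now keep its YAML metadata (configs, dataset_info, tags, ...) in a standalone .huggingface.yaml file instead of, or in addition to, the README.md front matter, and top-level keys from the standalone file take precedence when both are present. A minimal sketch of the intended usage on a hypothetical local dataset (directory name, files and values are illustrative only, not part of this commit):

from pathlib import Path

from datasets import load_dataset_builder

root = Path("my_dataset")  # hypothetical directory
root.mkdir(exist_ok=True)
(root / "train.csv").write_text("text,label\nhello,positive\nworld,negative\n")

# The card can now be prose-only ...
(root / "README.md").write_text("# My dataset\n\nDescribed in prose, no YAML front matter.\n")

# ... while the YAML metadata lives in a standalone file next to it.
(root / ".huggingface.yaml").write_text(
    "configs:\n"
    "- config_name: default\n"
    "  data_files:\n"
    "  - split: train\n"
    "    path: train.csv\n"
)

builder = load_dataset_builder(str(root))
print(builder.config.data_files)  # the config defined in .huggingface.yaml is picked up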
10 changes: 7 additions & 3 deletions src/datasets/arrow_dataset.py
@@ -5390,7 +5390,7 @@ def push_to_hub(
repo_splits = [] # use a list to keep the order of the splits
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
- if repo_file.rfilename == "README.md":
+ if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
repo_with_dataset_infos = True
@@ -5421,7 +5421,9 @@ def push_to_hub(
)
# get the info from the README to update them
if repo_with_dataset_card:
- dataset_card_path = api.hf_hub_download(repo_id, "README.md", repo_type="dataset", revision=revision)
+ dataset_card_path = api.hf_hub_download(
+ repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision
+ )
dataset_card = DatasetCard.load(Path(dataset_card_path))
dataset_card_data = dataset_card.data
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
@@ -5523,7 +5525,9 @@ def push_to_hub(
DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data)
dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
- additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()))
+ additions.append(
+ CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+ )

commit_message = commit_message if commit_message is not None else "Upload dataset"
if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
6 changes: 4 additions & 2 deletions src/datasets/commands/test.py
@@ -162,7 +162,9 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:
# Let's move it to the original directory of the dataset script, to allow the user to
# upload them on S3 at the same time afterwards.
if self._save_infos:
- dataset_readme_path = os.path.join(builder_cls.get_imported_module_dir(), "README.md")
+ dataset_readme_path = os.path.join(
+ builder_cls.get_imported_module_dir(), datasets.config.REPOCARD_FILENAME
+ )
name = Path(path).name + ".py"
combined_path = os.path.join(path, name)
if os.path.isfile(path):
@@ -177,7 +179,7 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:

# Move dataset_info back to the user
if dataset_dir is not None:
- user_dataset_readme_path = os.path.join(dataset_dir, "README.md")
+ user_dataset_readme_path = os.path.join(dataset_dir, datasets.config.REPOCARD_FILENAME)
copyfile(dataset_readme_path, user_dataset_readme_path)
print(f"Dataset card saved at {user_dataset_readme_path}")

2 changes: 2 additions & 0 deletions src/datasets/config.py
@@ -230,6 +230,8 @@
METRIC_INFO_FILENAME = "metric_info.json"
DATASETDICT_JSON_FILENAME = "dataset_dict.json"
METADATA_CONFIGS_FIELD = "configs"
+ REPOCARD_FILENAME = "README.md"
+ REPOYAML_FILENAME = ".huggingface.yaml"

MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

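Both filenames are now defined once in datasets.config, so call sites reference constants instead of hard-coded strings. A quick illustrative check of the new values (the constants are real, the snippet is only a sketch):

import datasets

print(datasets.config.REPOCARD_FILENAME)  # "README.md"
print(datasets.config.REPOYAML_FILENAME)  # ".huggingface.yaml"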
10 changes: 7 additions & 3 deletions src/datasets/dataset_dict.py
@@ -1729,7 +1729,7 @@ def push_to_hub(
deletions = []
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
- if repo_file.rfilename == "README.md":
+ if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
repo_with_dataset_infos = True
@@ -1750,7 +1750,9 @@ def push_to_hub(

# get the info from the README to update them
if repo_with_dataset_card:
- dataset_card_path = api.hf_hub_download(repo_id, "README.md", repo_type="dataset", revision=revision)
+ dataset_card_path = api.hf_hub_download(
+ repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision
+ )
dataset_card = DatasetCard.load(Path(dataset_card_path))
dataset_card_data = dataset_card.data
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
@@ -1800,7 +1802,9 @@ def push_to_hub(
DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data)
dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
- additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()))
+ additions.append(
+ CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+ )

commit_message = commit_message if commit_message is not None else "Upload dataset"
if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
6 changes: 3 additions & 3 deletions src/datasets/info.py
@@ -397,7 +397,7 @@ class DatasetInfosDict(Dict[str, DatasetInfo]):
def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
total_dataset_infos = {}
dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
- dataset_readme_path = os.path.join(dataset_infos_dir, "README.md")
+ dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
if not overwrite:
total_dataset_infos = self.from_directory(dataset_infos_dir)
total_dataset_infos.update(self)
@@ -426,8 +426,8 @@ def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
# Load the info from the YAML part of README.md
- if os.path.exists(os.path.join(dataset_infos_dir, "README.md")):
- dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / "README.md").data
+ if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
+ dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
if "dataset_info" in dataset_card_data:
return cls.from_dataset_card_data(dataset_card_data)
if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
37 changes: 31 additions & 6 deletions src/datasets/load.py
@@ -32,6 +32,7 @@

import fsspec
import requests
+ import yaml
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem

from . import config
@@ -928,7 +929,7 @@ def get_module(self) -> DatasetModule:
)
# get script and other files
dataset_infos_path = Path(self.path).parent / config.DATASETDICT_INFOS_FILENAME
- dataset_readme_path = Path(self.path).parent / "README.md"
+ dataset_readme_path = Path(self.path).parent / config.REPOCARD_FILENAME
imports = get_imports(self.path)
local_imports = _download_additional_modules(
name=self.name,
@@ -940,7 +941,7 @@ def get_module(self) -> DatasetModule:
if dataset_infos_path.is_file():
additional_files.append((config.DATASETDICT_INFOS_FILENAME, str(dataset_infos_path)))
if dataset_readme_path.is_file():
- additional_files.append(("README.md", dataset_readme_path))
+ additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path))
# copy the script and the files in an importable directory
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
hash = files_to_hash([self.path] + [loc[1] for loc in local_imports])
@@ -1003,8 +1004,16 @@ def __init__(
self.download_mode = download_mode

def get_module(self) -> DatasetModule:
- readme_path = os.path.join(self.path, "README.md")
+ readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
+ standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
+ if os.path.exists(standalone_yaml_path):
+ with open(standalone_yaml_path, "r", encoding="utf-8") as f:
+ standalone_yaml_data = yaml.safe_load(f.read())
+ if standalone_yaml_data:
+ _dataset_card_data_dict = dataset_card_data.to_dict()
+ _dataset_card_data_dict.update(standalone_yaml_data)
+ dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
# we need a set of data files to find which dataset builder to use
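In this local, script-less code path, get_module now overlays the standalone YAML on top of the README front matter: the README is read first, then top-level keys from .huggingface.yaml replace keys of the same name (a plain dict update, not a deep merge). A hypothetical helper, not part of the library, restating that inline logic:

import os

import yaml
from huggingface_hub import DatasetCard, DatasetCardData


def load_card_data(dataset_dir: str) -> DatasetCardData:
    # Read the README.md front matter if the card exists.
    readme_path = os.path.join(dataset_dir, "README.md")
    card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
    # Overlay the standalone YAML: its top-level keys win over the README's.
    yaml_path = os.path.join(dataset_dir, ".huggingface.yaml")
    if os.path.isfile(yaml_path):
        with open(yaml_path, encoding="utf-8") as f:
            standalone = yaml.safe_load(f) or {}
        merged = card_data.to_dict()
        merged.update(standalone)
        card_data = DatasetCardData(**merged)
    return card_data

Keys that exist only in the README (for example license) are preserved, which is exactly what the new test at the bottom of this commit asserts.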
@@ -1190,12 +1199,28 @@ def get_module(self) -> DatasetModule:
download_config.download_desc = "Downloading readme"
try:
dataset_readme_path = cached_path(
- hf_hub_url(self.name, "README.md", revision=revision),
+ hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision),
download_config=download_config,
)
dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
except FileNotFoundError:
dataset_card_data = DatasetCardData()
+ download_config = self.download_config.copy()
+ if download_config.download_desc is None:
+ download_config.download_desc = "Downloading standalone yaml"
+ try:
+ standalone_yaml_path = cached_path(
+ hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision),
+ download_config=download_config,
+ )
+ with open(standalone_yaml_path, "r", encoding="utf-8") as f:
+ standalone_yaml_data = yaml.safe_load(f.read())
+ if standalone_yaml_data:
+ _dataset_card_data_dict = dataset_card_data.to_dict()
+ _dataset_card_data_dict.update(standalone_yaml_data)
+ dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
+ except FileNotFoundError:
+ pass
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
# we need a set of data files to find which dataset builder to use
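The Hub code path mirrors the local one: .huggingface.yaml is fetched from the repo next to README.md when it exists, and a missing file is silently ignored (the FileNotFoundError is swallowed). Nothing changes in the public API; loading the fixture repository used by the new integration test below picks configs and dataset_info up from the standalone file. A short illustrative usage, assuming network access to the Hub:

from datasets import load_dataset_builder

# Fixture repo from the new test; any Hub dataset that ships a .huggingface.yaml
# next to its README.md is handled the same way.
builder = load_dataset_builder("datasets-maintainers/dataset-with-standalone-yaml")
print(builder.info.features["label"])  # ClassLabel defined in the standalone yaml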
@@ -1411,7 +1436,7 @@ def download_dataset_readme_file(self) -> str:
return None

def download_dataset_readme_file(self) -> str:
- readme_url = hf_hub_url(self.name, "README.md", revision=self.revision)
+ readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
# Download the dataset infos file if available
download_config = self.download_config.copy()
if download_config.download_desc is None:
@@ -1448,7 +1473,7 @@ def get_module(self) -> DatasetModule:
if dataset_infos_path:
additional_files.append((config.DATASETDICT_INFOS_FILENAME, dataset_infos_path))
if dataset_readme_path:
- additional_files.append(("README.md", dataset_readme_path))
+ additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path))
# copy the script and the files in an importable directory
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
hash = files_to_hash([local_path] + [loc[1] for loc in local_imports])
18 changes: 18 additions & 0 deletions tests/test_load.py
@@ -1688,3 +1688,21 @@ def test_reload_old_cache_from_2_15(tmp_path: Path):
cache_dir / "polinaeterna___audiofolder_two_configs_in_metadata" / "v2" / "0.0.0" / str(builder.hash)
).as_posix()
) # new cache


+ @pytest.mark.integration
+ def test_update_dataset_card_data_with_standalone_yaml():
+ # Labels defined in .huggingface.yaml because they are too long to be in README.md
+ from datasets.utils.metadata import MetadataConfigs
+
+ with patch(
+ "datasets.utils.metadata.MetadataConfigs.from_dataset_card_data",
+ side_effect=MetadataConfigs.from_dataset_card_data,
+ ) as card_data_read_mock:
+ builder = load_dataset_builder("datasets-maintainers/dataset-with-standalone-yaml")
+ assert card_data_read_mock.call_args.args[0]["license"] is not None  # from README.md
+ assert card_data_read_mock.call_args.args[0]["dataset_info"] is not None  # from standalone yaml
+ assert card_data_read_mock.call_args.args[0]["tags"] == ["test"]  # standalone yaml has precedence
+ assert isinstance(
+ builder.info.features["label"], datasets.ClassLabel
+ )  # correctly loaded from long labels list in standalone yaml
