Test single commit push_to_hub API
mariosasko committed Sep 29, 2023
1 parent 0cc77d7 commit a8f5116
Showing 4 changed files with 30 additions and 19 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -61,10 +61,10 @@ jobs:
           python -m spacy download fr_core_news_sm
       - name: Install dependencies (latest versions)
         if: ${{ matrix.deps_versions == 'deps-latest' }}
-        run: pip install --upgrade pyarrow huggingface-hub dill
+        run: pip install --upgrade pyarrow dill
      - name: Install dependencies (minimum versions)
        if: ${{ matrix.deps_versions != 'deps-latest' }}
-        run: pip install pyarrow==8.0.0 huggingface-hub==0.14.0 transformers dill==0.3.1.1
+        run: pip install pyarrow==8.0.0 transformers dill==0.3.1.1
      - name: Test with pytest
        run: |
          python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
2 changes: 1 addition & 1 deletion setup.py
@@ -131,7 +131,7 @@
     "aiohttp",
     # To get datasets from the Datasets Hub on huggingface.co
     # minimum 0.14.0 to support HfFileSystem
-    "huggingface-hub>=0.14.0,<1.0.0",
+    "huggingface_hub @ git+https://github.com/huggingface/huggingface_hub.git@preupload-files-before-commit",
     # Utilities from PyPA to e.g., compare versions
     "packaging",
     # To parse YAML metadata from dataset cards
39 changes: 24 additions & 15 deletions src/datasets/arrow_dataset.py
@@ -58,7 +58,14 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.compute as pc
-from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFolder
+from huggingface_hub import (
+    CommitOperationAdd,
+    CommitOperationDelete,
+    DatasetCard,
+    DatasetCardData,
+    HfApi,
+    HfFolder,
+)
 from multiprocess import Pool
 from requests import HTTPError

@@ -5293,6 +5300,7 @@ def path_in_repo(_index, shard):

         uploaded_size = 0
         shards_path_in_repo = []
+        operations = []
         for index, shard in logging.tqdm(
             enumerate(itertools.chain([first_shard], shards_iter)),
             desc="Pushing dataset shards to the dataset hub",
@@ -5305,12 +5313,13 @@ def path_in_repo(_index, shard):
             buffer = BytesIO()
             shard.to_parquet(buffer)
             uploaded_size += buffer.tell()
+            shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer)
+            api.preupload_lfs_files(repo_id, [shard_addition], token=token, repo_type="dataset", revision=branch)
             _retry(
-                api.upload_file,
+                api.preupload_lfs_files,
                 func_kwargs={
-                    "path_or_fileobj": buffer.getvalue(),
-                    "path_in_repo": shard_path_in_repo,
                     "repo_id": repo_id,
+                    "additions": [shard_addition],
                     "token": token,
                     "repo_type": "dataset",
                     "revision": branch,
@@ -5321,6 +5330,7 @@ def path_in_repo(_index, shard):
                 max_retries=5,
                 max_wait_time=20.0,
             )
+            operations.append(shard_addition)
             shards_path_in_repo.append(shard_path_in_repo)

         # Cleanup to remove unused files
@@ -5329,23 +5339,22 @@ def path_in_repo(_index, shard):
             for data_file in data_files
             if data_file.startswith(f"{data_dir}/{split}-") and data_file not in shards_path_in_repo
         ]
+        for data_file in data_files_to_delete:
+            operations.append(CommitOperationDelete(path_in_repo=data_file))
         download_config = DownloadConfig(token=token)
         deleted_size = sum(
             xgetsize(hf_hub_url(repo_id, data_file, revision=branch), download_config=download_config)
             for data_file in data_files_to_delete
         )

-        def delete_file(file):
-            api.delete_file(file, repo_id=repo_id, token=token, repo_type="dataset", revision=branch)
-
-        if len(data_files_to_delete):
-            for data_file in logging.tqdm(
-                data_files_to_delete,
-                desc="Deleting unused files from dataset repository",
-                total=len(data_files_to_delete),
-                disable=not logging.is_progress_bar_enabled(),
-            ):
-                delete_file(data_file)
+        api.create_commit(
+            repo_id,
+            operations=operations,
+            token=token,
+            repo_type="dataset",
+            revision=branch,
+            commit_message="Upload data files",
+        )

         repo_files = list(set(files) - set(data_files_to_delete))

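The arrow_dataset.py change is the heart of the commit: instead of one upload_file call per shard and one delete_file call per stale file, each creating its own commit, shards are pre-uploaded to the LFS store and all additions and deletions are folded into a single create_commit. A minimal standalone sketch of that pattern, assuming an existing dataset repo and a cached login; the repo name and file paths are hypothetical:

from io import BytesIO

from huggingface_hub import CommitOperationAdd, CommitOperationDelete, HfApi

api = HfApi()
repo_id = "username/my-dataset"  # hypothetical; the repo must already exist

operations = []
for index in range(3):
    # Stand-in for a Parquet shard serialized into memory (contents are fake).
    buffer = BytesIO(f"shard {index}".encode())
    addition = CommitOperationAdd(path_in_repo=f"data/train-{index:05d}.parquet", path_or_fileobj=buffer)
    # Upload the bytes to the LFS store now; no commit is created at this point.
    api.preupload_lfs_files(repo_id, [addition], repo_type="dataset")
    operations.append(addition)

# Stale files are queued as deletions in the same operation list.
operations.append(CommitOperationDelete(path_in_repo="data/train-stale.parquet"))

# A single commit applies every addition and deletion at once.
api.create_commit(
    repo_id,
    operations=operations,
    commit_message="Upload data files",
    repo_type="dataset",
)

Because preupload_lfs_files marks each CommitOperationAdd as already uploaded, the final create_commit only records the commit itself, so pushing N shards produces one commit instead of N+1.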
4 changes: 3 additions & 1 deletion tests/test_upstream_hub.py
@@ -453,7 +453,9 @@ def test_push_dataset_to_hub_custom_splits(self, temporary_repo):
     def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo):
         ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
         with temporary_repo() as ds_name:
-            with patch("datasets.arrow_dataset.HfApi.upload_file", side_effect=self._api.upload_file) as mock_hf_api:
+            with patch(
+                "datasets.arrow_dataset.HfApi.preupload_lfs_files", side_effect=self._api.preupload_lfs_files
+            ) as mock_hf_api:
                 # Initial push
                 ds.push_to_hub(ds_name, token=self._token, max_shard_size="1KB")
                 call_count_old = mock_hf_api.call_count
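The test tweak re-points the spy: skipping of identical files is now asserted by counting preupload_lfs_files calls rather than upload_file calls. The patch(..., side_effect=...) idiom it relies on wraps the real method so calls still go through while being counted; a tiny self-contained illustration of that idiom (the Client class here is hypothetical, not part of the codebase):

from unittest.mock import patch

class Client:
    def upload(self, name):
        return f"uploaded {name}"

client = Client()

# Replace Client.upload with a mock that delegates to the real bound method,
# so behavior is unchanged but every call is recorded.
with patch.object(Client, "upload", side_effect=client.upload) as mock_upload:
    client.upload("shard-0")
    client.upload("shard-1")

assert mock_upload.call_count == 2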
