From d64e624ab3acf0ed49fd7a964a6b269020349521 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 27 Sep 2023 15:45:56 +0200 Subject: [PATCH 1/3] Remove `apache_beam` import in save_info in `BeamBasedBuilder` --- src/datasets/builder.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 419d84bd895..2e4b3a3a7e8 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -2099,14 +2099,17 @@ def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_ self._rename(src_fpath, dst_fpath) def _save_info(self): - import apache_beam as beam - - fs = beam.io.filesystems.FileSystems - path_join = os.path.join if not is_remote_filesystem(self._fs) else posixpath.join - with fs.create(path_join(self._output_dir, config.DATASET_INFO_FILENAME)) as f: + download_config = ( + self.dl_manager.download_config + if self.dl_manager + else DownloadConfig(token=self.token, storage_options=self._fs.storage_options) + ) + with xopen(f"{self._output_dir}/{config.DATASET_ARROW_FILENAME}", "wb", download_config=download_config) as f: self.info._dump_info(f) if self.info.license: - with fs.create(path_join(self._output_dir, config.LICENSE_FILENAME)) as f: + with xopen( + f"{self._output_dir}/{config.DATASET_ARROW_FILENAME}", "wb", download_config=download_config + ) as f: self.info._dump_license(f) def _prepare_split( @@ -2176,8 +2179,13 @@ def _generate_examples_from_hf_gcs(self, split: SplitInfo): else: remote_prepared_urls = [f"{self._remote_cache_dir_from_hf_gcs}/{self.name}-{split.name}.arrow"] key = 0 + download_config = ( + self.dl_manager.download_config + if self.dl_manager + else DownloadConfig(token=self.token, storage_options=self._fs.storage_options) + ) for remote_prepared_url in remote_prepared_urls: - with xopen(remote_prepared_url, "rb") as f: + with xopen(remote_prepared_url, "rb", download_config=download_config) as f: with pa.ipc.open_stream(f) as reader: for record_batch in reader: for record in record_batch.to_pylist(): @@ -2189,7 +2197,12 @@ def _request_info_from_hf_gcs(self): remote_dataset_info = f"{self._remote_cache_dir_from_hf_gcs}/{config.DATASET_INFO_FILENAME}" try: - with xopen(remote_dataset_info) as f: + download_config = download_config = ( + self.dl_manager.download_config + if self.dl_manager + else DownloadConfig(token=self.token, storage_options=self._fs.storage_options) + ) + with xopen(remote_dataset_info, download_config=download_config) as f: import json _info = json.load(f) From ddd9ba4170cef4bc040d7b8a1d862cc1e192bc7b Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 27 Sep 2023 17:06:50 +0200 Subject: [PATCH 2/3] Oops :) --- src/datasets/builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 2e4b3a3a7e8..1bb51bcf5fb 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -2104,11 +2104,11 @@ def _save_info(self): if self.dl_manager else DownloadConfig(token=self.token, storage_options=self._fs.storage_options) ) - with xopen(f"{self._output_dir}/{config.DATASET_ARROW_FILENAME}", "wb", download_config=download_config) as f: + with xopen(f"{self._output_dir}/{config.DATASET_INFO_FILENAME}", "wb", download_config=download_config) as f: self.info._dump_info(f) if self.info.license: with xopen( - f"{self._output_dir}/{config.DATASET_ARROW_FILENAME}", "wb", download_config=download_config + f"{self._output_dir}/{config.LICENSE_FILENAME}", "wb", download_config=download_config ) as f: self.info._dump_license(f) From 46a0506765d0f92916ed5c37eb19e5fa1a77736a Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 27 Sep 2023 17:14:18 +0200 Subject: [PATCH 3/3] Style --- src/datasets/builder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 1bb51bcf5fb..c30acc96dc4 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -2107,9 +2107,7 @@ def _save_info(self): with xopen(f"{self._output_dir}/{config.DATASET_INFO_FILENAME}", "wb", download_config=download_config) as f: self.info._dump_info(f) if self.info.license: - with xopen( - f"{self._output_dir}/{config.LICENSE_FILENAME}", "wb", download_config=download_config - ) as f: + with xopen(f"{self._output_dir}/{config.LICENSE_FILENAME}", "wb", download_config=download_config) as f: self.info._dump_license(f) def _prepare_split(