From 3fba2458ad20b2fce02bb9b92d05900dae48a75a Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 7 Aug 2024 15:47:18 +0100 Subject: [PATCH 01/18] Take mode argument into account when saving dataset Signed-off-by: Merel Theisen --- kedro-datasets/kedro_datasets/pandas/csv_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 2c43e13c6..d0009c6f0 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -69,7 +69,7 @@ class CSVDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False, "mode": "wb"} def __init__( # noqa: PLR0913 self, @@ -175,7 +175,7 @@ def _save(self, data: pd.DataFrame) -> None: buf = BytesIO() data.to_csv(path_or_buf=buf, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: + with self._fs.open(save_path, mode=self._save_args.get("mode")) as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() From ed881fe7759e76647a573b891fa6e7ba733fd2ed Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 7 Aug 2024 17:00:53 +0100 Subject: [PATCH 02/18] Fix test Signed-off-by: Merel Theisen --- kedro-datasets/tests/pandas/test_csv_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 0954b4b9b..474b407c3 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -243,8 +243,8 @@ def test_version_str_repr(self, load_version, save_version): assert "protocol" in str(ds_versioned) assert "protocol" in str(ds) # Default save_args - assert "save_args={'index': False}" in str(ds) - assert "save_args={'index': False}" in str(ds_versioned) + assert "save_args={'index': False, 'mode': wb}" in str(ds) + assert "save_args={'index': False, 'mode': wb}" in str(ds_versioned) def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for From 28d173fb4d55e669296114b8ff66cc0e4433692b Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 13 Aug 2024 15:41:05 +0100 Subject: [PATCH 03/18] Move mode to default save args Signed-off-by: Merel Theisen --- kedro-datasets/kedro_datasets/matlab/matlab_dataset.py | 8 ++++++-- kedro-datasets/kedro_datasets/pandas/csv_dataset.py | 8 +++++--- kedro-datasets/kedro_datasets/pandas/excel_dataset.py | 8 +++++--- .../kedro_datasets/pandas/feather_dataset.py | 8 +++++--- kedro-datasets/kedro_datasets/pandas/json_dataset.py | 8 +++++--- .../kedro_datasets/pandas/parquet_dataset.py | 10 +++++++--- kedro-datasets/kedro_datasets/pandas/xml_dataset.py | 8 +++++--- kedro-datasets/kedro_datasets/polars/csv_dataset.py | 8 +++++--- .../kedro_datasets/polars/lazy_polars_dataset.py | 9 ++++++--- 9 files changed, 49 insertions(+), 26 deletions(-) diff --git a/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py b/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py index a74d74209..6e1f1b16d 100644 --- a/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py +++ b/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py @@ -52,7 +52,7 @@ class MatlabDataset(AbstractVersionedDataset[np.ndarray, np.ndarray]): """ - DEFAULT_SAVE_ARGS: dict[str, Any] = {"indent": 2} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"indent": 2, "mode": "wb"} def __init__( # noqa = PLR0913 self, @@ -71,6 +71,8 @@ def __init__( # noqa = PLR0913 The prefix should be any protocol supported by ``fsspec``. Note: `http(s)` doesn't support versioning. save_args: .mat options for saving .mat files. + All defaults are preserved, apart from "indent", which is set to 2 and "mode", which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -134,7 +136,9 @@ def _load(self) -> np.ndarray: def _save(self, data: np.ndarray) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - with self._fs.open(save_path, mode="wb") as f: + save_mode = self._save_args.get("mode") + + with self._fs.open(save_path, mode=save_mode) as f: io.savemat(f, {"data": data}) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index d0009c6f0..241f266d8 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -97,7 +97,8 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving CSV files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html - All defaults are preserved, but "index", which is set to False. + Defaults are preserved, apart from "index", which is set to False and "mode" which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -106,7 +107,7 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). - metadata: Any Any arbitrary metadata. + metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} @@ -175,7 +176,8 @@ def _save(self, data: pd.DataFrame) -> None: buf = BytesIO() data.to_csv(path_or_buf=buf, **self._save_args) - with self._fs.open(save_path, mode=self._save_args.get("mode")) as fs_file: + save_mode = self._save_args.get("mode") + with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 601ef377e..7be41a8ed 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -108,7 +108,7 @@ class ExcelDataset( """ DEFAULT_LOAD_ARGS = {"engine": "openpyxl"} - DEFAULT_SAVE_ARGS = {"index": False} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False, "mode": "wb"} def __init__( # noqa: PLR0913 self, @@ -140,7 +140,8 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving Excel files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html - All defaults are preserved, but "index", which is set to False. + Defaults are preserved, apart from "index", which is set to False and "mode" which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. If you would like to specify options for the `ExcelWriter`, you can include them under the "writer" key. Here you can find all available arguments: @@ -232,6 +233,7 @@ def _load(self) -> pd.DataFrame | dict[str, pd.DataFrame]: def _save(self, data: pd.DataFrame | dict[str, pd.DataFrame]) -> None: output = BytesIO() save_path = get_filepath_str(self._get_save_path(), self._protocol) + save_mode = self._save_args.get("mode") with pd.ExcelWriter(output, **self._writer_args) as writer: if isinstance(data, dict): @@ -242,7 +244,7 @@ def _save(self, data: pd.DataFrame | dict[str, pd.DataFrame]) -> None: else: data.to_excel(writer, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: + with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(output.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index eb1f115f0..92040008e 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -68,7 +68,7 @@ class FeatherDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "wb"} def __init__( # noqa: PLR0913 self, @@ -96,7 +96,8 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving feather files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_feather.html - All defaults are preserved. + All defaults are preserved, apart from "mode", which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -169,11 +170,12 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) + save_mode = self._save_args.get("mode") buf = BytesIO() data.to_feather(buf, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: + with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 578c494ce..e107229b1 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -65,7 +65,7 @@ class JSONDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "wb"} def __init__( # noqa: PLR0913 self, @@ -93,7 +93,8 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving JSON files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html - All defaults are preserved, but "index", which is set to False. + All defaults are preserved, apart from "mode", which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -166,11 +167,12 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) + save_mode = self._save_args.get("mode") buf = BytesIO() data.to_json(path_or_buf=buf, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: + with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index 760d5a8f3..32acd6130 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -76,7 +76,7 @@ class ParquetDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "wb"} def __init__( # noqa: PLR0913 self, @@ -108,7 +108,9 @@ def __init__( # noqa: PLR0913 save_args: Additional saving options for saving Parquet file(s). Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html - All defaults are preserved. ``partition_cols`` is not supported. + All defaults are preserved, apart from "mode", which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + ``partition_cols`` is not supported. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` attribute is None, save version will be autogenerated. @@ -196,7 +198,9 @@ def _save(self, data: pd.DataFrame) -> None: bytes_buffer = BytesIO() data.to_parquet(bytes_buffer, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: + save_mode = self._save_args.get("mode") + + with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(bytes_buffer.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index b1173f43e..e4a225a69 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -46,7 +46,7 @@ class XMLDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False, "mode": "wb"} def __init__( # noqa: PLR0913 self, @@ -74,7 +74,8 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving XML files. Here you can find all available arguments: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_xml.html - All defaults are preserved, but "index", which is set to False. + Defaults are preserved, apart from "index", which is set to False and "mode" which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -148,11 +149,12 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) + save_mode = self._save_args.get("mode") buf = BytesIO() data.to_xml(path_or_buffer=buf, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: + with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 1195ce295..fabc1330f 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -66,7 +66,7 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {"rechunk": True} - DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "wb"} def __init__( # noqa: PLR0913 self, @@ -97,7 +97,8 @@ def __init__( # noqa: PLR0913 save_args: Polars options for saving CSV files. Here you can find all available arguments: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_csv.html - All defaults are preserved. + All defaults are preserved, apart from "mode", which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -175,7 +176,8 @@ def _save(self, data: pl.DataFrame) -> None: buf = BytesIO() data.write_csv(file=buf, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: + save_mode = self._save_args.get("mode") + with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py b/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py index 2e650e52e..3f0e100d5 100644 --- a/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py @@ -73,7 +73,7 @@ class LazyPolarsDataset(AbstractVersionedDataset[pl.LazyFrame, PolarsFrame]): """ DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {} - DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {} + DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {"mode": "wb"} def __init__( # noqa: PLR0913 self, @@ -113,7 +113,8 @@ def __init__( # noqa: PLR0913 save_args: Polars options for saving files. Here you can find all available arguments: https://pola-rs.github.io/polars/py-polars/html/reference/io.html - All defaults are preserved. + All defaults are preserved, apart from "mode", which is set to "wb". + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -220,7 +221,9 @@ def _save(self, data: pl.DataFrame | pl.LazyFrame) -> None: if save_method: buf = BytesIO() save_method(file=buf, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: + save_mode = self._save_args.get("mode") + + with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() # How the LazyPolarsDataset logic is currently written with From e71b81017f0e01c10cf0c2105ea553741b0a2791 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 14 Aug 2024 14:37:53 +0100 Subject: [PATCH 04/18] Revert changes to add mode to save args and add as fs arg default instead Signed-off-by: Merel Theisen --- .../kedro_datasets/matlab/matlab_dataset.py | 8 ++----- .../kedro_datasets/pandas/csv_dataset.py | 23 ++++++++++++------- .../kedro_datasets/pandas/excel_dataset.py | 8 +++---- .../kedro_datasets/pandas/feather_dataset.py | 8 +++---- .../kedro_datasets/pandas/json_dataset.py | 22 ++++++++++++------ .../kedro_datasets/pandas/parquet_dataset.py | 10 +++----- .../kedro_datasets/pandas/xml_dataset.py | 8 +++---- .../kedro_datasets/polars/csv_dataset.py | 8 +++---- .../polars/lazy_polars_dataset.py | 9 +++----- .../tests/pandas/test_csv_dataset.py | 6 ++--- 10 files changed, 53 insertions(+), 57 deletions(-) diff --git a/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py b/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py index 6e1f1b16d..a74d74209 100644 --- a/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py +++ b/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py @@ -52,7 +52,7 @@ class MatlabDataset(AbstractVersionedDataset[np.ndarray, np.ndarray]): """ - DEFAULT_SAVE_ARGS: dict[str, Any] = {"indent": 2, "mode": "wb"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"indent": 2} def __init__( # noqa = PLR0913 self, @@ -71,8 +71,6 @@ def __init__( # noqa = PLR0913 The prefix should be any protocol supported by ``fsspec``. Note: `http(s)` doesn't support versioning. save_args: .mat options for saving .mat files. - All defaults are preserved, apart from "indent", which is set to 2 and "mode", which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -136,9 +134,7 @@ def _load(self) -> np.ndarray: def _save(self, data: np.ndarray) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - save_mode = self._save_args.get("mode") - - with self._fs.open(save_path, mode=save_mode) as f: + with self._fs.open(save_path, mode="wb") as f: io.savemat(f, {"data": data}) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 241f266d8..cbc23618b 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -69,7 +69,8 @@ class CSVDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False, "mode": "wb"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -97,8 +98,7 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving CSV files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html - Defaults are preserved, apart from "index", which is set to False and "mode" which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + Defaults are preserved, apart from "index", which is set to False. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -107,20 +107,24 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `wb`. + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} + self._fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + self._fs_args.update(fs_args) + _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": - _fs_args.setdefault("auto_mkdir", True) + self._fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._storage_options = {**_credentials, **_fs_args} + self._storage_options = {**_credentials, **self._fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) - self.metadata = metadata super().__init__( @@ -176,7 +180,10 @@ def _save(self, data: pd.DataFrame) -> None: buf = BytesIO() data.to_csv(path_or_buf=buf, **self._save_args) - save_mode = self._save_args.get("mode") + fs_open_args_save = self._fs_args.get("open_args_save", {}) + save_mode = fs_open_args_save.get( + "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] + ) with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(buf.getvalue()) diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 7be41a8ed..601ef377e 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -108,7 +108,7 @@ class ExcelDataset( """ DEFAULT_LOAD_ARGS = {"engine": "openpyxl"} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False, "mode": "wb"} + DEFAULT_SAVE_ARGS = {"index": False} def __init__( # noqa: PLR0913 self, @@ -140,8 +140,7 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving Excel files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html - Defaults are preserved, apart from "index", which is set to False and "mode" which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + All defaults are preserved, but "index", which is set to False. If you would like to specify options for the `ExcelWriter`, you can include them under the "writer" key. Here you can find all available arguments: @@ -233,7 +232,6 @@ def _load(self) -> pd.DataFrame | dict[str, pd.DataFrame]: def _save(self, data: pd.DataFrame | dict[str, pd.DataFrame]) -> None: output = BytesIO() save_path = get_filepath_str(self._get_save_path(), self._protocol) - save_mode = self._save_args.get("mode") with pd.ExcelWriter(output, **self._writer_args) as writer: if isinstance(data, dict): @@ -244,7 +242,7 @@ def _save(self, data: pd.DataFrame | dict[str, pd.DataFrame]) -> None: else: data.to_excel(writer, **self._save_args) - with self._fs.open(save_path, mode=save_mode) as fs_file: + with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(output.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index 92040008e..eb1f115f0 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -68,7 +68,7 @@ class FeatherDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "wb"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {} def __init__( # noqa: PLR0913 self, @@ -96,8 +96,7 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving feather files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_feather.html - All defaults are preserved, apart from "mode", which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -170,12 +169,11 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - save_mode = self._save_args.get("mode") buf = BytesIO() data.to_feather(buf, **self._save_args) - with self._fs.open(save_path, mode=save_mode) as fs_file: + with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index e107229b1..13b0e6908 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -65,7 +65,8 @@ class JSONDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "wb"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -93,8 +94,7 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving JSON files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html - All defaults are preserved, apart from "mode", which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -103,17 +103,22 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{'token': None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `wb`. + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} + self._fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + self._fs_args.update(fs_args) + _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": - _fs_args.setdefault("auto_mkdir", True) + self._fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._storage_options = {**_credentials, **_fs_args} + self._storage_options = {**_credentials, **self._fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) self.metadata = metadata @@ -167,11 +172,14 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - save_mode = self._save_args.get("mode") buf = BytesIO() data.to_json(path_or_buf=buf, **self._save_args) + fs_open_args_save = self._fs_args.get("open_args_save", {}) + save_mode = fs_open_args_save.get( + "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] + ) with self._fs.open(save_path, mode=save_mode) as fs_file: fs_file.write(buf.getvalue()) diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index 32acd6130..760d5a8f3 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -76,7 +76,7 @@ class ParquetDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "wb"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {} def __init__( # noqa: PLR0913 self, @@ -108,9 +108,7 @@ def __init__( # noqa: PLR0913 save_args: Additional saving options for saving Parquet file(s). Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html - All defaults are preserved, apart from "mode", which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. - ``partition_cols`` is not supported. + All defaults are preserved. ``partition_cols`` is not supported. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` attribute is None, save version will be autogenerated. @@ -198,9 +196,7 @@ def _save(self, data: pd.DataFrame) -> None: bytes_buffer = BytesIO() data.to_parquet(bytes_buffer, **self._save_args) - save_mode = self._save_args.get("mode") - - with self._fs.open(save_path, mode=save_mode) as fs_file: + with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(bytes_buffer.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index e4a225a69..b1173f43e 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -46,7 +46,7 @@ class XMLDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False, "mode": "wb"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False} def __init__( # noqa: PLR0913 self, @@ -74,8 +74,7 @@ def __init__( # noqa: PLR0913 save_args: Pandas options for saving XML files. Here you can find all available arguments: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_xml.html - Defaults are preserved, apart from "index", which is set to False and "mode" which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + All defaults are preserved, but "index", which is set to False. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -149,12 +148,11 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - save_mode = self._save_args.get("mode") buf = BytesIO() data.to_xml(path_or_buffer=buf, **self._save_args) - with self._fs.open(save_path, mode=save_mode) as fs_file: + with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index fabc1330f..1195ce295 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -66,7 +66,7 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {"rechunk": True} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "wb"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {} def __init__( # noqa: PLR0913 self, @@ -97,8 +97,7 @@ def __init__( # noqa: PLR0913 save_args: Polars options for saving CSV files. Here you can find all available arguments: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_csv.html - All defaults are preserved, apart from "mode", which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -176,8 +175,7 @@ def _save(self, data: pl.DataFrame) -> None: buf = BytesIO() data.write_csv(file=buf, **self._save_args) - save_mode = self._save_args.get("mode") - with self._fs.open(save_path, mode=save_mode) as fs_file: + with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py b/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py index 3f0e100d5..2e650e52e 100644 --- a/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py @@ -73,7 +73,7 @@ class LazyPolarsDataset(AbstractVersionedDataset[pl.LazyFrame, PolarsFrame]): """ DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {} - DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {"mode": "wb"} + DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {} def __init__( # noqa: PLR0913 self, @@ -113,8 +113,7 @@ def __init__( # noqa: PLR0913 save_args: Polars options for saving files. Here you can find all available arguments: https://pola-rs.github.io/polars/py-polars/html/reference/io.html - All defaults are preserved, apart from "mode", which is set to "wb". - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -221,9 +220,7 @@ def _save(self, data: pl.DataFrame | pl.LazyFrame) -> None: if save_method: buf = BytesIO() save_method(file=buf, **self._save_args) - save_mode = self._save_args.get("mode") - - with self._fs.open(save_path, mode=save_mode) as fs_file: + with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() # How the LazyPolarsDataset logic is currently written with diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 474b407c3..b3aac46cb 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -243,8 +243,8 @@ def test_version_str_repr(self, load_version, save_version): assert "protocol" in str(ds_versioned) assert "protocol" in str(ds) # Default save_args - assert "save_args={'index': False, 'mode': wb}" in str(ds) - assert "save_args={'index': False, 'mode': wb}" in str(ds_versioned) + assert "save_args={'index': False}" in str(ds) + assert "save_args={'index': False}" in str(ds_versioned) def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for @@ -408,7 +408,7 @@ def test_load_and_confirm(self, mocker, mocked_csv_in_s3, mocked_dataframe): if sys.version_info[1] >= 10: read_patch = mocker.patch("pandas.read_csv", return_value=mocked_dataframe) df.load() - read_patch.assert_called_once_with(mocked_csv_in_s3, storage_options={}) + read_patch.assert_called_once_with(mocked_csv_in_s3, storage_options={'open_args_save': {'mode': 'wb'}}) else: loaded = df.load() assert_frame_equal(loaded, mocked_dataframe) From 079b0a35cae57a239c66c869ebaf6dd4f7c83bc6 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 14 Aug 2024 14:49:45 +0100 Subject: [PATCH 05/18] Fix lint Signed-off-by: Merel Theisen --- kedro-datasets/tests/pandas/test_csv_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index b3aac46cb..4f435b492 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -408,7 +408,9 @@ def test_load_and_confirm(self, mocker, mocked_csv_in_s3, mocked_dataframe): if sys.version_info[1] >= 10: read_patch = mocker.patch("pandas.read_csv", return_value=mocked_dataframe) df.load() - read_patch.assert_called_once_with(mocked_csv_in_s3, storage_options={'open_args_save': {'mode': 'wb'}}) + read_patch.assert_called_once_with( + mocked_csv_in_s3, storage_options={"open_args_save": {"mode": "wb"}} + ) else: loaded = df.load() assert_frame_equal(loaded, mocked_dataframe) From 102638035806815e41d406427d822c0e74905380 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 14 Aug 2024 16:28:40 +0100 Subject: [PATCH 06/18] Separate fs save and load args again Signed-off-by: Merel Theisen --- kedro-datasets/kedro_datasets/pandas/csv_dataset.py | 12 +++++++----- .../kedro_datasets/pandas/json_dataset.py | 13 ++++++++----- kedro-datasets/tests/pandas/test_csv_dataset.py | 4 +--- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index cbc23618b..80c61b3d9 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -112,18 +112,20 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - self._fs_args = deepcopy(self.DEFAULT_FS_ARGS) + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) if fs_args is not None: - self._fs_args.update(fs_args) + _fs_args.update(fs_args) + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": - self._fs_args.setdefault("auto_mkdir", True) + _fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._storage_options = {**_credentials, **self._fs_args} + self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) self.metadata = metadata @@ -180,7 +182,7 @@ def _save(self, data: pd.DataFrame) -> None: buf = BytesIO() data.to_csv(path_or_buf=buf, **self._save_args) - fs_open_args_save = self._fs_args.get("open_args_save", {}) + fs_open_args_save = self._fs_open_args_save or {} save_mode = fs_open_args_save.get( "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] ) diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 13b0e6908..56f99bf08 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -108,17 +108,20 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - self._fs_args = deepcopy(self.DEFAULT_FS_ARGS) + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) if fs_args is not None: - self._fs_args.update(fs_args) + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": - self._fs_args.setdefault("auto_mkdir", True) + _fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._storage_options = {**_credentials, **self._fs_args} + self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) self.metadata = metadata @@ -176,7 +179,7 @@ def _save(self, data: pd.DataFrame) -> None: buf = BytesIO() data.to_json(path_or_buf=buf, **self._save_args) - fs_open_args_save = self._fs_args.get("open_args_save", {}) + fs_open_args_save = self._fs_open_args_save or {} save_mode = fs_open_args_save.get( "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] ) diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 4f435b492..0954b4b9b 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -408,9 +408,7 @@ def test_load_and_confirm(self, mocker, mocked_csv_in_s3, mocked_dataframe): if sys.version_info[1] >= 10: read_patch = mocker.patch("pandas.read_csv", return_value=mocked_dataframe) df.load() - read_patch.assert_called_once_with( - mocked_csv_in_s3, storage_options={"open_args_save": {"mode": "wb"}} - ) + read_patch.assert_called_once_with(mocked_csv_in_s3, storage_options={}) else: loaded = df.load() assert_frame_equal(loaded, mocked_dataframe) From 0725a9646d4145a0b0b6322384a0a0866441ee1c Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 14 Aug 2024 17:27:22 +0100 Subject: [PATCH 07/18] Add tests for coverage Signed-off-by: Merel Theisen --- kedro-datasets/tests/pandas/test_csv_dataset.py | 10 ++++++++++ kedro-datasets/tests/pandas/test_json_dataset.py | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 0954b4b9b..92538d30e 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -115,6 +115,16 @@ def test_save_extra_params(self, csv_dataset, save_args): for key, value in save_args.items(): assert csv_dataset._save_args[key] == value + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"k1": "v1"}, "open_args_save": {"index": "value"}}], + indirect=True, + ) + def test_fs_extra_params(self, csv_dataset, fs_args): + """Test overriding the default fs arguments.""" + assert csv_dataset._fs_open_args_load == {"k1": "v1"} + assert csv_dataset._fs_open_args_save == {"index": "value"} + @pytest.mark.parametrize( "load_args,save_args", [ diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index 04536c20d..2e2beacc5 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -83,6 +83,16 @@ def test_save_extra_params(self, json_dataset, save_args): for key, value in save_args.items(): assert json_dataset._save_args[key] == value + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"k1": "v1"}, "open_args_save": {"index": "value"}}], + indirect=True, + ) + def test_fs_extra_params(self, json_dataset, fs_args): + """Test overriding the default fs arguments.""" + assert json_dataset._fs_open_args_load == {"k1": "v1"} + assert json_dataset._fs_open_args_save == {"index": "value"} + @pytest.mark.parametrize( "load_args,save_args", [ From c7ba31472a5b8b63d260d40384bf602d18fcd504 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 14 Aug 2024 18:06:28 +0100 Subject: [PATCH 08/18] Try simplify init Signed-off-by: Merel Theisen --- .../kedro_datasets/pandas/csv_dataset.py | 22 +++++++++---------- .../kedro_datasets/pandas/json_dataset.py | 22 +++++++++---------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 80c61b3d9..a136167d2 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -112,14 +112,16 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) + _fs_args = deepcopy(fs_args) or {} + # Handle default fs load and save arguments + self._fs_open_args_load = _fs_args.pop( + "open_args_load", self.DEFAULT_FS_ARGS.get("open_args_load", {}) + ) + self._fs_open_args_save = _fs_args.pop( + "open_args_save", self.DEFAULT_FS_ARGS.get("open_args_save", {}) + ) - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} - protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": _fs_args.setdefault("auto_mkdir", True) @@ -137,12 +139,8 @@ def __init__( # noqa: PLR0913 ) # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 56f99bf08..0a8f31568 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -108,12 +108,14 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + # Handle default fs load and save arguments + self._fs_open_args_load = _fs_args.pop( + "open_args_load", self.DEFAULT_FS_ARGS.get("open_args_load", {}) + ) + self._fs_open_args_save = _fs_args.pop( + "open_args_save", self.DEFAULT_FS_ARGS.get("open_args_save", {}) + ) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -134,12 +136,8 @@ def __init__( # noqa: PLR0913 ) # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( From 9edd0f9570f4b9621139057abfea275201a34bb8 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Aug 2024 11:55:07 +0100 Subject: [PATCH 09/18] Remove writing to buffer and use save_args both in saving and conversion Signed-off-by: Merel Theisen --- .../kedro_datasets/pandas/csv_dataset.py | 25 ++++--------------- .../kedro_datasets/pandas/json_dataset.py | 23 +++-------------- .../tests/pandas/test_csv_dataset.py | 10 -------- .../tests/pandas/test_json_dataset.py | 10 -------- 4 files changed, 8 insertions(+), 60 deletions(-) diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index a136167d2..b2d99ebb8 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -5,7 +5,6 @@ import logging from copy import deepcopy -from io import BytesIO from pathlib import PurePosixPath from typing import Any @@ -69,8 +68,7 @@ class CSVDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False} - DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False, "mode": "w"} def __init__( # noqa: PLR0913 self, @@ -113,15 +111,8 @@ def __init__( # noqa: PLR0913 This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} - # Handle default fs load and save arguments - self._fs_open_args_load = _fs_args.pop( - "open_args_load", self.DEFAULT_FS_ARGS.get("open_args_load", {}) - ) - self._fs_open_args_save = _fs_args.pop( - "open_args_save", self.DEFAULT_FS_ARGS.get("open_args_save", {}) - ) - _credentials = deepcopy(credentials) or {} + protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": _fs_args.setdefault("auto_mkdir", True) @@ -129,6 +120,7 @@ def __init__( # noqa: PLR0913 self._protocol = protocol self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata super().__init__( @@ -177,15 +169,8 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - buf = BytesIO() - data.to_csv(path_or_buf=buf, **self._save_args) - - fs_open_args_save = self._fs_open_args_save or {} - save_mode = fs_open_args_save.get( - "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] - ) - with self._fs.open(save_path, mode=save_mode) as fs_file: - fs_file.write(buf.getvalue()) + with self._fs.open(save_path, **self._save_args) as fs_file: + data.to_csv(path_or_buf=fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 0a8f31568..bdbcfcb8c 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -5,7 +5,6 @@ import logging from copy import deepcopy -from io import BytesIO from pathlib import PurePosixPath from typing import Any @@ -65,8 +64,7 @@ class JSONDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {} - DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "w"} def __init__( # noqa: PLR0913 self, @@ -109,14 +107,6 @@ def __init__( # noqa: PLR0913 This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} - # Handle default fs load and save arguments - self._fs_open_args_load = _fs_args.pop( - "open_args_load", self.DEFAULT_FS_ARGS.get("open_args_load", {}) - ) - self._fs_open_args_save = _fs_args.pop( - "open_args_save", self.DEFAULT_FS_ARGS.get("open_args_save", {}) - ) - _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": @@ -174,15 +164,8 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - buf = BytesIO() - data.to_json(path_or_buf=buf, **self._save_args) - - fs_open_args_save = self._fs_open_args_save or {} - save_mode = fs_open_args_save.get( - "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] - ) - with self._fs.open(save_path, mode=save_mode) as fs_file: - fs_file.write(buf.getvalue()) + with self._fs.open(save_path, **self._save_args) as fs_file: + data.to_json(path_or_buf=fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 92538d30e..0954b4b9b 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -115,16 +115,6 @@ def test_save_extra_params(self, csv_dataset, save_args): for key, value in save_args.items(): assert csv_dataset._save_args[key] == value - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"k1": "v1"}, "open_args_save": {"index": "value"}}], - indirect=True, - ) - def test_fs_extra_params(self, csv_dataset, fs_args): - """Test overriding the default fs arguments.""" - assert csv_dataset._fs_open_args_load == {"k1": "v1"} - assert csv_dataset._fs_open_args_save == {"index": "value"} - @pytest.mark.parametrize( "load_args,save_args", [ diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index 2e2beacc5..04536c20d 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -83,16 +83,6 @@ def test_save_extra_params(self, json_dataset, save_args): for key, value in save_args.items(): assert json_dataset._save_args[key] == value - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"k1": "v1"}, "open_args_save": {"index": "value"}}], - indirect=True, - ) - def test_fs_extra_params(self, json_dataset, fs_args): - """Test overriding the default fs arguments.""" - assert json_dataset._fs_open_args_load == {"k1": "v1"} - assert json_dataset._fs_open_args_save == {"index": "value"} - @pytest.mark.parametrize( "load_args,save_args", [ From 5905ff6cce45dfdd0fc9da6f3baabd79d4104be9 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Aug 2024 12:30:37 +0100 Subject: [PATCH 10/18] Fix tests Signed-off-by: Merel Theisen --- kedro-datasets/tests/pandas/test_csv_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_json_dataset.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 0954b4b9b..b371c080b 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -243,8 +243,8 @@ def test_version_str_repr(self, load_version, save_version): assert "protocol" in str(ds_versioned) assert "protocol" in str(ds) # Default save_args - assert "save_args={'index': False}" in str(ds) - assert "save_args={'index': False}" in str(ds_versioned) + assert "save_args={'index': False, 'mode': w}" in str(ds) + assert "save_args={'index': False, 'mode': w}" in str(ds_versioned) def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index 04536c20d..26ed307d2 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -195,6 +195,9 @@ def test_version_str_repr(self, load_version, save_version): assert "JSONDataset" in str(ds) assert "protocol" in str(ds_versioned) assert "protocol" in str(ds) + # Default save_args + assert "save_args={'mode': w}" in str(ds) + assert "save_args={'mode': w}" in str(ds_versioned) def test_save_and_load(self, versioned_json_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for From 0de251deef72f760c947c920de7d7f9f1ee432cc Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Aug 2024 13:54:47 +0100 Subject: [PATCH 11/18] Revert back to using fs_args to pass mode argument Signed-off-by: Merel Theisen --- .../kedro_datasets/pandas/csv_dataset.py | 20 +++++++++----- .../kedro_datasets/pandas/feather_dataset.py | 27 ++++++++++++------- .../kedro_datasets/pandas/json_dataset.py | 20 ++++++++++---- .../tests/pandas/test_csv_dataset.py | 14 ++++++++-- .../tests/pandas/test_feather_dataset.py | 10 +++++++ .../tests/pandas/test_json_dataset.py | 13 ++++++--- 6 files changed, 79 insertions(+), 25 deletions(-) diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index b2d99ebb8..7d20528a1 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -68,7 +68,8 @@ class CSVDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False, "mode": "w"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} def __init__( # noqa: PLR0913 self, @@ -105,12 +106,16 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). - Defaults are preserved, apart from the `open_args_save` `mode` which is set to `wb`. - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `w`. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -120,7 +125,6 @@ def __init__( # noqa: PLR0913 self._protocol = protocol self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) - self.metadata = metadata super().__init__( @@ -169,7 +173,11 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - with self._fs.open(save_path, **self._save_args) as fs_file: + fs_open_args_save = self._fs_open_args_save or {} + save_mode = fs_open_args_save.get( + "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] + ) + with self._fs.open(save_path, mode=save_mode) as fs_file: data.to_csv(path_or_buf=fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index eb1f115f0..9629a0ae7 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -69,6 +69,7 @@ class FeatherDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -105,10 +106,18 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `wb`. + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -129,12 +138,8 @@ def __init__( # noqa: PLR0913 ) # Handle default load argument - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( @@ -172,8 +177,12 @@ def _save(self, data: pd.DataFrame) -> None: buf = BytesIO() data.to_feather(buf, **self._save_args) - - with self._fs.open(save_path, mode="wb") as fs_file: + fs_open_args_save = self._fs_open_args_save or {} + save_mode = fs_open_args_save.get( + "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] + ) + with self._fs.open(save_path, mode=save_mode) as fs_file: + # data.to_feather(fs_file, **self._save_args) fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index bdbcfcb8c..6c74aefc0 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -64,7 +64,8 @@ class JSONDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): """ DEFAULT_LOAD_ARGS: dict[str, Any] = {} - DEFAULT_SAVE_ARGS: dict[str, Any] = {"mode": "w"} + DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} def __init__( # noqa: PLR0913 self, @@ -101,12 +102,17 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{'token': None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). - Defaults are preserved, apart from the `open_args_save` `mode` which is set to `wb`. - Note that the save method requires bytes, so any save mode provided should include "b" for bytes. + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `w`. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": @@ -164,7 +170,11 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - with self._fs.open(save_path, **self._save_args) as fs_file: + fs_open_args_save = self._fs_open_args_save or {} + save_mode = fs_open_args_save.get( + "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] + ) + with self._fs.open(save_path, mode=save_mode) as fs_file: data.to_json(path_or_buf=fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index b371c080b..92538d30e 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -115,6 +115,16 @@ def test_save_extra_params(self, csv_dataset, save_args): for key, value in save_args.items(): assert csv_dataset._save_args[key] == value + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"k1": "v1"}, "open_args_save": {"index": "value"}}], + indirect=True, + ) + def test_fs_extra_params(self, csv_dataset, fs_args): + """Test overriding the default fs arguments.""" + assert csv_dataset._fs_open_args_load == {"k1": "v1"} + assert csv_dataset._fs_open_args_save == {"index": "value"} + @pytest.mark.parametrize( "load_args,save_args", [ @@ -243,8 +253,8 @@ def test_version_str_repr(self, load_version, save_version): assert "protocol" in str(ds_versioned) assert "protocol" in str(ds) # Default save_args - assert "save_args={'index': False, 'mode': w}" in str(ds) - assert "save_args={'index': False, 'mode': w}" in str(ds_versioned) + assert "save_args={'index': False}" in str(ds) + assert "save_args={'index': False}" in str(ds_versioned) def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index 317921258..0959adb73 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -58,6 +58,16 @@ def test_load_extra_params(self, feather_dataset, load_args): for key, value in load_args.items(): assert feather_dataset._load_args[key] == value + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"k1": "v1"}, "open_args_save": {"index": "value"}}], + indirect=True, + ) + def test_fs_extra_params(self, feather_dataset, fs_args): + """Test overriding the default fs arguments.""" + assert feather_dataset._fs_open_args_load == {"k1": "v1"} + assert feather_dataset._fs_open_args_save == {"index": "value"} + @pytest.mark.parametrize( "load_args,save_args", [ diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index 26ed307d2..2e2beacc5 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -83,6 +83,16 @@ def test_save_extra_params(self, json_dataset, save_args): for key, value in save_args.items(): assert json_dataset._save_args[key] == value + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"k1": "v1"}, "open_args_save": {"index": "value"}}], + indirect=True, + ) + def test_fs_extra_params(self, json_dataset, fs_args): + """Test overriding the default fs arguments.""" + assert json_dataset._fs_open_args_load == {"k1": "v1"} + assert json_dataset._fs_open_args_save == {"index": "value"} + @pytest.mark.parametrize( "load_args,save_args", [ @@ -195,9 +205,6 @@ def test_version_str_repr(self, load_version, save_version): assert "JSONDataset" in str(ds) assert "protocol" in str(ds_versioned) assert "protocol" in str(ds) - # Default save_args - assert "save_args={'mode': w}" in str(ds) - assert "save_args={'mode': w}" in str(ds_versioned) def test_save_and_load(self, versioned_json_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for From 323899722b805774a450d0cb80eb8defed657cdf Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Aug 2024 14:18:14 +0100 Subject: [PATCH 12/18] Use fs_args to pass mode for all pandas based datasets Signed-off-by: Merel Theisen --- .../kedro_datasets/pandas/csv_dataset.py | 6 +-- .../kedro_datasets/pandas/excel_dataset.py | 43 +++++++++---------- .../kedro_datasets/pandas/feather_dataset.py | 15 ++----- .../kedro_datasets/pandas/gbq_dataset.py | 12 ++---- .../kedro_datasets/pandas/hdf_dataset.py | 24 +++++------ .../kedro_datasets/pandas/json_dataset.py | 6 +-- .../kedro_datasets/pandas/parquet_dataset.py | 28 ++++++------ .../kedro_datasets/pandas/sql_dataset.py | 8 +--- .../kedro_datasets/pandas/xml_dataset.py | 28 ++++++------ 9 files changed, 69 insertions(+), 101 deletions(-) diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 7d20528a1..c0e53de93 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -173,11 +173,7 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - fs_open_args_save = self._fs_open_args_save or {} - save_mode = fs_open_args_save.get( - "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] - ) - with self._fs.open(save_path, mode=save_mode) as fs_file: + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: data.to_csv(path_or_buf=fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 601ef377e..dc29fffdb 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -5,7 +5,6 @@ import logging from copy import deepcopy -from io import BytesIO from pathlib import PurePosixPath from typing import Any, Union @@ -109,6 +108,7 @@ class ExcelDataset( DEFAULT_LOAD_ARGS = {"engine": "openpyxl"} DEFAULT_SAVE_ARGS = {"index": False} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -153,13 +153,20 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `wb`. + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DatasetError: If versioning is enabled while in append mode. """ - _fs_args = deepcopy(fs_args) or {} + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -179,15 +186,10 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) + # Handle default load and save arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} - # Handle default save arguments - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) self._writer_args = self._save_args.pop("writer", {}) # type: ignore self._writer_args.setdefault("engine", engine or "openpyxl") # type: ignore @@ -230,20 +232,17 @@ def _load(self) -> pd.DataFrame | dict[str, pd.DataFrame]: ) def _save(self, data: pd.DataFrame | dict[str, pd.DataFrame]) -> None: - output = BytesIO() save_path = get_filepath_str(self._get_save_path(), self._protocol) - with pd.ExcelWriter(output, **self._writer_args) as writer: - if isinstance(data, dict): - for sheet_name, sheet_data in data.items(): - sheet_data.to_excel( - writer, sheet_name=sheet_name, **self._save_args - ) - else: - data.to_excel(writer, **self._save_args) - - with self._fs.open(save_path, mode="wb") as fs_file: - fs_file.write(output.getvalue()) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + with pd.ExcelWriter(fs_file, **self._writer_args) as writer: + if isinstance(data, dict): + for sheet_name, sheet_data in data.items(): + sheet_data.to_excel( + writer, sheet_name=sheet_name, **self._save_args + ) + else: + data.to_excel(writer, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index 9629a0ae7..eafac95fc 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -6,7 +6,6 @@ import logging from copy import deepcopy -from io import BytesIO from pathlib import PurePosixPath from typing import Any @@ -117,7 +116,6 @@ def __init__( # noqa: PLR0913 self._fs_open_args_load = _fs_args.pop("open_args_load", {}) self._fs_open_args_save = _fs_args.pop("open_args_save", {}) - _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -137,7 +135,7 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load argument + # Handle default load and save arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} @@ -175,15 +173,8 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - buf = BytesIO() - data.to_feather(buf, **self._save_args) - fs_open_args_save = self._fs_open_args_save or {} - save_mode = fs_open_args_save.get( - "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] - ) - with self._fs.open(save_path, mode=save_mode) as fs_file: - # data.to_feather(fs_file, **self._save_args) - fs_file.write(buf.getvalue()) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + data.to_feather(fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index 060316204..f16f828f7 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -107,12 +107,8 @@ def __init__( # noqa: PLR0913 are different. """ # Handle default load and save arguments - self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} self._validate_location() validate_on_forbidden_chars(dataset=dataset, table_name=table_name) @@ -262,9 +258,7 @@ def __init__( # noqa: PLR0913 ) # Handle default load arguments - self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._project_id = project diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index 227b26133..0d44fd1a5 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -60,6 +60,7 @@ class HDFDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): _lock = Lock() DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -106,9 +107,12 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} - _fs_open_args_load = _fs_args.pop("open_args_load", {}) - _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -129,17 +133,9 @@ def __init__( # noqa: PLR0913 self._key = key - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load argument + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 6c74aefc0..08d275a97 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -170,11 +170,7 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - fs_open_args_save = self._fs_open_args_save or {} - save_mode = fs_open_args_save.get( - "mode", self.DEFAULT_FS_ARGS["open_args_save"]["mode"] - ) - with self._fs.open(save_path, mode=save_mode) as fs_file: + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: data.to_json(path_or_buf=fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index 760d5a8f3..b3a9f47f1 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -5,7 +5,6 @@ import logging from copy import deepcopy -from io import BytesIO from pathlib import Path, PurePosixPath from typing import Any @@ -77,6 +76,7 @@ class ParquetDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -116,10 +116,17 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `wb`. + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -139,13 +146,9 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + # Handle default load argument + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( @@ -193,11 +196,8 @@ def _save(self, data: pd.DataFrame) -> None: f"'partition_cols'. Please use 'kedro.io.PartitionedDataset' instead." ) - bytes_buffer = BytesIO() - data.to_parquet(bytes_buffer, **self._save_args) - - with self._fs.open(save_path, mode="wb") as fs_file: - fs_file.write(bytes_buffer.getvalue()) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + data.to_parquet(fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index f750ab685..3c5769c62 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -210,12 +210,8 @@ def __init__( # noqa: PLR0913 ) # Handle default load and save arguments - self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} self._load_args["table_name"] = table_name self._save_args["name"] = table_name diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index b1173f43e..6d73db722 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -5,7 +5,6 @@ import logging from copy import deepcopy -from io import BytesIO from pathlib import PurePosixPath from typing import Any @@ -47,6 +46,7 @@ class XMLDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {"index": False} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -83,10 +83,17 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `wb`. + Note that the save method requires bytes, so any save mode provided should include "b" for bytes. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -106,13 +113,9 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + # Handle default load argument + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( @@ -149,11 +152,8 @@ def _load(self) -> pd.DataFrame: def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - buf = BytesIO() - data.to_xml(path_or_buffer=buf, **self._save_args) - - with self._fs.open(save_path, mode="wb") as fs_file: - fs_file.write(buf.getvalue()) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + data.to_xml(path_or_buffer=fs_file, **self._save_args) self._invalidate_cache() From ef11faacc8f6da53a7ac388136fcec76aa1fc348 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Aug 2024 14:41:12 +0100 Subject: [PATCH 13/18] Make other datasets use fs_args for handling mode as well Signed-off-by: Merel Theisen --- .../kedro_datasets/matlab/matlab_dataset.py | 23 +++++++------- .../kedro_datasets/pandas/hdf_dataset.py | 4 +-- .../kedro_datasets/polars/csv_dataset.py | 30 +++++++++---------- .../polars/lazy_polars_dataset.py | 27 ++++++++--------- .../tests/matlab/test_matlab_dataset.py | 4 +-- 5 files changed, 42 insertions(+), 46 deletions(-) diff --git a/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py b/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py index a74d74209..0edee60bd 100644 --- a/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py +++ b/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py @@ -53,6 +53,7 @@ class MatlabDataset(AbstractVersionedDataset[np.ndarray, np.ndarray]): """ DEFAULT_SAVE_ARGS: dict[str, Any] = {"indent": 2} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa = PLR0913 self, @@ -83,14 +84,16 @@ def __init__( # noqa = PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. + All defaults are preserved, except `mode`, which is set to `wb` when saving. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} - _fs_open_args_load = _fs_args.pop("open_args_load", {}) - _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -107,13 +110,7 @@ def __init__( # noqa = PLR0913 glob_function=self._fs.glob, ) # Handle default save arguments - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} def _describe(self) -> dict[str, Any]: return { @@ -134,7 +131,7 @@ def _load(self) -> np.ndarray: def _save(self, data: np.ndarray) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - with self._fs.open(save_path, mode="wb") as f: + with self._fs.open(save_path, **self._fs_open_args_save) as f: io.savemat(f, {"data": data}) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index 0d44fd1a5..624d429fc 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -103,7 +103,7 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set `wb` when saving. + All defaults are preserved, except `open_args_save` `mode`, which is set `wb` when saving. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ @@ -133,7 +133,7 @@ def __init__( # noqa: PLR0913 self._key = key - # Handle default load argument + # Handle default load and save arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 1195ce295..f6dc2d3ab 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -5,7 +5,6 @@ import logging from copy import deepcopy -from io import BytesIO from pathlib import PurePosixPath from typing import Any @@ -67,6 +66,7 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): DEFAULT_LOAD_ARGS: dict[str, Any] = {"rechunk": True} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} def __init__( # noqa: PLR0913 self, @@ -92,8 +92,8 @@ def __init__( # noqa: PLR0913 load_args: Polars options for loading CSV files. Here you can find all available arguments: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_csv.html#polars.read_csv - All defaults are preserved, but we explicity use `rechunk=True` for `seaborn` - compability. + All defaults are preserved, but we explicitly use `rechunk=True` for `seaborn` + compatibility. save_args: Polars options for saving CSV files. Here you can find all available arguments: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_csv.html @@ -106,10 +106,17 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + Defaults are preserved, apart from the `open_args_save` `mode` which is set to `w`. + metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(fs_args) or {} + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -130,12 +137,8 @@ def __init__( # noqa: PLR0913 ) # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( @@ -172,11 +175,8 @@ def _load(self) -> pl.DataFrame: def _save(self, data: pl.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - buf = BytesIO() - data.write_csv(file=buf, **self._save_args) - - with self._fs.open(save_path, mode="wb") as fs_file: - fs_file.write(buf.getvalue()) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + data.write_csv(file=fs_file, **self._save_args) self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py b/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py index 2e650e52e..652f08947 100644 --- a/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py @@ -6,7 +6,6 @@ import logging from copy import deepcopy -from io import BytesIO from pathlib import PurePosixPath from typing import Any, ClassVar, Union @@ -74,6 +73,7 @@ class LazyPolarsDataset(AbstractVersionedDataset[pl.LazyFrame, PolarsFrame]): DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {} DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -126,8 +126,7 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. + All defaults are preserved, except `mode`, which is set to `wb` when saving. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. Raises: @@ -144,7 +143,12 @@ def __init__( # noqa: PLR0913 "https://pola-rs.github.io/polars/py-polars/html/reference/io.html" ) - _fs_args = deepcopy(fs_args) or {} + _fs_args = deepcopy(self.DEFAULT_FS_ARGS) + if fs_args is not None: + _fs_args.update(fs_args) + + self._fs_open_args_load = _fs_args.pop("open_args_load", {}) + self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -165,12 +169,8 @@ def __init__( # noqa: PLR0913 ) # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( @@ -218,10 +218,9 @@ def _save(self, data: pl.DataFrame | pl.LazyFrame) -> None: # https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_parquet.html save_method = getattr(collected_data, f"write_{self._file_format}", None) if save_method: - buf = BytesIO() - save_method(file=buf, **self._save_args) - with self._fs.open(save_path, mode="wb") as fs_file: - fs_file.write(buf.getvalue()) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + save_method(file=fs_file, **self._save_args) + self._invalidate_cache() # How the LazyPolarsDataset logic is currently written with # ACCEPTED_FILE_FORMATS and a check in the `__init__` method, diff --git a/kedro-datasets/tests/matlab/test_matlab_dataset.py b/kedro-datasets/tests/matlab/test_matlab_dataset.py index a7a935962..331702db9 100644 --- a/kedro-datasets/tests/matlab/test_matlab_dataset.py +++ b/kedro-datasets/tests/matlab/test_matlab_dataset.py @@ -41,7 +41,7 @@ def test_save_and_load(self, matlab_dataset, dummy_data): reloaded = matlab_dataset.load() assert (dummy_data == reloaded["data"]).all() assert matlab_dataset._fs_open_args_load == {} - assert matlab_dataset._fs_open_args_save == {"mode": "w"} + assert matlab_dataset._fs_open_args_save == {"mode": "wb"} def test_exists(self, matlab_dataset, dummy_data): """Test `exists` method invocation for both existing and @@ -65,7 +65,7 @@ def test_save_extra_params(self, matlab_dataset, save_args): ) def test_open_extra_args(self, matlab_dataset, fs_args): assert matlab_dataset._fs_open_args_load == fs_args["open_args_load"] - assert matlab_dataset._fs_open_args_save == {"mode": "w"} # default unchanged + assert matlab_dataset._fs_open_args_save == {"mode": "wb"} # default unchanged def test_load_missing_file(self, matlab_dataset): """Check the error when trying to load missing file.""" From 97e49a2ecbe6076a5998576ecd07b6ddb7626009 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Aug 2024 16:07:20 +0100 Subject: [PATCH 14/18] Refactor and make all datasets consistent Signed-off-by: Merel Theisen --- .../biosequence/biosequence_dataset.py | 27 +++++++++------- .../kedro_datasets/dask/csv_dataset.py | 8 ++--- .../kedro_datasets/dask/parquet_dataset.py | 8 ++--- .../kedro_datasets/email/message_dataset.py | 32 +++++++++---------- .../holoviews/holoviews_writer.py | 4 +-- .../kedro_datasets/json/json_dataset.py | 24 +++++++------- .../kedro_datasets/matlab/matlab_dataset.py | 19 +++++++---- .../matplotlib/matplotlib_writer.py | 4 +-- .../kedro_datasets/networkx/gml_dataset.py | 32 +++++++++++-------- .../networkx/graphml_dataset.py | 32 +++++++++++-------- .../kedro_datasets/networkx/json_dataset.py | 28 ++++++++-------- .../kedro_datasets/pandas/csv_dataset.py | 18 +++++++---- .../pandas/deltatable_dataset.py | 10 ++---- .../kedro_datasets/pandas/excel_dataset.py | 19 +++++++---- .../kedro_datasets/pandas/feather_dataset.py | 19 +++++++---- .../kedro_datasets/pandas/generic_dataset.py | 25 ++++++++------- .../kedro_datasets/pandas/hdf_dataset.py | 19 +++++++---- .../kedro_datasets/pandas/json_dataset.py | 19 +++++++---- .../kedro_datasets/pandas/parquet_dataset.py | 19 +++++++---- .../kedro_datasets/pandas/xml_dataset.py | 19 +++++++---- .../kedro_datasets/pickle/pickle_dataset.py | 21 ++++++------ .../kedro_datasets/pillow/image_dataset.py | 22 +++++++------ .../kedro_datasets/plotly/json_dataset.py | 23 ++++++------- .../kedro_datasets/plotly/plotly_dataset.py | 13 ++++++-- .../snowflake/snowpark_dataset.py | 9 ++---- .../kedro_datasets/spark/spark_dataset.py | 8 ++--- .../spark/spark_jdbc_dataset.py | 9 ++---- .../spark/spark_streaming_dataset.py | 9 ++---- .../svmlight/svmlight_dataset.py | 28 +++++++++------- .../tensorflow/tensorflow_model_dataset.py | 8 ++--- .../kedro_datasets/text/text_dataset.py | 18 ++++++++--- .../kedro_datasets/yaml/yaml_dataset.py | 20 ++++++------ .../netcdf/netcdf_dataset.py | 8 ++--- .../rioxarray/geotiff_dataset.py | 9 ++---- .../tests/pandas/test_csv_dataset.py | 2 +- .../tests/pandas/test_feather_dataset.py | 2 +- .../tests/pandas/test_json_dataset.py | 2 +- 37 files changed, 316 insertions(+), 280 deletions(-) diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index bba06f185..6c44d3d19 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -44,6 +44,10 @@ class BioSequenceDataset(AbstractDataset[list, list]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = { + "open_args_save": {"mode": "w"}, + "open_args_load": {"mode": "r"}, + } def __init__( # noqa: PLR0913 self, @@ -96,18 +100,17 @@ def __init__( # noqa: PLR0913 self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_load.setdefault("mode", "r") - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load and save and fs arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } self.metadata = metadata diff --git a/kedro-datasets/kedro_datasets/dask/csv_dataset.py b/kedro-datasets/kedro_datasets/dask/csv_dataset.py index 0e02f6ade..faee81a01 100644 --- a/kedro-datasets/kedro_datasets/dask/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/csv_dataset.py @@ -84,12 +84,8 @@ def __init__( # noqa: PLR0913 self.metadata = metadata # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} @property def fs_args(self) -> dict[str, Any]: diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index c75d067aa..402e97a27 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -114,12 +114,8 @@ def __init__( # noqa: PLR0913 self.metadata = metadata # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} @property def fs_args(self) -> dict[str, Any]: diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index df60a3c2a..15050901f 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -8,7 +8,6 @@ from email.generator import Generator from email.message import Message from email.parser import Parser -from email.policy import default from pathlib import PurePosixPath from typing import Any @@ -55,6 +54,10 @@ class EmailMessageDataset(AbstractVersionedDataset[Message, Message]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = { + "open_args_save": {"mode": "w"}, + "open_args_load": {"mode": "r"}, + } def __init__( # noqa: PLR0913 self, @@ -129,22 +132,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._parser_args = self._load_args.pop("parser", {"policy": default}) - - # Handle default save arguments - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - self._generator_args = self._save_args.pop("generator", {}) - - _fs_open_args_load.setdefault("mode", "r") - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load and save and fs arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 97e6446a9..d2c310df4 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -100,9 +100,7 @@ def __init__( # noqa: PLR0913 self._fs_open_args_save = _fs_open_args_save # Handle default save arguments - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index cc882e75f..76f6e15d0 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -54,6 +54,7 @@ class JSONDataset(AbstractVersionedDataset[Any, Any]): """ DEFAULT_SAVE_ARGS: dict[str, Any] = {"indent": 2} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} def __init__( # noqa: PLR0913 self, @@ -89,16 +90,15 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. + All defaults are preserved, except `mode`, which is set to `w` when saving. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) _fs_open_args_save = _fs_args.pop("open_args_save", {}) - _credentials = deepcopy(credentials) or {} + _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) self._protocol = protocol @@ -115,14 +115,16 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default save arguments - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default save and fs arguments + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py b/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py index 0edee60bd..916f5e940 100644 --- a/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py +++ b/kedro-datasets/kedro_datasets/matlab/matlab_dataset.py @@ -88,12 +88,9 @@ def __init__( # noqa = PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -109,8 +106,16 @@ def __init__( # noqa = PLR0913 exists_function=self._fs.exists, glob_function=self._fs.glob, ) - # Handle default save arguments + # Handle default save and fs arguments self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index 377c4dfbd..33cc6d12c 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -178,9 +178,7 @@ def __init__( # noqa: PLR0913 self._fs_open_args_save = _fs_open_args_save # Handle default save arguments - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} if overwrite and version is not None: warn( diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index dec121bee..ff6916b61 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -40,6 +40,10 @@ class GMLDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = { + "open_args_save": {"mode": "wb"}, + "open_args_load": {"mode": "rb"}, + } def __init__( # noqa: PLR0913 self, @@ -74,9 +78,9 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. - metadata: Any Any arbitrary metadata. + All defaults are preserved, except `mode`, which is set to `rb` when loading + and to `wb` when saving. + metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} @@ -100,17 +104,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - _fs_open_args_load.setdefault("mode", "rb") - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load and save and fs arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _load(self) -> networkx.Graph: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index f69113533..cccffee20 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -39,6 +39,10 @@ class GraphMLDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = { + "open_args_save": {"mode": "wb"}, + "open_args_load": {"mode": "rb"}, + } def __init__( # noqa: PLR0913 self, @@ -73,9 +77,9 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. - metadata: Any arbitrary Any arbitrary metadata. + All defaults are preserved, except `mode`, which is set to `rb` when loading + and to `wb` when saving. + metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} @@ -99,17 +103,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - _fs_open_args_load.setdefault("mode", "rb") - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load and save and fs arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _load(self) -> networkx.Graph: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 1bef8dc3d..a40d7b3e1 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -40,6 +40,7 @@ class JSONDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} def __init__( # noqa: PLR0913 self, @@ -74,9 +75,8 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. - metadata: Any Any arbitrary metadata. + All defaults are preserved, except `mode`, which is set to `w` when saving. + metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} @@ -100,17 +100,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load and save and fs arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _load(self) -> networkx.Graph: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index c0e53de93..e9487b216 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -110,12 +110,10 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -134,9 +132,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( diff --git a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py index fcc680d16..4e5db2f45 100644 --- a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py @@ -144,13 +144,9 @@ def __init__( # noqa: PLR0913 self.is_empty_dir: bool = False self._delta_table: DeltaTable | None = None - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args: - self._load_args.update(load_args) - - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args: - self._save_args.update(save_args) + # Handle default load and save arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} write_mode = self._save_args.get("mode", None) if write_mode not in self.ACCEPTED_WRITE_MODES: diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index dc29fffdb..d08250bdf 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -161,12 +161,9 @@ def __init__( # noqa: PLR0913 Raises: DatasetError: If versioning is enabled while in append mode. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -186,9 +183,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } self._writer_args = self._save_args.pop("writer", {}) # type: ignore self._writer_args.setdefault("engine", engine or "openpyxl") # type: ignore diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index eafac95fc..706eaee1b 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -110,12 +110,9 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -135,9 +132,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 4a4ec2726..77da9caf2 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -87,6 +87,7 @@ class GenericDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} def __init__( # noqa: PLR0913 self, @@ -137,8 +138,7 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. + All defaults are preserved, except `mode`, which is set to `w` when saving. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. @@ -170,16 +170,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load and save and fs arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _ensure_file_system_target(self) -> None: # Fail fast if provided a known non-filesystem target diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index 624d429fc..b3c56e2c7 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -107,12 +107,9 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -133,9 +130,17 @@ def __init__( # noqa: PLR0913 self._key = key - # Handle default load and save arguments + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 08d275a97..d4d0c87d6 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -106,12 +106,9 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -131,9 +128,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index b3a9f47f1..7aefe675e 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -121,12 +121,9 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -146,9 +143,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load argument + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index 6d73db722..8b86909dc 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -88,12 +88,9 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -113,9 +110,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load argument + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 3ef071e6c..30b4d3630 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -74,6 +74,7 @@ class PickleDataset(AbstractVersionedDataset[Any, Any]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -193,16 +194,16 @@ def __init__( # noqa: PLR0913 self._backend = backend # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index d3e2b838d..2f4ddae13 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -45,6 +45,7 @@ class ImageDataset(AbstractVersionedDataset[Image.Image, Image.Image]): """ DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}} def __init__( # noqa: PLR0913 self, @@ -80,8 +81,7 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. + All defaults are preserved, except `mode`, which is set to `wb` when saving. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ @@ -106,14 +106,16 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default save argument - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default save and fs arguments + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index 4e5182f69..f1a097909 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -57,6 +57,7 @@ class JSONDataset( DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} def __init__( # noqa: PLR0913 self, @@ -123,17 +124,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load and save and fs arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 68b64fd71..8874849ee 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -70,6 +70,8 @@ class PlotlyDataset(JSONDataset): """ + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} + def __init__( # noqa: PLR0913 self, *, @@ -133,8 +135,15 @@ def __init__( # noqa: PLR0913 _fs_open_args_save = _fs_args.pop("open_args_save", {}) _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default fs arguments + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } self.metadata = metadata diff --git a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py index 249f5d001..c8685aa49 100644 --- a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py +++ b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -from copy import deepcopy from typing import Any import snowflake.snowpark as sp @@ -156,12 +155,8 @@ def __init__( # noqa: PLR0913 ) schema = credentials["schema"] # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} self._table_name = table_name self._database = database diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 2fc9bd16c..e077d6390 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -366,12 +366,8 @@ def __init__( # noqa: PLR0913 ) # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} # Handle schema load argument self._schema = self._load_args.pop("schema", None) diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index 60e1443c0..3e4c7b2f4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -1,7 +1,6 @@ """SparkJDBCDataset to load and save a PySpark DataFrame via JDBC.""" from __future__ import annotations -from copy import deepcopy from typing import Any from kedro.io.core import AbstractDataset, DatasetError @@ -126,12 +125,8 @@ def __init__( # noqa: PLR0913 self.metadata = metadata # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} # Update properties in load_args and save_args with credentials. if credentials is not None: diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index f30770852..c63bd80cb 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,7 +1,6 @@ """SparkStreamingDataset to load and save a PySpark Streaming DataFrame.""" from __future__ import annotations -from copy import deepcopy from pathlib import PurePosixPath from typing import Any @@ -87,12 +86,8 @@ def __init__( # noqa: PLR0913 self._filepath = PurePosixPath(filepath) # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} # Handle schema load argument self._schema = self._load_args.pop("schema", None) diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index 9d6818eaf..8e879dff7 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -91,6 +91,10 @@ class SVMLightDataset(AbstractVersionedDataset[_DI, _DO]): DEFAULT_LOAD_ARGS: dict[str, Any] = {} DEFAULT_SAVE_ARGS: dict[str, Any] = {} + DEFAULT_FS_ARGS: dict[str, Any] = { + "open_args_save": {"mode": "wb"}, + "open_args_load": {"mode": "rb"}, + } def __init__( # noqa: PLR0913 self, @@ -123,6 +127,8 @@ def __init__( # noqa: PLR0913 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + All defaults are preserved, except `mode`, which is set to `rb` when loading + and to `wb` when saving. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ @@ -147,17 +153,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_load.setdefault("mode", "rb") - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default load and save and fs arguments + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self): return { diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 5a9bbc5a8..5c5dc27a1 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -129,12 +129,8 @@ def __init__( # noqa: PLR0913 self._tmp_prefix = "kedro_tensorflow_tmp" # temp prefix pattern # Handle default load and save arguments - self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} self._is_h5 = self._save_args.get("save_format") == "h5" diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 0432d066f..2f5c5684a 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -47,6 +47,11 @@ class TextDataset(AbstractVersionedDataset[str, str]): """ + DEFAULT_FS_ARGS: dict[str, Any] = { + "open_args_save": {"mode": "w"}, + "open_args_load": {"mode": "r"}, + } + def __init__( # noqa: PLR0913 self, *, @@ -102,10 +107,15 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - _fs_open_args_load.setdefault("mode", "r") - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + # Handle default fs arguments + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index 8e6325b68..b08060a12 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -54,6 +54,7 @@ class YAMLDataset(AbstractVersionedDataset[dict, dict]): """ DEFAULT_SAVE_ARGS: dict[str, Any] = {"default_flow_style": False} + DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} def __init__( # noqa: PLR0913 self, @@ -89,8 +90,7 @@ def __init__( # noqa: PLR0913 `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. + All defaults are preserved, except `mode`, which is set to `w` when saving. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ @@ -116,13 +116,15 @@ def __init__( # noqa: PLR0913 ) # Handle default save arguments - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) - - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/kedro_datasets_experimental/netcdf/netcdf_dataset.py b/kedro-datasets/kedro_datasets_experimental/netcdf/netcdf_dataset.py index da83c1139..391d5521b 100644 --- a/kedro-datasets/kedro_datasets_experimental/netcdf/netcdf_dataset.py +++ b/kedro-datasets/kedro_datasets_experimental/netcdf/netcdf_dataset.py @@ -129,12 +129,8 @@ def __init__( # noqa self.metadata = metadata # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} # Determine if multiple NetCDF files are being loaded in. self._is_multifile = ( diff --git a/kedro-datasets/kedro_datasets_experimental/rioxarray/geotiff_dataset.py b/kedro-datasets/kedro_datasets_experimental/rioxarray/geotiff_dataset.py index 5b290888a..08fa52419 100644 --- a/kedro-datasets/kedro_datasets_experimental/rioxarray/geotiff_dataset.py +++ b/kedro-datasets/kedro_datasets_experimental/rioxarray/geotiff_dataset.py @@ -3,7 +3,6 @@ returns a xarray.DataArray object. """ import logging -from copy import deepcopy from pathlib import PurePosixPath from typing import Any @@ -106,12 +105,8 @@ def __init__( # noqa: PLR0913 ) # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - if save_args is not None: - self._save_args.update(save_args) + self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} def _describe(self) -> dict[str, Any]: return { diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 92538d30e..6a5c52464 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -123,7 +123,7 @@ def test_save_extra_params(self, csv_dataset, save_args): def test_fs_extra_params(self, csv_dataset, fs_args): """Test overriding the default fs arguments.""" assert csv_dataset._fs_open_args_load == {"k1": "v1"} - assert csv_dataset._fs_open_args_save == {"index": "value"} + assert csv_dataset._fs_open_args_save == {"mode": "w", "index": "value"} @pytest.mark.parametrize( "load_args,save_args", diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index 0959adb73..38d1f0e31 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -66,7 +66,7 @@ def test_load_extra_params(self, feather_dataset, load_args): def test_fs_extra_params(self, feather_dataset, fs_args): """Test overriding the default fs arguments.""" assert feather_dataset._fs_open_args_load == {"k1": "v1"} - assert feather_dataset._fs_open_args_save == {"index": "value"} + assert feather_dataset._fs_open_args_save == {"index": "value", "mode": "wb"} @pytest.mark.parametrize( "load_args,save_args", diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index 2e2beacc5..20f0a1e21 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -91,7 +91,7 @@ def test_save_extra_params(self, json_dataset, save_args): def test_fs_extra_params(self, json_dataset, fs_args): """Test overriding the default fs arguments.""" assert json_dataset._fs_open_args_load == {"k1": "v1"} - assert json_dataset._fs_open_args_save == {"index": "value"} + assert json_dataset._fs_open_args_save == {"index": "value", "mode": "w"} @pytest.mark.parametrize( "load_args,save_args", From 46aef8d5794003328d70df29e8e80c11530a8101 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Aug 2024 16:16:43 +0100 Subject: [PATCH 15/18] Fix message dataset Signed-off-by: Merel Theisen --- kedro-datasets/kedro_datasets/email/message_dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index 15050901f..97c3b203f 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -8,6 +8,7 @@ from email.generator import Generator from email.message import Message from email.parser import Parser +from email.policy import default from pathlib import PurePosixPath from typing import Any @@ -134,7 +135,11 @@ def __init__( # noqa: PLR0913 # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} + self._parser_args = self._load_args.pop("parser", {"policy": default}) + self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._generator_args = self._save_args.pop("generator", {}) + self._fs_open_args_load = { **self.DEFAULT_FS_ARGS.get("open_args_load", {}), **(_fs_open_args_load or {}), From 2aa49a8e42a063cb85933e5b60041c916c138580 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Aug 2024 16:28:45 +0100 Subject: [PATCH 16/18] Fix tests Signed-off-by: Merel Theisen --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 6ac8d8b0b..330c8d10d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -94,7 +94,7 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() spark_json_ds = SparkDataset( - filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] + filepath=filepath, file_format="json", save_args={"mode": "overwrite"} ) spark_json_ds.save(sample_spark_streaming_df) @@ -115,7 +115,7 @@ def test_load_options_schema_path_with_credentials( schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() spark_json_ds = SparkDataset( - filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] + filepath=filepath, file_format="json", save_args={"mode": "overwrite"} ) spark_json_ds.save(sample_spark_streaming_df) @@ -144,7 +144,7 @@ def test_save(self, tmp_path, sample_spark_streaming_df): spark_json_ds = SparkDataset( filepath=filepath_json, file_format="json", - save_args=[{"mode", "overwrite"}], + save_args={"mode": "overwrite"}, ) spark_json_ds.save(sample_spark_streaming_df) From 9e7c363ab45f41d00b8b31d88f914162eae90dfa Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 21 Aug 2024 13:28:14 +0100 Subject: [PATCH 17/18] Clean up Signed-off-by: Merel Theisen --- .../kedro_datasets/plotly/plotly_dataset.py | 1 - .../kedro_datasets/polars/csv_dataset.py | 19 ++++++++++++------- .../polars/lazy_polars_dataset.py | 19 ++++++++++++------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 8874849ee..c15df71d7 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -133,7 +133,6 @@ def __init__( # noqa: PLR0913 _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) _fs_open_args_save = _fs_args.pop("open_args_save", {}) - _fs_open_args_save.setdefault("mode", "w") # Handle default fs arguments self._fs_open_args_load = { diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index f6dc2d3ab..44fd427af 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -111,12 +111,9 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -136,9 +133,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( diff --git a/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py b/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py index 652f08947..fc9733100 100644 --- a/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/lazy_polars_dataset.py @@ -143,12 +143,9 @@ def __init__( # noqa: PLR0913 "https://pola-rs.github.io/polars/py-polars/html/reference/io.html" ) - _fs_args = deepcopy(self.DEFAULT_FS_ARGS) - if fs_args is not None: - _fs_args.update(fs_args) - - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) - self._fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -168,9 +165,17 @@ def __init__( # noqa: PLR0913 glob_function=self._fs.glob, ) - # Handle default load and save arguments + # Handle default load and save and fs arguments self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})} self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})} + self._fs_open_args_load = { + **self.DEFAULT_FS_ARGS.get("open_args_load", {}), + **(_fs_open_args_load or {}), + } + self._fs_open_args_save = { + **self.DEFAULT_FS_ARGS.get("open_args_save", {}), + **(_fs_open_args_save or {}), + } if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( From 0fb3dac2c51e0bbb7e09198accb8cc13b2e5c207 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Thu, 22 Aug 2024 14:03:49 +0100 Subject: [PATCH 18/18] Update release notes Signed-off-by: Merel Theisen --- kedro-datasets/RELEASE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index a90176166..7d00a5865 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -2,6 +2,8 @@ ## Major features and improvements ## Bug fixes and other changes +* Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods. + ## Breaking Changes ## Community contributions