feat(datasets): Add kwargs for huggingface.HFDataset #580

Merged
1 change: 1 addition & 0 deletions kedro-airflow/kedro_airflow/plugin.py
@@ -1,4 +1,5 @@
""" Kedro plugin for running a project with Airflow """

from __future__ import annotations

from collections import defaultdict
1 change: 1 addition & 0 deletions kedro-airflow/tests/conftest.py
@@ -4,6 +4,7 @@
discover them automatically. More info here:
https://docs.pytest.org/en/latest/fixture.html
"""

from __future__ import annotations

import os
5 changes: 3 additions & 2 deletions kedro-datasets/RELEASE.md
@@ -9,12 +9,13 @@ pip install kedro-datasets[pandas-parquetdataset]

## Bug fixes and other changes
* If using MSSQL, `load_args:params` will be typecasted as tuple.
* Fixed bug with loading datasets from Hugging Face: `HFDataset` now allows passing parameters to the `load_dataset` function.

## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [Riley Brady](https://github.com/riley-brady)
* [Andrew Cao](https://github.com/andrewcao1)
* [Eduardo Romero Lopez](https://github.com/eromerobilbomatica)

# Release 2.1.0
## Major features and improvements
@@ -43,7 +44,7 @@ Many thanks to the following Kedroids for contributing PRs to this release:
## Bug fixes and other changes
* Fixed bug with loading models saved with `TensorFlowModelDataset`.
* Make dataset parameters keyword-only.
* Correct pandas-gbq as py311 dependency
* Correct pandas-gbq as py311 dependency.

## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/api/__init__.py
@@ -2,6 +2,7 @@
and returns them into either as string or json Dict.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
"""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
@@ -1,6 +1,7 @@
"""``APIDataset`` loads the data from HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
"""

import json as json_ # make pylint happy
from copy import deepcopy
from typing import Any, Union
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/biosequence/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to read/write from/to a sequence file."""

from typing import Any

import lazy_loader as lazy
@@ -1,6 +1,7 @@
"""BioSequenceDataset loads and saves data to/from bio-sequence objects to
file.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/dask/__init__.py
@@ -1,4 +1,5 @@
"""Provides I/O modules using dask dataframe."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/dask/parquet_dataset.py
@@ -1,5 +1,6 @@
"""``ParquetDataset`` is a data set used to load and save data to parquet files using Dask
dataframe"""

from copy import deepcopy
from typing import Any

1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/databricks/__init__.py
@@ -1,4 +1,5 @@
"""Provides interface to Unity Catalog Tables."""

from typing import Any

import lazy_loader as lazy
@@ -1,6 +1,7 @@
"""``ManagedTableDataset`` implementation to access managed delta tables
in Databricks.
"""

import logging
import re
from dataclasses import dataclass
@@ -188,7 +189,18 @@ class ManagedTableDataset(AbstractVersionedDataset):
... [StructField("name", StringType(), True), StructField("age", IntegerType(), True)]
... )
>>> data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)]
>>> spark_df = SparkSession.builder.config("spark.jars.packages", f"io.delta:delta-core_2.12:{DELTA_VERSION}").config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog",).getOrCreate().createDataFrame(data, schema)
>>> spark_df = (
... SparkSession.builder.config(
... "spark.jars.packages", f"io.delta:delta-core_2.12:{DELTA_VERSION}"
... )
... .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
... .config(
... "spark.sql.catalog.spark_catalog",
... "org.apache.spark.sql.delta.catalog.DeltaCatalog",
... )
... .getOrCreate()
... .createDataFrame(data, schema)
... )
>>> dataset = ManagedTableDataset(table="names_and_ages", write_mode="overwrite")
>>> dataset.save(spark_df)
>>> reloaded = dataset.load()
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/email/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementations for managing email messages."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/email/message_dataset.py
@@ -2,6 +2,7 @@
using an underlying filesystem (e.g.: local, S3, GCS). It uses the
``email`` package in the standard library to manage email messages.
"""

from copy import deepcopy
from email.generator import Generator
from email.message import Message
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/geopandas/__init__.py
@@ -1,4 +1,5 @@
"""``GeoJSONDataset`` is an ``AbstractVersionedDataset`` to save and load GeoJSON files."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py
@@ -2,6 +2,7 @@
underlying functionality is supported by geopandas, so it supports all
allowed geopandas (pandas) options for loading and saving geosjon files.
"""

import copy
from pathlib import PurePosixPath
from typing import Any, Union
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/holoviews/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to save Holoviews objects as image files."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/huggingface/__init__.py
@@ -1,4 +1,5 @@
"""Provides interface to Hugging Face transformers and datasets."""

from typing import Any

import lazy_loader as lazy
@@ -36,11 +36,17 @@ class HFDataset(AbstractVersionedDataset):

"""

def __init__(self, *, dataset_name: str):
def __init__(
self,
*,
dataset_name: str,
dataset_kwargs: dict[str, Any] | None = None,
):
self.dataset_name = dataset_name
self._dataset_kwargs = dataset_kwargs or {}

def _load(self):
return load_dataset(self.dataset_name)
return load_dataset(self.dataset_name, **self._dataset_kwargs)

def _save(self):
raise NotImplementedError("Not yet implemented")
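
For context, here is a minimal usage sketch of the new parameter; the "glue" dataset and "mrpc" config names are purely illustrative, not taken from this PR. As the diff above shows, dataset_kwargs is forwarded unchanged to datasets.load_dataset:

from kedro_datasets.huggingface import HFDataset

# Any keyword argument accepted by datasets.load_dataset() can be supplied
# via dataset_kwargs, e.g. a configuration name, split, or revision.
dataset = HFDataset(
    dataset_name="glue",
    dataset_kwargs={"name": "mrpc"},  # illustrative config name
)
mrpc = dataset.load()  # equivalent to load_dataset("glue", name="mrpc")
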
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/json/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to load/save data from/to a JSON file."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/json/json_dataset.py
@@ -1,6 +1,7 @@
"""``JSONDataset`` loads/saves data from/to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file.
"""

import json
from copy import deepcopy
from pathlib import PurePosixPath
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/matlab/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to load/save data from/to a Matlab file."""

from __future__ import annotations

from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/matlab/matlab_dataset.py
@@ -3,6 +3,7 @@
the specified backend library passed in (defaults to the ``matlab`` library), so it
supports all allowed options for loading and saving matlab files.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/matplotlib/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to save matplotlib objects as image files."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/netcdf/__init__.py
@@ -1,4 +1,5 @@
"""``NetCDFDataset`` is an ``AbstractDataset`` to save and load NetCDF files."""

from __future__ import annotations

from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/netcdf/netcdf_dataset.py
@@ -1,4 +1,5 @@
"""NetCDFDataset loads and saves data to a local netcdf (.nc) file."""

import logging
from copy import deepcopy
from glob import glob
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/networkx/__init__.py
@@ -1,5 +1,6 @@
"""``AbstractDataset`` implementation to save and load graphs in JSON,
GraphML and GML formats using NetworkX."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/networkx/gml_dataset.py
@@ -2,6 +2,7 @@
file using an underlying filesystem (e.g.: local, S3, GCS). NetworkX is used to
create GML data.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/networkx/graphml_dataset.py
@@ -1,6 +1,7 @@
"""NetworkX ``GraphMLDataset`` loads and saves graphs to a GraphML file using an underlying
filesystem (e.g.: local, S3, GCS). NetworkX is used to create GraphML data.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/networkx/json_dataset.py
@@ -1,6 +1,7 @@
"""``JSONDataset`` loads and saves graphs to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). NetworkX is used to create JSON data.
"""

import json
from copy import deepcopy
from pathlib import PurePosixPath
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementations that produce pandas DataFrames."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/csv_dataset.py
@@ -1,6 +1,7 @@
"""``CSVDataset`` loads/saves data from/to a CSV file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the CSV file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py
@@ -2,6 +2,7 @@
S3, GCS), Databricks unity catalog and AWS Glue catalog respectively. It handles
load and save using a pandas dataframe.
"""

from copy import deepcopy
from typing import Any, Optional

1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/excel_dataset.py
@@ -1,6 +1,7 @@
"""``ExcelDataset`` loads/saves data from/to a Excel file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Excel file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/feather_dataset.py
@@ -2,6 +2,7 @@
using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality
is supported by pandas, so it supports all operations the pandas supports.
"""

import logging
from copy import deepcopy
from io import BytesIO
5 changes: 4 additions & 1 deletion kedro-datasets/kedro_datasets/pandas/gbq_dataset.py
@@ -1,6 +1,7 @@
"""``GBQTableDataset`` loads and saves data from/to Google BigQuery. It uses pandas-gbq
to read and write from/to BigQuery table.
"""

import copy
from pathlib import PurePosixPath
from typing import Any, NoReturn, Union
@@ -51,7 +52,9 @@ class GBQTableDataset(AbstractDataset[None, pd.DataFrame]):
>>>
>>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})
>>>
>>> dataset = GBQTableDataset(dataset="dataset", table_name="table_name", project="my-project")
>>> dataset = GBQTableDataset(
... dataset="dataset", table_name="table_name", project="my-project"
... )
>>> dataset.save(data)
>>> reloaded = dataset.load()
>>>
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/generic_dataset.py
@@ -2,6 +2,7 @@
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the
type of read/write target.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/hdf_dataset.py
@@ -1,6 +1,7 @@
"""``HDFDataset`` loads/saves data from/to a hdf file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas.HDFStore to handle the hdf file.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from threading import Lock
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/json_dataset.py
@@ -1,6 +1,7 @@
"""``JSONDataset`` loads/saves data from/to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the JSON file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/parquet_dataset.py
@@ -1,6 +1,7 @@
"""``ParquetDataset`` loads/saves data from/to a Parquet file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/sql_dataset.py
@@ -1,4 +1,5 @@
"""``SQLDataset`` to load and save data to a SQL backend."""

from __future__ import annotations

import copy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/xml_dataset.py
@@ -1,6 +1,7 @@
"""``XMLDataset`` loads/saves data from/to a XML file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the XML file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/partitions/__init__.py
@@ -1,6 +1,7 @@
"""``AbstractDataset`` implementations to load/save data in partitions
from/to any underlying dataset format.
"""

from typing import Any

import lazy_loader as lazy
@@ -5,6 +5,7 @@
of the data partitions by default, so that subsequent pipeline run loads only
new partitions past the checkpoint.It also uses `fsspec` for filesystem level operations.
"""

from __future__ import annotations

import operator
@@ -44,7 +45,9 @@ class IncrementalDataset(PartitionedDataset):

>>> from kedro_datasets.partitions import IncrementalDataset
>>>
>>> dataset = IncrementalDataset(path=str(tmp_path/ "test_data"), dataset="pandas.CSVDataset")
>>> dataset = IncrementalDataset(
... path=str(tmp_path / "test_data"), dataset="pandas.CSVDataset"
... )
>>> loaded = dataset.load() # loads all available partitions
>>> # assert isinstance(loaded, dict)
>>>
@@ -1,6 +1,7 @@
"""``PartitionedDataset`` loads and saves partitioned file-like data using the
underlying dataset definition. It also uses `fsspec` for filesystem level operations.
"""

from __future__ import annotations

import operator
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pickle/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to load/save data from/to a Pickle file."""

from typing import Any

import lazy_loader as lazy