feat(datasets): Add kwargs for huggingface.HFDataset #580

Merged
1 change: 1 addition & 0 deletions kedro-airflow/kedro_airflow/plugin.py
@@ -1,4 +1,5 @@
""" Kedro plugin for running a project with Airflow """

from __future__ import annotations

from collections import defaultdict
1 change: 1 addition & 0 deletions kedro-airflow/tests/conftest.py
@@ -4,6 +4,7 @@
discover them automatically. More info here:
https://docs.pytest.org/en/latest/fixture.html
"""

from __future__ import annotations

import os
5 changes: 3 additions & 2 deletions kedro-datasets/RELEASE.md
@@ -9,12 +9,13 @@ pip install kedro-datasets[pandas-parquetdataset]

## Bug fixes and other changes
* If using MSSQL, `load_args:params` will be typecasted as tuple.
* Fixed bug with loading datasets from Hugging Face: `HFDataset` now allows passing parameters to the `load_dataset` function.

## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [Riley Brady](https://github.com/riley-brady)
* [Andrew Cao](https://github.com/andrewcao1)
* [Eduardo Romero Lopez](https://github.com/eromerobilbomatica)

# Release 2.1.0
## Major features and improvements
@@ -43,7 +44,7 @@ Many thanks to the following Kedroids for contributing PRs to this release:
## Bug fixes and other changes
* Fixed bug with loading models saved with `TensorFlowModelDataset`.
* Make dataset parameters keyword-only.
* Correct pandas-gbq as py311 dependency
* Correct pandas-gbq as py311 dependency.

## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/api/__init__.py
@@ -2,6 +2,7 @@
and returns them into either as string or json Dict.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
"""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
@@ -1,6 +1,7 @@
"""``APIDataset`` loads the data from HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
"""

import json as json_ # make pylint happy
from copy import deepcopy
from typing import Any, Union
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/biosequence/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to read/write from/to a sequence file."""

from typing import Any

import lazy_loader as lazy
@@ -1,6 +1,7 @@
"""BioSequenceDataset loads and saves data to/from bio-sequence objects to
file.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/dask/__init__.py
@@ -1,4 +1,5 @@
"""Provides I/O modules using dask dataframe."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/dask/parquet_dataset.py
@@ -1,5 +1,6 @@
"""``ParquetDataset`` is a data set used to load and save data to parquet files using Dask
dataframe"""

from copy import deepcopy
from typing import Any

1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/databricks/__init__.py
@@ -1,4 +1,5 @@
"""Provides interface to Unity Catalog Tables."""

from typing import Any

import lazy_loader as lazy
@@ -1,6 +1,7 @@
"""``ManagedTableDataset`` implementation to access managed delta tables
in Databricks.
"""

import logging
import re
from dataclasses import dataclass
@@ -188,7 +189,18 @@ class ManagedTableDataset(AbstractVersionedDataset):
... [StructField("name", StringType(), True), StructField("age", IntegerType(), True)]
... )
>>> data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)]
>>> spark_df = SparkSession.builder.config("spark.jars.packages", f"io.delta:delta-core_2.12:{DELTA_VERSION}").config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog",).getOrCreate().createDataFrame(data, schema)
>>> spark_df = (
... SparkSession.builder.config(
... "spark.jars.packages", f"io.delta:delta-core_2.12:{DELTA_VERSION}"
... )
... .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
... .config(
... "spark.sql.catalog.spark_catalog",
... "org.apache.spark.sql.delta.catalog.DeltaCatalog",
... )
... .getOrCreate()
... .createDataFrame(data, schema)
... )
>>> dataset = ManagedTableDataset(table="names_and_ages", write_mode="overwrite")
>>> dataset.save(spark_df)
>>> reloaded = dataset.load()
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/email/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementations for managing email messages."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/email/message_dataset.py
@@ -2,6 +2,7 @@
using an underlying filesystem (e.g.: local, S3, GCS). It uses the
``email`` package in the standard library to manage email messages.
"""

from copy import deepcopy
from email.generator import Generator
from email.message import Message
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/geopandas/__init__.py
@@ -1,4 +1,5 @@
"""``GeoJSONDataset`` is an ``AbstractVersionedDataset`` to save and load GeoJSON files."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py
@@ -2,6 +2,7 @@
underlying functionality is supported by geopandas, so it supports all
allowed geopandas (pandas) options for loading and saving geosjon files.
"""

import copy
from pathlib import PurePosixPath
from typing import Any, Union
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/holoviews/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to save Holoviews objects as image files."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/huggingface/__init__.py
@@ -1,4 +1,5 @@
"""Provides interface to Hugging Face transformers and datasets."""

from typing import Any

import lazy_loader as lazy
@@ -36,11 +36,17 @@ class HFDataset(AbstractVersionedDataset):

"""

def __init__(self, *, dataset_name: str):
def __init__(
self,
*,
dataset_name: str,
dataset_kwargs: dict[str, Any] | None = None,
):
self.dataset_name = dataset_name
self._dataset_kwargs = dataset_kwargs or {}

def _load(self):
return load_dataset(self.dataset_name)
return load_dataset(self.dataset_name, **self._dataset_kwargs)

def _save(self):
raise NotImplementedError("Not yet implemented")
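
For context, here is a minimal usage sketch of the new parameter; the "glue" dataset and "mrpc" config names are purely illustrative, not taken from this PR. As the diff above shows, dataset_kwargs is forwarded unchanged to datasets.load_dataset:

from kedro_datasets.huggingface import HFDataset

# Any keyword argument accepted by datasets.load_dataset() can be supplied
# via dataset_kwargs, e.g. a configuration name, split, or revision.
dataset = HFDataset(
    dataset_name="glue",
    dataset_kwargs={"name": "mrpc"},  # illustrative config name
)
mrpc = dataset.load()  # equivalent to load_dataset("glue", name="mrpc")
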
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/json/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to load/save data from/to a JSON file."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/json/json_dataset.py
@@ -1,6 +1,7 @@
"""``JSONDataset`` loads/saves data from/to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file.
"""

import json
from copy import deepcopy
from pathlib import PurePosixPath
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/matlab/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to load/save data from/to a Matlab file."""

from __future__ import annotations

from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/matlab/matlab_dataset.py
@@ -3,6 +3,7 @@
the specified backend library passed in (defaults to the ``matlab`` library), so it
supports all allowed options for loading and saving matlab files.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/matplotlib/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to save matplotlib objects as image files."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/netcdf/__init__.py
@@ -1,4 +1,5 @@
"""``NetCDFDataset`` is an ``AbstractDataset`` to save and load NetCDF files."""

from __future__ import annotations

from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/netcdf/netcdf_dataset.py
@@ -1,4 +1,5 @@
"""NetCDFDataset loads and saves data to a local netcdf (.nc) file."""

import logging
from copy import deepcopy
from glob import glob
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/networkx/__init__.py
@@ -1,5 +1,6 @@
"""``AbstractDataset`` implementation to save and load graphs in JSON,
GraphML and GML formats using NetworkX."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/networkx/gml_dataset.py
@@ -2,6 +2,7 @@
file using an underlying filesystem (e.g.: local, S3, GCS). NetworkX is used to
create GML data.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/networkx/graphml_dataset.py
@@ -1,6 +1,7 @@
"""NetworkX ``GraphMLDataset`` loads and saves graphs to a GraphML file using an underlying
filesystem (e.g.: local, S3, GCS). NetworkX is used to create GraphML data.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/networkx/json_dataset.py
@@ -1,6 +1,7 @@
"""``JSONDataset`` loads and saves graphs to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). NetworkX is used to create JSON data.
"""

import json
from copy import deepcopy
from pathlib import PurePosixPath
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementations that produce pandas DataFrames."""

from typing import Any

import lazy_loader as lazy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/csv_dataset.py
@@ -1,6 +1,7 @@
"""``CSVDataset`` loads/saves data from/to a CSV file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the CSV file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py
@@ -2,6 +2,7 @@
S3, GCS), Databricks unity catalog and AWS Glue catalog respectively. It handles
load and save using a pandas dataframe.
"""

from copy import deepcopy
from typing import Any, Optional

1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/excel_dataset.py
@@ -1,6 +1,7 @@
"""``ExcelDataset`` loads/saves data from/to a Excel file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Excel file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/feather_dataset.py
@@ -2,6 +2,7 @@
using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality
is supported by pandas, so it supports all operations the pandas supports.
"""

import logging
from copy import deepcopy
from io import BytesIO
5 changes: 4 additions & 1 deletion kedro-datasets/kedro_datasets/pandas/gbq_dataset.py
@@ -1,6 +1,7 @@
"""``GBQTableDataset`` loads and saves data from/to Google BigQuery. It uses pandas-gbq
to read and write from/to BigQuery table.
"""

import copy
from pathlib import PurePosixPath
from typing import Any, NoReturn, Union
@@ -51,7 +52,9 @@ class GBQTableDataset(AbstractDataset[None, pd.DataFrame]):
>>>
>>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})
>>>
>>> dataset = GBQTableDataset(dataset="dataset", table_name="table_name", project="my-project")
>>> dataset = GBQTableDataset(
... dataset="dataset", table_name="table_name", project="my-project"
... )
>>> dataset.save(data)
>>> reloaded = dataset.load()
>>>
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/generic_dataset.py
@@ -2,6 +2,7 @@
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the
type of read/write target.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/hdf_dataset.py
@@ -1,6 +1,7 @@
"""``HDFDataset`` loads/saves data from/to a hdf file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas.HDFStore to handle the hdf file.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from threading import Lock
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/json_dataset.py
@@ -1,6 +1,7 @@
"""``JSONDataset`` loads/saves data from/to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the JSON file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/parquet_dataset.py
@@ -1,6 +1,7 @@
"""``ParquetDataset`` loads/saves data from/to a Parquet file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/sql_dataset.py
@@ -1,4 +1,5 @@
"""``SQLDataset`` to load and save data to a SQL backend."""

from __future__ import annotations

import copy
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pandas/xml_dataset.py
@@ -1,6 +1,7 @@
"""``XMLDataset`` loads/saves data from/to a XML file using an underlying
filesystem (e.g.: local, S3, GCS). It uses pandas to handle the XML file.
"""

import logging
from copy import deepcopy
from io import BytesIO
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/partitions/__init__.py
@@ -1,6 +1,7 @@
"""``AbstractDataset`` implementations to load/save data in partitions
from/to any underlying dataset format.
"""

from typing import Any

import lazy_loader as lazy
@@ -5,6 +5,7 @@
of the data partitions by default, so that subsequent pipeline run loads only
new partitions past the checkpoint.It also uses `fsspec` for filesystem level operations.
"""

from __future__ import annotations

import operator
@@ -44,7 +45,9 @@ class IncrementalDataset(PartitionedDataset):

>>> from kedro_datasets.partitions import IncrementalDataset
>>>
>>> dataset = IncrementalDataset(path=str(tmp_path/ "test_data"), dataset="pandas.CSVDataset")
>>> dataset = IncrementalDataset(
... path=str(tmp_path / "test_data"), dataset="pandas.CSVDataset"
... )
>>> loaded = dataset.load() # loads all available partitions
>>> # assert isinstance(loaded, dict)
>>>
@@ -1,6 +1,7 @@
"""``PartitionedDataset`` loads and saves partitioned file-like data using the
underlying dataset definition. It also uses `fsspec` for filesystem level operations.
"""

from __future__ import annotations

import operator
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/pickle/__init__.py
@@ -1,4 +1,5 @@
"""``AbstractDataset`` implementation to load/save data from/to a Pickle file."""

from typing import Any

import lazy_loader as lazy