From ffaa0dc71fb6ed6d01ba7866bd20777646097017 Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Sun, 15 Sep 2024 08:58:40 -0400 Subject: [PATCH] Support fsspec.AbstractFileSystem (#88) * supported_filesystem * fixes * remove unused import --------- Co-authored-by: ZhengYu, Xu --- pyarrow-stubs/_dataset.pyi | 14 +++++++------- pyarrow-stubs/_dataset_parquet.pyi | 6 +++--- pyarrow-stubs/_fs.pyi | 6 +++++- pyarrow-stubs/dataset.pyi | 12 ++++++------ pyarrow-stubs/fs.pyi | 9 +++++---- pyarrow-stubs/orc.pyi | 4 ++-- pyarrow-stubs/parquet/core.pyi | 20 ++++++++++---------- 7 files changed, 38 insertions(+), 33 deletions(-) diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index d91a48d..fef3cbe 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -13,7 +13,7 @@ from typing import ( ) from . import _csv, _json, _parquet, lib -from ._fs import FileSelector, FileSystem +from ._fs import FileSelector, FileSystem, SupportedFileSystem from ._stubs_typing import Indices, JoinType, Order from .acero import ExecNodeOptions from .compute import Expression @@ -129,7 +129,7 @@ class FileSystemDataset(Dataset): fragments: list[Fragment], schema: lib.Schema, format: FileFormat, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, root_partition: Expression | None = None, ) -> None: ... @classmethod @@ -138,7 +138,7 @@ class FileSystemDataset(Dataset): paths: list[str], schema: lib.Schema | None = None, format: FileFormat | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partitions: list[Expression] | None = None, root_partition: Expression | None = None, ) -> FileSystemDataset: ... @@ -157,12 +157,12 @@ class FileWriteOptions(lib._Weakrefable): class FileFormat(lib._Weakrefable): def inspect( - self, file: str | Path | IO, filesystem: FileSystem | None = None + self, file: str | Path | IO, filesystem: SupportedFileSystem | None = None ) -> lib.Schema: ... def make_fragment( self, file: str | Path | IO, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partition_expression: Expression | None = None, *, file_size: int | None = None, @@ -402,7 +402,7 @@ class FileSystemFactoryOptions(lib._Weakrefable): class FileSystemDatasetFactory(DatasetFactory): def __init__( self, - filesystem: FileSystem, + filesystem: SupportedFileSystem, paths_or_selector: FileSelector, format: FileFormat, options: FileSystemFactoryOptions | None = None, @@ -503,7 +503,7 @@ def _filesystemdataset_write( data: Scanner, base_dir: str | Path, basename_template: str, - filesystem: FileSystem, + filesystem: SupportedFileSystem, partitioning: Partitioning, file_options: FileWriteOptions, max_partitions: int, diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index ce1a840..f5b2c93 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -14,7 +14,7 @@ from ._dataset import ( PartitioningFactory, ) from ._dataset_parquet_encryption import ParquetDecryptionConfig -from ._fs import FileSystem +from ._fs import SupportedFileSystem from ._parquet import FileDecryptionProperties, FileMetaData from .lib import CacheOptions, Schema, _Weakrefable @@ -36,7 +36,7 @@ class ParquetFileFormat(FileFormat): def make_fragment( self, file: IO | Path | str, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partition_expression: Expression | None = None, row_groups: Iterable[int] | None = None, *, @@ -118,7 +118,7 @@ class ParquetDatasetFactory(DatasetFactory): def __init__( self, metadata_path: str, - filesystem: FileSystem, + filesystem: SupportedFileSystem, format: FileFormat, options: ParquetFactoryOptions | None = None, ) -> None: ... diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 3f87e3f..4725b20 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -2,10 +2,14 @@ import datetime as dt import enum from abc import ABC, abstractmethod -from typing import Self, overload +from typing import Self, TypeAlias, Union, overload + +from fsspec import AbstractFileSystem from .lib import NativeFile, _Weakrefable +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 1fcc436..3473fe4 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -50,7 +50,7 @@ from pyarrow._dataset_parquet_encryption import ( from pyarrow.compute import Expression, field, scalar from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table -from ._fs import FileSystem +from ._fs import SupportedFileSystem _orc_available: bool _parquet_available: bool @@ -153,7 +153,7 @@ def partitioning( def parquet_dataset( metadata_path: str | Path, schema: Schema | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, format: ParquetFileFormat | None = None, partitioning: Partitioning | PartitioningFactory | None = None, partition_base_dir: str | None = None, @@ -163,7 +163,7 @@ def dataset( source: str | list[str] | Path | list[Path], schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -174,7 +174,7 @@ def dataset( source: list[Dataset], schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -185,7 +185,7 @@ def dataset( source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -200,7 +200,7 @@ def write_dataset( partitioning: Partitioning | list[str] | None = None, partitioning_flavor: str | None = None, schema: Schema | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, file_options: FileWriteOptions | None = None, use_threads: bool = True, max_partitions: int = 1024, diff --git a/pyarrow-stubs/fs.pyi b/pyarrow-stubs/fs.pyi index a185cb6..6bf7561 100644 --- a/pyarrow-stubs/fs.pyi +++ b/pyarrow-stubs/fs.pyi @@ -8,6 +8,7 @@ from pyarrow._fs import ( # noqa _MockFileSystem, FileSystemHandler, PyFileSystem, + SupportedFileSystem, ) from pyarrow._azurefs import AzureFileSystem from pyarrow._hdfs import HadoopFileSystem @@ -30,16 +31,16 @@ FileStats = FileInfo def copy_files( source: str, destination: str, - source_filesystem: FileSystem | None = None, - destination_filesystem: FileSystem | None = None, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, *, chunk_size: int = 1024 * 1024, use_threads: bool = True, ) -> None: ... class FSSpecHandler(FileSystemHandler): # type: ignore[misc] - fs: FileSystem - def __init__(self, fs: FileSystem) -> None: ... + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... __all__ = [ # _fs diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index 1b2d277..c0104f1 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -1,7 +1,7 @@ from typing import IO, Literal, Self from . import _orc -from ._fs import FileSystem +from ._fs import SupportedFileSystem from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table class ORCFile: @@ -71,7 +71,7 @@ class ORCWriter: def read_table( source: str | NativeFile | IO, columns: list[str] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> Table: ... def write_table( table: Table, diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 1611c82..2285bc5 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -3,7 +3,7 @@ from typing import IO, Callable, Iterator, Literal, Self, Sequence from pyarrow import _parquet from pyarrow._compute import Expression -from pyarrow._fs import FileSystem +from pyarrow._fs import FileSystem, SupportedFileSystem from pyarrow._parquet import ( ColumnChunkMetaData, ColumnSchema, @@ -70,7 +70,7 @@ class ParquetFile: decryption_properties: FileDecryptionProperties | None = None, thrift_string_size_limit: int | None = None, thrift_container_size_limit: int | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, page_checksum_verification: bool = False, ): ... def __enter__(self) -> Self: ... @@ -129,7 +129,7 @@ class ParquetWriter: self, where: str | Path | IO, schema: Schema, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, flavor: str | None = None, version: Literal["1.0", "2.4", "2.6"] = ..., use_dictionary: bool = True, @@ -166,7 +166,7 @@ class ParquetDataset: def __init__( self, path_or_paths: str | list[str], - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, schema: Schema | None = None, *, filters: Expression | FilterTuple | list[FilterTuple] | None = None, @@ -213,7 +213,7 @@ def read_table( memory_map: bool = False, buffer_size: int = 0, partitioning: str | list[str] | Partitioning = "hive", - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, filters: Expression | FilterTuple | list[FilterTuple] | None = None, use_legacy_dataset: bool | None = None, ignore_prefixes: list[str] | None = None, @@ -240,7 +240,7 @@ def write_table( allow_truncated_timestamps: bool = False, data_page_size: int | None = None, flavor: str | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, compression_level: int | dict | None = None, use_byte_stream_split: bool = False, column_encoding: str | dict | None = None, @@ -260,7 +260,7 @@ def write_to_dataset( table: Table, root_path: str | Path, partition_cols: list[str] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, use_legacy_dataset: bool | None = None, schema: Schema | None = None, partitioning: Partitioning | list[str] | None = None, @@ -275,18 +275,18 @@ def write_metadata( schema: Schema, where: str | NativeFile, metadata_collector: list[FileMetaData] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, **kwargs, ) -> None: ... def read_metadata( where: str | Path | IO, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> FileMetaData: ... def read_schema( where: str | Path | IO, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> FileMetaData: ...