From e53370f8ddfee5613540c8fb84221af9584b9bc4 Mon Sep 17 00:00:00 2001 From: Dean MacGregor Date: Thu, 12 Sep 2024 14:37:29 -0400 Subject: [PATCH 1/3] supported_filesystem --- pyarrow-stubs/_dataset.pyi | 14 +++++++------- pyarrow-stubs/_dataset_parquet.pyi | 6 +++--- pyarrow-stubs/_fs.pyi | 6 +++++- pyarrow-stubs/dataset.pyi | 12 ++++++------ pyarrow-stubs/fs.pyi | 9 +++++---- pyarrow-stubs/orc.pyi | 2 +- pyarrow-stubs/parquet/core.pyi | 20 ++++++++++---------- 7 files changed, 37 insertions(+), 32 deletions(-) diff --git a/pyarrow-stubs/_dataset.pyi b/pyarrow-stubs/_dataset.pyi index d91a48d..fef3cbe 100644 --- a/pyarrow-stubs/_dataset.pyi +++ b/pyarrow-stubs/_dataset.pyi @@ -13,7 +13,7 @@ from typing import ( ) from . import _csv, _json, _parquet, lib -from ._fs import FileSelector, FileSystem +from ._fs import FileSelector, FileSystem, SupportedFileSystem from ._stubs_typing import Indices, JoinType, Order from .acero import ExecNodeOptions from .compute import Expression @@ -129,7 +129,7 @@ class FileSystemDataset(Dataset): fragments: list[Fragment], schema: lib.Schema, format: FileFormat, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, root_partition: Expression | None = None, ) -> None: ... @classmethod @@ -138,7 +138,7 @@ class FileSystemDataset(Dataset): paths: list[str], schema: lib.Schema | None = None, format: FileFormat | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partitions: list[Expression] | None = None, root_partition: Expression | None = None, ) -> FileSystemDataset: ... @@ -157,12 +157,12 @@ class FileWriteOptions(lib._Weakrefable): class FileFormat(lib._Weakrefable): def inspect( - self, file: str | Path | IO, filesystem: FileSystem | None = None + self, file: str | Path | IO, filesystem: SupportedFileSystem | None = None ) -> lib.Schema: ... def make_fragment( self, file: str | Path | IO, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partition_expression: Expression | None = None, *, file_size: int | None = None, @@ -402,7 +402,7 @@ class FileSystemFactoryOptions(lib._Weakrefable): class FileSystemDatasetFactory(DatasetFactory): def __init__( self, - filesystem: FileSystem, + filesystem: SupportedFileSystem, paths_or_selector: FileSelector, format: FileFormat, options: FileSystemFactoryOptions | None = None, @@ -503,7 +503,7 @@ def _filesystemdataset_write( data: Scanner, base_dir: str | Path, basename_template: str, - filesystem: FileSystem, + filesystem: SupportedFileSystem, partitioning: Partitioning, file_options: FileWriteOptions, max_partitions: int, diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index ce1a840..345ef12 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -14,7 +14,7 @@ from ._dataset import ( PartitioningFactory, ) from ._dataset_parquet_encryption import ParquetDecryptionConfig -from ._fs import FileSystem +from ._fs import FileSystem, SupportedFileSystem from ._parquet import FileDecryptionProperties, FileMetaData from .lib import CacheOptions, Schema, _Weakrefable @@ -36,7 +36,7 @@ class ParquetFileFormat(FileFormat): def make_fragment( self, file: IO | Path | str, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, partition_expression: Expression | None = None, row_groups: Iterable[int] | None = None, *, @@ -118,7 +118,7 @@ class ParquetDatasetFactory(DatasetFactory): def __init__( self, metadata_path: str, - filesystem: FileSystem, + filesystem: SupportedFileSystem, format: FileFormat, options: ParquetFactoryOptions | None = None, ) -> None: ... diff --git a/pyarrow-stubs/_fs.pyi b/pyarrow-stubs/_fs.pyi index 3f87e3f..4725b20 100644 --- a/pyarrow-stubs/_fs.pyi +++ b/pyarrow-stubs/_fs.pyi @@ -2,10 +2,14 @@ import datetime as dt import enum from abc import ABC, abstractmethod -from typing import Self, overload +from typing import Self, TypeAlias, Union, overload + +from fsspec import AbstractFileSystem from .lib import NativeFile, _Weakrefable +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() diff --git a/pyarrow-stubs/dataset.pyi b/pyarrow-stubs/dataset.pyi index 1fcc436..3473fe4 100644 --- a/pyarrow-stubs/dataset.pyi +++ b/pyarrow-stubs/dataset.pyi @@ -50,7 +50,7 @@ from pyarrow._dataset_parquet_encryption import ( from pyarrow.compute import Expression, field, scalar from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table -from ._fs import FileSystem +from ._fs import SupportedFileSystem _orc_available: bool _parquet_available: bool @@ -153,7 +153,7 @@ def partitioning( def parquet_dataset( metadata_path: str | Path, schema: Schema | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, format: ParquetFileFormat | None = None, partitioning: Partitioning | PartitioningFactory | None = None, partition_base_dir: str | None = None, @@ -163,7 +163,7 @@ def dataset( source: str | list[str] | Path | list[Path], schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -174,7 +174,7 @@ def dataset( source: list[Dataset], schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -185,7 +185,7 @@ def dataset( source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, schema: Schema | None = None, format: FileFormat | _DatasetFormat | None = None, - filesystem: FileSystem | str | None = None, + filesystem: SupportedFileSystem | str | None = None, partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, partition_base_dir: str | None = None, exclude_invalid_files: bool | None = None, @@ -200,7 +200,7 @@ def write_dataset( partitioning: Partitioning | list[str] | None = None, partitioning_flavor: str | None = None, schema: Schema | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, file_options: FileWriteOptions | None = None, use_threads: bool = True, max_partitions: int = 1024, diff --git a/pyarrow-stubs/fs.pyi b/pyarrow-stubs/fs.pyi index a185cb6..6bf7561 100644 --- a/pyarrow-stubs/fs.pyi +++ b/pyarrow-stubs/fs.pyi @@ -8,6 +8,7 @@ from pyarrow._fs import ( # noqa _MockFileSystem, FileSystemHandler, PyFileSystem, + SupportedFileSystem, ) from pyarrow._azurefs import AzureFileSystem from pyarrow._hdfs import HadoopFileSystem @@ -30,16 +31,16 @@ FileStats = FileInfo def copy_files( source: str, destination: str, - source_filesystem: FileSystem | None = None, - destination_filesystem: FileSystem | None = None, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, *, chunk_size: int = 1024 * 1024, use_threads: bool = True, ) -> None: ... class FSSpecHandler(FileSystemHandler): # type: ignore[misc] - fs: FileSystem - def __init__(self, fs: FileSystem) -> None: ... + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... __all__ = [ # _fs diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index 1b2d277..fc370ba 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -71,7 +71,7 @@ class ORCWriter: def read_table( source: str | NativeFile | IO, columns: list[str] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> Table: ... def write_table( table: Table, diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index 1611c82..ff92cf9 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -3,7 +3,7 @@ from typing import IO, Callable, Iterator, Literal, Self, Sequence from pyarrow import _parquet from pyarrow._compute import Expression -from pyarrow._fs import FileSystem +from pyarrow._fs import SupportedFileSystem from pyarrow._parquet import ( ColumnChunkMetaData, ColumnSchema, @@ -70,7 +70,7 @@ class ParquetFile: decryption_properties: FileDecryptionProperties | None = None, thrift_string_size_limit: int | None = None, thrift_container_size_limit: int | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, page_checksum_verification: bool = False, ): ... def __enter__(self) -> Self: ... @@ -129,7 +129,7 @@ class ParquetWriter: self, where: str | Path | IO, schema: Schema, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, flavor: str | None = None, version: Literal["1.0", "2.4", "2.6"] = ..., use_dictionary: bool = True, @@ -166,7 +166,7 @@ class ParquetDataset: def __init__( self, path_or_paths: str | list[str], - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, schema: Schema | None = None, *, filters: Expression | FilterTuple | list[FilterTuple] | None = None, @@ -213,7 +213,7 @@ def read_table( memory_map: bool = False, buffer_size: int = 0, partitioning: str | list[str] | Partitioning = "hive", - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, filters: Expression | FilterTuple | list[FilterTuple] | None = None, use_legacy_dataset: bool | None = None, ignore_prefixes: list[str] | None = None, @@ -240,7 +240,7 @@ def write_table( allow_truncated_timestamps: bool = False, data_page_size: int | None = None, flavor: str | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, compression_level: int | dict | None = None, use_byte_stream_split: bool = False, column_encoding: str | dict | None = None, @@ -260,7 +260,7 @@ def write_to_dataset( table: Table, root_path: str | Path, partition_cols: list[str] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, use_legacy_dataset: bool | None = None, schema: Schema | None = None, partitioning: Partitioning | list[str] | None = None, @@ -275,18 +275,18 @@ def write_metadata( schema: Schema, where: str | NativeFile, metadata_collector: list[FileMetaData] | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, **kwargs, ) -> None: ... def read_metadata( where: str | Path | IO, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> FileMetaData: ... def read_schema( where: str | Path | IO, memory_map: bool = False, decryption_properties: FileDecryptionProperties | None = None, - filesystem: FileSystem | None = None, + filesystem: SupportedFileSystem | None = None, ) -> FileMetaData: ... From efe0d142825593f6a1718870ac1f4593c4577405 Mon Sep 17 00:00:00 2001 From: Dean MacGregor Date: Thu, 12 Sep 2024 14:43:21 -0400 Subject: [PATCH 2/3] fixes --- pyarrow-stubs/orc.pyi | 2 +- pyarrow-stubs/parquet/core.pyi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyarrow-stubs/orc.pyi b/pyarrow-stubs/orc.pyi index fc370ba..c0104f1 100644 --- a/pyarrow-stubs/orc.pyi +++ b/pyarrow-stubs/orc.pyi @@ -1,7 +1,7 @@ from typing import IO, Literal, Self from . import _orc -from ._fs import FileSystem +from ._fs import SupportedFileSystem from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table class ORCFile: diff --git a/pyarrow-stubs/parquet/core.pyi b/pyarrow-stubs/parquet/core.pyi index ff92cf9..2285bc5 100644 --- a/pyarrow-stubs/parquet/core.pyi +++ b/pyarrow-stubs/parquet/core.pyi @@ -3,7 +3,7 @@ from typing import IO, Callable, Iterator, Literal, Self, Sequence from pyarrow import _parquet from pyarrow._compute import Expression -from pyarrow._fs import SupportedFileSystem +from pyarrow._fs import FileSystem, SupportedFileSystem from pyarrow._parquet import ( ColumnChunkMetaData, ColumnSchema, From 64dec6209fa6a83a557433604f2adbb87714d8a0 Mon Sep 17 00:00:00 2001 From: "ZhengYu, Xu" Date: Sun, 15 Sep 2024 20:56:56 +0800 Subject: [PATCH 3/3] remove unused import --- pyarrow-stubs/_dataset_parquet.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyarrow-stubs/_dataset_parquet.pyi b/pyarrow-stubs/_dataset_parquet.pyi index 345ef12..f5b2c93 100644 --- a/pyarrow-stubs/_dataset_parquet.pyi +++ b/pyarrow-stubs/_dataset_parquet.pyi @@ -14,7 +14,7 @@ from ._dataset import ( PartitioningFactory, ) from ._dataset_parquet_encryption import ParquetDecryptionConfig -from ._fs import FileSystem, SupportedFileSystem +from ._fs import SupportedFileSystem from ._parquet import FileDecryptionProperties, FileMetaData from .lib import CacheOptions, Schema, _Weakrefable