diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h
index 0cf283a06d61a..2a17cbf8f884d 100644
--- a/cpp/src/arrow/dataset/file_base.h
+++ b/cpp/src/arrow/dataset/file_base.h
@@ -54,11 +54,6 @@ class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource>
       : file_info_(std::move(path)),
         filesystem_(std::move(filesystem)),
         compression_(compression) {}
-  FileSource(std::string path, int64_t size, std::shared_ptr<fs::FileSystem> filesystem,
-             Compression::type compression = Compression::UNCOMPRESSED)
-      : file_info_(std::move(path), std::move(size)),
-        filesystem_(std::move(filesystem)),
-        compression_(compression) {}
   FileSource(fs::FileInfo info, std::shared_ptr<fs::FileSystem> filesystem,
              Compression::type compression = Compression::UNCOMPRESSED)
       : file_info_(std::move(info)),
diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h
index 3f233c74d5a1f..baf05cadbb2d3 100644
--- a/cpp/src/arrow/filesystem/filesystem.h
+++ b/cpp/src/arrow/filesystem/filesystem.h
@@ -60,9 +60,6 @@ struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> {
 
   explicit FileInfo(std::string path, FileType type = FileType::Unknown)
       : path_(std::move(path)), type_(type) {}
-  explicit FileInfo(std::string path, int64_t size, FileType type = FileType::Unknown)
-      : path_(std::move(path)), type_(type), size_(size) {}
-
   /// The file type
   FileType type() const { return type_; }
   void set_type(FileType type) { type_ = type; }
diff --git a/python/pyarrow/_dataset.pxd b/python/pyarrow/_dataset.pxd
index 71b2a58c35abc..b9e3aac68ee1f 100644
--- a/python/pyarrow/_dataset.pxd
+++ b/python/pyarrow/_dataset.pxd
@@ -22,12 +22,11 @@ from pyarrow.includes.common cimport *
 from pyarrow.includes.libarrow_dataset cimport *
 from pyarrow.lib cimport *
 
-from pyarrow._fs cimport FileSystem
+from pyarrow._fs cimport FileSystem, FileInfo
 
 
 cdef CFileSource _make_file_source(object file, FileSystem filesystem=*, int64_t file_size=*)
-
 
 cdef class DatasetFactory(_Weakrefable):
 
     cdef:
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 445d86d85387b..b99197cb37332 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -32,7 +32,7 @@ from pyarrow.includes.libarrow_dataset cimport *
 from pyarrow._acero cimport ExecNodeOptions
 from pyarrow._compute cimport Expression, _bind
 from pyarrow._compute import _forbid_instantiation
-from pyarrow._fs cimport FileSystem, FileSelector
+from pyarrow._fs cimport FileSystem, FileSelector, FileInfo
 from pyarrow._csv cimport (
     ConvertOptions, ParseOptions, ReadOptions, WriteOptions)
 from pyarrow.util import _is_iterable, _is_path_like, _stringify_path
@@ -101,6 +101,7 @@ cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, int6
     cdef:
         CFileSource c_source
        shared_ptr[CFileSystem] c_filesystem
+        CFileInfo c_info
         c_string c_path
         shared_ptr[CRandomAccessFile] c_file
         shared_ptr[CBuffer] c_buffer
@@ -117,7 +118,9 @@ cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, int6
         if file_size >= 0:
             c_size = file_size
-            c_source = CFileSource(move(c_path), move(c_size), move(c_filesystem))
+            info = FileInfo(c_path, size=c_size)
+            c_info = info.unwrap()
+            c_source = CFileSource(move(c_info), move(c_filesystem))
         else:
             c_source = CFileSource(move(c_path), move(c_filesystem))
 
     elif hasattr(file, 'read'):
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 07474dadd7652..55e3d7442714f 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -991,7 +991,6 @@ def test_make_fragment_with_size(s3_example_simple):
 
     fragments = [file_format.make_fragment(path, fs)
                  for path in paths]
-
     dataset = ds.FileSystemDataset(
         fragments, format=file_format, schema=table.schema, filesystem=fs
     )
@@ -999,7 +998,18 @@
     tbl = dataset.to_table()
     assert tbl.equals(table)
 
-    sizes_toosmall = [1]
+    # true sizes -> works
+    sizes_true = [dataset.filesystem.get_file_info(x).size for x in dataset.files]
+    fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
+                           for path, size in zip(paths, sizes_true)]
+    dataset_with_size = ds.FileSystemDataset(
+        fragments_with_size, format=file_format, schema=table.schema, filesystem=fs
+    )
+    tbl = dataset_with_size.to_table()
+    assert tbl.equals(table)
+
+    # too small sizes -> error
+    sizes_toosmall = [1 for path in paths]
     fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
                            for path, size in zip(paths, sizes_toosmall)]
 
@@ -1010,7 +1020,8 @@
     with pytest.raises(pyarrow.lib.ArrowInvalid,
                        match='Parquet file size is 1 bytes'):
         table = dataset_with_size.to_table()
-    sizes_toolarge = [1000000]
+    # too large sizes -> error
+    sizes_toolarge = [1000000 for path in paths]
     fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
                            for path, size in zip(paths, sizes_toolarge)]