diff --git a/python/pyarrow/_dataset.pxd b/python/pyarrow/_dataset.pxd index b9e3aac68ee1f..159324758315b 100644 --- a/python/pyarrow/_dataset.pxd +++ b/python/pyarrow/_dataset.pxd @@ -25,7 +25,7 @@ from pyarrow.lib cimport * from pyarrow._fs cimport FileSystem, FileInfo -cdef CFileSource _make_file_source(object file, FileSystem filesystem=*, int64_t file_size=*) +cdef CFileSource _make_file_source(object file, FileSystem filesystem=*, object file_size=*) cdef class DatasetFactory(_Weakrefable): diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index b99197cb37332..f02398be66360 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -96,7 +96,7 @@ def _get_parquet_symbol(name): return _dataset_pq and getattr(_dataset_pq, name) -cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, int64_t file_size=-1): +cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, object file_size=None): cdef: CFileSource c_source @@ -105,6 +105,7 @@ cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, int6 c_string c_path shared_ptr[CRandomAccessFile] c_file shared_ptr[CBuffer] c_buffer + int64_t c_size if isinstance(file, Buffer): c_buffer = pyarrow_unwrap_buffer(file) @@ -116,10 +117,9 @@ cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, int6 c_filesystem = filesystem.unwrap() c_path = tobytes(_stringify_path(file)) - if file_size >= 0: + if file_size is not None: c_size = file_size - info = FileInfo(c_path, size=c_size) - c_info = info.unwrap() + c_info = FileInfo(c_path, size=c_size).unwrap() c_source = CFileSource(move(c_info), move(c_filesystem)) else: c_source = CFileSource(move(c_path), move(c_filesystem)) @@ -1236,7 +1236,7 @@ cdef class FileFormat(_Weakrefable): The schema inferred from the file """ cdef: - CFileSource c_source = _make_file_source(file, filesystem=filesystem) + CFileSource c_source = _make_file_source(file, filesystem, file_size=None) CResult[shared_ptr[CSchema]] c_result with nogil: c_result = self.format.Inspect(c_source) @@ -1268,14 +1268,9 @@ cdef class FileFormat(_Weakrefable): fragment : Fragment The file fragment """ - cdef: - # default value, will not be passed to constructor - int64_t c_size = -1 if partition_expression is None: partition_expression = _true - if file_size is not None: - c_size = file_size - c_source = _make_file_source(file, filesystem=filesystem, file_size=c_size) + c_source = _make_file_source(file, filesystem, file_size) c_fragment = GetResultValue( self.format.MakeFragment(move(c_source), partition_expression.unwrap(), diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index d34dc8f284010..0fdc333eca589 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -262,17 +262,13 @@ cdef class ParquetFileFormat(FileFormat): """ cdef: vector[int] c_row_groups - # default value, will not be passed to constructor - int64_t c_size = -1 if partition_expression is None: partition_expression = _true - if file_size is not None: - c_size = file_size if row_groups is None: return super().make_fragment(file, filesystem, partition_expression, file_size=file_size) - c_source = _make_file_source(file, filesystem, file_size=c_size) + c_source = _make_file_source(file, filesystem, file_size) c_row_groups = [ row_group for row_group in set(row_groups)] c_fragment = GetResultValue(