Skip to content

Commit

Permalink
[Python][Dataset] Change default of pre_buffer to True for reading Parquet files
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Sep 25, 2023
1 parent 7b14b2b commit 16e56fa
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 7 deletions.
4 changes: 2 additions & 2 deletions python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
Disabled by default.
buffer_size : int, default 8192
Size of buffered stream, if enabled. Default is 8KB.
pre_buffer : bool, default False
pre_buffer : bool, default True
If enabled, pre-buffer the raw Parquet data instead of issuing one
read per column chunk. This can improve performance on high-latency
filesystems.
Expand All @@ -688,7 +688,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):

def __init__(self, *, bint use_buffered_stream=False,
buffer_size=8192,
bint pre_buffer=False,
bint pre_buffer=True,
thrift_string_size_limit=None,
thrift_container_size_limit=None):
self.init(shared_ptr[CFragmentScanOptions](
Expand Down
10 changes: 5 additions & 5 deletions python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,28 +784,28 @@ def test_parquet_scan_options():
opts2 = ds.ParquetFragmentScanOptions(buffer_size=4096)
opts3 = ds.ParquetFragmentScanOptions(
buffer_size=2**13, use_buffered_stream=True)
opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=True)
opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=False)
opts5 = ds.ParquetFragmentScanOptions(
thrift_string_size_limit=123456,
thrift_container_size_limit=987654,)

assert opts1.use_buffered_stream is False
assert opts1.buffer_size == 2**13
assert opts1.pre_buffer is False
assert opts1.pre_buffer is True
assert opts1.thrift_string_size_limit == 100_000_000 # default in C++
assert opts1.thrift_container_size_limit == 1_000_000 # default in C++

assert opts2.use_buffered_stream is False
assert opts2.buffer_size == 2**12
assert opts2.pre_buffer is False
assert opts2.pre_buffer is True

assert opts3.use_buffered_stream is True
assert opts3.buffer_size == 2**13
assert opts3.pre_buffer is False
assert opts3.pre_buffer is True

assert opts4.use_buffered_stream is False
assert opts4.buffer_size == 2**13
assert opts4.pre_buffer is True
assert opts4.pre_buffer is False

assert opts5.thrift_string_size_limit == 123456
assert opts5.thrift_container_size_limit == 987654
Expand Down

0 comments on commit 16e56fa

Please sign in to comment.