From 46afeaf163ee11f41c24b06b866647102643ab42 Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 7 Aug 2024 16:57:04 +0200 Subject: [PATCH] Allow setting `write.parquet.row-group-limit` And update the docs --- mkdocs/docs/configuration.md | 2 +- pyiceberg/io/pyarrow.py | 4 ++-- pyiceberg/table/__init__.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index ff3741656a..2b86a0ff97 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -32,10 +32,10 @@ Iceberg tables support table properties to configure table behavior. | --------------------------------- | --------------------------------- | ------- | ------------------------------------------------------------------------------------------- | | `write.parquet.compression-codec` | `{uncompressed,zstd,gzip,snappy}` | zstd | Sets the Parquet compression coddec. | | `write.parquet.compression-level` | Integer | null | Parquet compression level for the codec. If not set, it is up to PyIceberg | +| `write.parquet.row-group-limit` | Number of rows | 1048576 | The upper bound of the number of entries within a single row group | | `write.parquet.page-size-bytes` | Size in bytes | 1MB | Set a target threshold for the approximate encoded size of data pages within a column chunk | | `write.parquet.page-row-limit` | Number of rows | 20000 | Set a target threshold for the approximate encoded size of data pages within a column chunk | | `write.parquet.dict-size-bytes` | Size in bytes | 2MB | Set the dictionary page size limit per row group | -| `write.parquet.row-group-limit` | Number of rows | 122880 | The Parquet row group limit | ### Table behavior options diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 4175f5fecf..0bd7c3d133 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2136,8 +2136,8 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteT parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties) row_group_size = property_as_int( properties=table_metadata.properties, - property_name=TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, - default=TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT, + property_name=TableProperties.PARQUET_ROW_GROUP_LIMIT, + default=TableProperties.PARQUET_ROW_GROUP_LIMIT_DEFAULT, ) def write_parquet(task: WriteTask) -> DataFile: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 873f5abfdc..de32786d57 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -175,7 +175,7 @@ class TableProperties: PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 # 128 MB PARQUET_ROW_GROUP_LIMIT = "write.parquet.row-group-limit" - PARQUET_ROW_GROUP_LIMIT_DEFAULT = 128 * 1024 * 1024 # 128 MB + PARQUET_ROW_GROUP_LIMIT_DEFAULT = 1048576 PARQUET_PAGE_SIZE_BYTES = "write.parquet.page-size-bytes" PARQUET_PAGE_SIZE_BYTES_DEFAULT = 1024 * 1024 # 1 MB