Intelligent estimation of manifest entry size #355

Merged: 25 commits merged on Sep 30, 2024
Changes from 8 commits
Commits (25)
a25f45d
use parquet meta to estimate size
raghumdani Sep 24, 2024
7a2d359
enable intelligent size estimation
raghumdani Sep 25, 2024
8b96429
simplify
raghumdani Sep 25, 2024
edff05d
fix by array size estimator
raghumdani Sep 25, 2024
b5d0d05
append content type params if intelligent estimation is enabled
raghumdani Sep 25, 2024
ecc0c53
Few more changes
raghumdani Sep 25, 2024
95a7e67
Adding UTs
raghumdani Sep 25, 2024
0e527c3
Add invalid column UT
raghumdani Sep 25, 2024
028f171
Fix log messages
raghumdani Sep 25, 2024
bd36e06
Adding enums to perform each type of estimation
raghumdani Sep 26, 2024
f08e2c1
address comments
raghumdani Sep 26, 2024
f2775ab
Adding delta size estimation
raghumdani Sep 27, 2024
d947ebc
rename manifest module
raghumdani Sep 27, 2024
6535254
Add more tests
raghumdani Sep 27, 2024
9e02e23
Fix requires content type params
raghumdani Sep 27, 2024
c6a1b16
Upgrade dependencies and bump version
raghumdani Sep 27, 2024
4fbc4c9
Adding a case where files to sample is zero
raghumdani Sep 27, 2024
f054c61
Export operation type
raghumdani Sep 27, 2024
150359b
Support case when parquet to pyarrow inflation is none
raghumdani Sep 27, 2024
0bba5f6
Add caching in append_content_type_params to avoid redownloading parq…
raghumdani Sep 28, 2024
8ab22d1
Only cache when the number of entries is high to avoid constant calls…
raghumdani Sep 28, 2024
1a7a559
Add json context to logs
raghumdani Sep 28, 2024
173116d
Ensure appropriate log level
raghumdani Sep 28, 2024
047f2cf
Fix circular imports
raghumdani Sep 28, 2024
86c5d11
Adding
raghumdani Sep 28, 2024
79 changes: 77 additions & 2 deletions deltacat/compute/compactor/model/compact_partition_params.py
@@ -23,6 +23,8 @@
TOTAL_MEMORY_BUFFER_PERCENTAGE,
DEFAULT_DISABLE_COPY_BY_REFERENCE,
DEFAULT_NUM_ROUNDS,
PARQUET_TO_PYARROW_INFLATION,
MAX_PARQUET_METADATA_SIZE,
)
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -104,6 +106,23 @@ def of(params: Optional[Dict]) -> CompactPartitionParams:
result.metrics_config = params.get("metrics_config")

result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
result.parquet_to_pyarrow_inflation = params.get(
"parquet_to_pyarrow_inflation", PARQUET_TO_PYARROW_INFLATION
)
result.force_using_previous_inflation_for_memory_calculation = params.get(
"force_using_previous_inflation_for_memory_calculation", False
)
result.enable_intelligent_size_estimation = params.get(
"enable_intelligent_size_estimation", False
)

# disable input split during rebase as the rebase files are already uniform
result.enable_input_split = (
params.get("rebase_source_partition_locator") is None
)
result.max_parquet_meta_size_bytes = params.get(
"max_parquet_meta_size_bytes", MAX_PARQUET_METADATA_SIZE
)

if not importlib.util.find_spec("memray"):
result.enable_profiler = False
@@ -414,13 +433,69 @@ def num_rounds(self, num_rounds: int) -> None:
self["num_rounds"] = num_rounds

@property
def parquet_to_pyarrow_inflation(self) -> int:
def parquet_to_pyarrow_inflation(self) -> float:
"""
The inflation factor applied to the parquet uncompressed_size_bytes to estimate the pyarrow table size.
"""
return self["parquet_to_pyarrow_inflation"]

@parquet_to_pyarrow_inflation.setter
def parquet_to_pyarrow_inflation(self, value: int) -> None:
def parquet_to_pyarrow_inflation(self, value: float) -> None:
self["parquet_to_pyarrow_inflation"] = value

@property
def enable_input_split(self) -> bool:
"""
When this is True, input split will always be enabled for parquet files.
The input split feature splits parquet files into individual row groups
so that they can be processed in parallel on different nodes.
By default, input split is enabled for incremental compaction and disabled for rebase.
"""
return self["enable_input_split"]

@enable_input_split.setter
def enable_input_split(self, value: bool) -> None:
self["enable_input_split"] = value

@property
def force_using_previous_inflation_for_memory_calculation(self) -> bool:
"""
When this is True, memory estimation will always use the previous inflation
and average record size for all data formats, even if format-specific metadata
is available to make better predictions of memory requirements.

By default, previous inflation is used to estimate memory for non-parquet files,
while parquet metadata is used for parquet files. We only fall back to previous
inflation if parquet metadata isn't available.
"""
return self["force_using_previous_inflation_for_memory_calculation"]

@force_using_previous_inflation_for_memory_calculation.setter
def force_using_previous_inflation_for_memory_calculation(
self, value: bool
) -> None:
self["force_using_previous_inflation_for_memory_calculation"] = value

@property
def max_parquet_meta_size_bytes(self) -> int:
return self["max_parquet_meta_size_bytes"]

@max_parquet_meta_size_bytes.setter
def max_parquet_meta_size_bytes(self, value: int) -> None:
self["max_parquet_meta_size_bytes"] = value

@property
def enable_intelligent_size_estimation(self) -> bool:
"""
When this is True, intelligent memory estimation is enabled. It considers
encoding, min/max values, and other statistics to estimate memory requirements.
"""
return self["enable_intelligent_size_estimation"]

@enable_intelligent_size_estimation.setter
def enable_intelligent_size_estimation(self, value: bool) -> None:
self["enable_intelligent_size_estimation"] = value

@staticmethod
def json_handler_for_compact_partition_params(obj):
"""
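The new knobs are plain dict-backed properties, so they ride along in the same `params` dict handed to `CompactPartitionParams.of`. A minimal sketch of opting in (illustrative values only; the other required compaction parameters such as source/destination locators are omitted):

```python
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)

params = CompactPartitionParams.of(
    {
        # Override the default parquet -> pyarrow inflation factor (a float).
        "parquet_to_pyarrow_inflation": 2.5,
        # Keep format-aware estimation (the default); True forces the legacy
        # previous-inflation / average-record-size path for all formats.
        "force_using_previous_inflation_for_memory_calculation": False,
        # Opt in to encoding/statistics-aware size estimation.
        "enable_intelligent_size_estimation": True,
        # Maximum parquet metadata size to work with, in bytes (default 100 MB).
        "max_parquet_meta_size_bytes": 50_000_000,
    }
)

# No "rebase_source_partition_locator" key, so input split stays enabled.
assert params.enable_input_split is True
```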
26 changes: 26 additions & 0 deletions deltacat/compute/compactor/model/compaction_session_audit_info.py
@@ -436,6 +436,22 @@ def compactor_version(self) -> str:
"""
return self.get("compactorVersion")

@property
def observed_input_inflation(self) -> float:
"""
The average inflation observed for input files only.
This only accounts for files in the source.
"""
return self.get("observedInputInflation")

@property
def observed_input_average_record_size_bytes(self) -> float:
"""
The average record size observed for input files only.
This only accounts for files in the source.
"""
return self.get("observedInputAverageRecordSizeBytes")

# Setters follow

def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -756,6 +772,16 @@ def set_compactor_version(self, value: str) -> CompactionSessionAuditInfo:
self["compactorVersion"] = value
return self

def set_observed_input_inflation(self, value: float) -> CompactionSessionAuditInfo:
self["observedInputInflation"] = value
return self

def set_observed_input_average_record_size_bytes(
self, value: float
) -> CompactionSessionAuditInfo:
self["observedInputAverageRecordSizeBytes"] = value
return self

# High level methods to save stats
def save_step_stats(
self,
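These two audit fields are populated at the end of a compaction round (see `_write_new_round_completion_file` further down). A small sketch of the getter/setter round trip, assuming `CompactionSessionAuditInfo` can be instantiated empty like the other dict-backed models (the values are made up):

```python
from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)

audit = CompactionSessionAuditInfo()  # assumed no-arg construction for this sketch

# Setters return self, so they can be chained.
audit.set_observed_input_inflation(3.2).set_observed_input_average_record_size_bytes(148.0)

print(audit.observed_input_inflation)                  # 3.2
print(audit.observed_input_average_record_size_bytes)  # 148.0
```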
19 changes: 13 additions & 6 deletions deltacat/compute/compactor/model/delta_annotated.py
@@ -69,6 +69,7 @@ def rebatch(
estimation_function: Optional[
Callable[[ManifestEntry], float]
] = lambda entry: entry.meta.content_length,
enable_input_split: Optional[bool] = False,
) -> List[DeltaAnnotated]:
"""
Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -86,13 +87,19 @@
new_da_bytes = 0
da_group_entry_count = 0

for delta_annotated in annotated_deltas:
split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
if enable_input_split:
for delta_annotated in annotated_deltas:
split_annotated_deltas.extend(
DeltaAnnotated._split_single(delta_annotated)
)

logger.info(
f"Split the {len(annotated_deltas)} annotated deltas "
f"into {len(split_annotated_deltas)} groups."
)
logger.info(
f"Split the {len(annotated_deltas)} annotated deltas "
f"into {len(split_annotated_deltas)} groups."
)
else:
logger.info("Skipping input split as it is disabled...")
split_annotated_deltas = annotated_deltas

for src_da in split_annotated_deltas:
src_da_annotations = src_da.annotations
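The essence of the new `enable_input_split` flag in `rebatch`: row-group splitting only happens when the flag is set; otherwise the annotated deltas pass through untouched. A self-contained illustration of that control flow (simplified stand-ins rather than the real `DeltaAnnotated` API):

```python
from typing import Callable, List

def rebatch_split_phase(
    annotated_deltas: List[dict],
    split_single: Callable[[dict], List[dict]],
    enable_input_split: bool = False,
) -> List[dict]:
    """Simplified sketch of the branching added to DeltaAnnotated.rebatch."""
    if enable_input_split:
        split = []
        for delta in annotated_deltas:
            # Split each delta into per-row-group pieces so they can be
            # processed in parallel on different nodes.
            split.extend(split_single(delta))
        print(f"Split the {len(annotated_deltas)} annotated deltas into {len(split)} groups.")
        return split
    print("Skipping input split as it is disabled...")
    return annotated_deltas
```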
3 changes: 3 additions & 0 deletions deltacat/compute/compactor_v2/constants.py
@@ -41,6 +41,9 @@
# size in metadata to pyarrow table size.
PARQUET_TO_PYARROW_INFLATION = 4

# Maximum size of the parquet metadata
MAX_PARQUET_METADATA_SIZE = 100_000_000 # 100 MB

# By default, copy by reference is enabled
DEFAULT_DISABLE_COPY_BY_REFERENCE = False

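For context, the inflation constant is applied multiplicatively when falling back to metadata-only estimation: a rough sketch of how an entry's in-memory size could be derived from its uncompressed parquet size (an illustrative helper, not the library's API):

```python
PARQUET_TO_PYARROW_INFLATION = 4  # default from compactor_v2 constants

def estimate_pyarrow_table_size(uncompressed_parquet_bytes: int,
                                inflation: float = PARQUET_TO_PYARROW_INFLATION) -> float:
    # Estimated in-memory pyarrow size for a parquet manifest entry.
    return uncompressed_parquet_bytes * inflation

# A 250 MB (uncompressed) parquet entry is estimated at ~1 GB in memory.
print(estimate_pyarrow_table_size(250_000_000))  # 1000000000
```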
22 changes: 20 additions & 2 deletions deltacat/compute/compactor_v2/private/compaction_utils.py
@@ -152,9 +152,13 @@ def _build_uniform_deltas(
previous_inflation=params.previous_inflation,
min_delta_bytes=params.min_delta_bytes_in_batch,
min_file_counts=params.min_files_in_batch,
# disable input split during rebase as the rebase files are already uniform
enable_input_split=params.rebase_source_partition_locator is None,
enable_input_split=params.enable_input_split,
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
parquet_to_pyarrow_inflation=params.parquet_to_pyarrow_inflation,
force_use_previous_inflation=params.force_using_previous_inflation_for_memory_calculation,
enable_intelligent_size_estimation=params.enable_intelligent_size_estimation,
task_max_parallelism=params.task_max_parallelism,
max_parquet_meta_size_bytes=params.max_parquet_meta_size_bytes,
)
delta_discovery_end: float = time.monotonic()

@@ -400,6 +404,9 @@ def _merge(
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
ray_custom_resources=params.ray_custom_resources,
memory_logs_enabled=params.memory_logs_enabled,
parquet_to_pyarrow_inflation=params.parquet_to_pyarrow_inflation,
force_use_previous_inflation=params.force_using_previous_inflation_for_memory_calculation,
enable_intelligent_size_estimation=params.enable_intelligent_size_estimation,
)

def merge_input_provider(index, item) -> dict[str, MergeInput]:
@@ -463,6 +470,9 @@ def _hash_bucket(
primary_keys=params.primary_keys,
ray_custom_resources=params.ray_custom_resources,
memory_logs_enabled=params.memory_logs_enabled,
parquet_to_pyarrow_inflation=params.parquet_to_pyarrow_inflation,
force_use_previous_inflation=params.force_using_previous_inflation_for_memory_calculation,
enable_intelligent_size_estimation=params.enable_intelligent_size_estimation,
)

def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
@@ -537,6 +547,9 @@ def _run_local_merge(
ray_custom_resources=params.ray_custom_resources,
primary_keys=params.primary_keys,
memory_logs_enabled=params.memory_logs_enabled,
parquet_to_pyarrow_inflation=params.parquet_to_pyarrow_inflation,
force_use_previous_inflation=params.force_using_previous_inflation_for_memory_calculation,
enable_intelligent_size_estimation=params.enable_intelligent_size_estimation,
)
local_merge_result = ray.get(
mg.merge.options(**local_merge_options).remote(local_merge_input)
@@ -666,6 +679,11 @@ def _write_new_round_completion_file(
f" and average record size={input_average_record_size_bytes}"
)

mutable_compaction_audit.set_observed_input_inflation(input_inflation)
mutable_compaction_audit.set_observed_input_average_record_size_bytes(
input_average_record_size_bytes
)

_update_and_upload_compaction_audit(
params,
mutable_compaction_audit,
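The hunk above persists to the audit the same figures that were previously only logged. Their derivation happens earlier in `_write_new_round_completion_file` and isn't shown in this diff; a plausible sketch of the arithmetic, assuming totals for in-memory pyarrow bytes, on-disk input bytes, and record count are at hand (`observed_input_stats` is a hypothetical helper, not part of the codebase):

```python
from typing import Tuple

def observed_input_stats(
    pyarrow_bytes: int, input_file_bytes: int, record_count: int
) -> Tuple[float, float]:
    """Hypothetical helper: derive the two new audit figures from raw totals."""
    # Inflation: how much larger the input is in memory than on disk.
    inflation = pyarrow_bytes / input_file_bytes if input_file_bytes else 0.0
    # Average in-memory size of a single input record.
    avg_record_size = pyarrow_bytes / record_count if record_count else 0.0
    return inflation, avg_record_size

# e.g. 12 GiB in memory read from 3 GiB of input files holding 80M records
inflation, avg_record_size = observed_input_stats(12 * 1024**3, 3 * 1024**3, 80_000_000)
# mutable_compaction_audit.set_observed_input_inflation(inflation)
# mutable_compaction_audit.set_observed_input_average_record_size_bytes(avg_record_size)
```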