Support partial deletes #569

Merged · 49 commits · Jul 9, 2024
Changes from 3 commits

Commits (49)
8d45920  Add option to delete datafiles (Fokko, Apr 2, 2024)
f6084a6  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 2, 2024)
87cc065  Pull in main (Fokko, Apr 2, 2024)
bc9c83e  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 3, 2024)
284d05a  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 5, 2024)
234d55b  WIP (Fokko, Apr 5, 2024)
aadc89c  Change DataScan to accept Metadata and io (Fokko, Apr 5, 2024)
7e59342  fix name-mapping issue (HonahX, Apr 7, 2024)
fbf6492  Merge pull request #1 from HonahX/honahx-update-datascan (Fokko, Apr 8, 2024)
c3fa7e7  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 8, 2024)
a925d69  Merge branch 'fd-update-datascan' of github.com:Fokko/iceberg-python … (Fokko, Apr 8, 2024)
5cec00a  WIP (Fokko, Apr 8, 2024)
a5e988a  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 8, 2024)
1723819  WIP (Fokko, Apr 9, 2024)
5025b4a  Moar tests (Fokko, Apr 10, 2024)
e474fda  Oops (Fokko, Apr 11, 2024)
172f9c0  Cleanup (Fokko, Apr 16, 2024)
a97c45a  WIP (Fokko, Apr 16, 2024)
74497fb  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 17, 2024)
47c9de1  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 17, 2024)
9c6724e  WIP (Fokko, Apr 17, 2024)
edff166  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 18, 2024)
c443af2  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 18, 2024)
7dae071  Fix summary generation (Fokko, Apr 18, 2024)
5e871fb  Last few bits (Fokko, Apr 18, 2024)
9910d29  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 18, 2024)
cd19f80  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Apr 25, 2024)
edfadd9  Fix the requirement (Fokko, Apr 25, 2024)
d65a8a4  Make ruff happy (Fokko, Apr 25, 2024)
8849d97  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, May 2, 2024)
3c98eef  Comments, thanks Kevin! (Fokko, May 2, 2024)
179fa27  Comments (Fokko, May 9, 2024)
2ea157e  Append rather than truncate (Fokko, May 10, 2024)
18392d1  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, May 10, 2024)
7d036b1  Fix merge conflicts (Fokko, May 10, 2024)
5adf3f0  Make the tests pass (Fokko, May 13, 2024)
b3fcdcf  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, May 27, 2024)
4ceacb8  Add another test (Fokko, May 30, 2024)
ddf6119  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, May 30, 2024)
5b10f25  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, May 30, 2024)
4cd67ac  Conflicts (Fokko, May 30, 2024)
5cdb363  Add docs (#33) (sungwy, Jun 15, 2024)
2252e71  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Jun 21, 2024)
05fcf2d  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Jul 5, 2024)
1ccb31d  Add a partitioned overwrite test (Fokko, Jul 5, 2024)
86432fe  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Jul 8, 2024)
259f8c5  Merge branch 'main' of github.com:apache/iceberg-python into fd-add-a… (Fokko, Jul 9, 2024)
96d6392  Fix comment (Fokko, Jul 9, 2024)
301bc82  Skip empty manifests (Fokko, Jul 9, 2024)
121 changes: 120 additions & 1 deletion pyiceberg/table/__init__.py
@@ -49,14 +49,19 @@
import pyiceberg.expressions.parser as parser
from pyiceberg.exceptions import CommitFailedException, ResolveError, ValidationError
from pyiceberg.expressions import (
    AlwaysFalse,
    AlwaysTrue,
    And,
    BooleanExpression,
    EqualTo,
    Or,
    Reference,
)
from pyiceberg.expressions.visitors import (
    ROWS_CANNOT_MATCH,
    ROWS_MUST_MATCH,
    _InclusiveMetricsEvaluator,
    _StrictMetricsEvaluator,
    expression_evaluator,
    inclusive_projection,
    manifest_evaluator,
@@ -2726,6 +2731,112 @@ def _commit(self) -> UpdatesAndRequirements:
)


class DeleteFiles(_MergingSnapshotProducer):
    _predicate: BooleanExpression

    def __init__(
        self,
        operation: Operation,
        transaction: Transaction,
        io: FileIO,
        commit_uuid: Optional[uuid.UUID] = None,
        snapshot_properties: Dict[str, str] = EMPTY_DICT,
    ):
        super().__init__(operation, transaction, io, commit_uuid, snapshot_properties)
        self._predicate = AlwaysFalse()

    def _build_partition_projection(self, spec_id: int) -> BooleanExpression:
        schema = self._transaction.table_metadata.schema()
        spec = self._transaction.table_metadata.specs()[spec_id]
        project = inclusive_projection(schema, spec)
        return project(self._predicate)

    @cached_property
    def partition_filters(self) -> KeyDefaultDict[int, BooleanExpression]:
        return KeyDefaultDict(self._build_partition_projection)

    def _build_manifest_evaluator(self, spec_id: int) -> Callable[[ManifestFile], bool]:
        schema = self._transaction.table_metadata.schema()
        spec = self._transaction.table_metadata.specs()[spec_id]
        return manifest_evaluator(spec, schema, self.partition_filters[spec_id], case_sensitive=True)

    def _build_partition_evaluator(self, spec_id: int) -> Callable[[DataFile], bool]:
        schema = self._transaction.table_metadata.schema()
        spec = self._transaction.table_metadata.specs()[spec_id]
        partition_type = spec.partition_type(schema)
        partition_schema = Schema(*partition_type.fields)
        partition_expr = self.partition_filters[spec_id]

        return lambda data_file: expression_evaluator(partition_schema, partition_expr, case_sensitive=True)(data_file.partition)

    def delete(self, predicate: BooleanExpression) -> None:
        self._predicate = Or(self._predicate, predicate)

    @cached_property
    def _compute_deletes(self) -> Tuple[List[ManifestFile], List[ManifestEntry]]:
        schema = self._transaction.table_metadata.schema()

        def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> ManifestEntry:
            return ManifestEntry(
                status=status,
                snapshot_id=entry.snapshot_id,
                data_sequence_number=entry.data_sequence_number,
                file_sequence_number=entry.file_sequence_number,
                data_file=entry.data_file,
            )

        manifest_evaluators: Dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator)
        strict_metrics_evaluator = _StrictMetricsEvaluator(schema, self._predicate, case_sensitive=True).eval
        inclusive_metrics_evaluator = _InclusiveMetricsEvaluator(schema, self._predicate, case_sensitive=True).eval

        existing_manifests = []
        total_deleted_entries = []
        if snapshot := self._transaction.table_metadata.current_snapshot():
            for num, manifest_file in enumerate(snapshot.manifests(io=self._io)):
                if not manifest_evaluators[manifest_file.partition_spec_id](manifest_file):
                    # If the manifest isn't relevant, we can just keep it in the manifest-list
                    existing_manifests.append(manifest_file)
                else:
                    # It is relevant, let's check out the content
                    deleted_entries = []
                    existing_entries = []
                    for entry in manifest_file.fetch_manifest_entry(io=self._io):
                        if strict_metrics_evaluator(entry.data_file) == ROWS_MUST_MATCH:
                            deleted_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.DELETED))
                        elif inclusive_metrics_evaluator(entry.data_file) == ROWS_CANNOT_MATCH:
                            existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING))
                        else:
                            raise ValueError("Deletes do not support rewrites of data files")

                    if len(deleted_entries) > 0:
                        total_deleted_entries += deleted_entries

                        # Rewrite the manifest
                        if len(existing_entries) > 0:
                            output_file_location = _new_manifest_path(
                                location=self._transaction.table_metadata.location, num=num, commit_uuid=self.commit_uuid
                            )
                            with write_manifest(
                                format_version=self._transaction.table_metadata.format_version,
                                spec=self._transaction.table_metadata.specs()[manifest_file.partition_spec_id],
                                schema=self._transaction.table_metadata.schema(),
                                output_file=self._io.new_output(output_file_location),
                                snapshot_id=self._snapshot_id,
                            ) as writer:
                                for existing_entry in existing_entries:
                                    writer.add_entry(existing_entry)
                            # Keep the rewritten manifest in the new manifest list
                            existing_manifests.append(writer.to_manifest_file())
                    else:
                        existing_manifests.append(manifest_file)

        return existing_manifests, total_deleted_entries

    def _existing_manifests(self) -> List[ManifestFile]:
        return self._compute_deletes[0]

    def _deleted_entries(self) -> List[ManifestEntry]:
        return self._compute_deletes[1]


class FastAppendFiles(_MergingSnapshotProducer):
    def _existing_manifests(self) -> List[ManifestFile]:
        """To determine if there are any existing manifest files.
@@ -2803,7 +2914,7 @@ class UpdateSnapshot:
    _io: FileIO
    _snapshot_properties: Dict[str, str]

    def __init__(self, transaction: Transaction, io: FileIO, snapshot_properties: Dict[str, str]) -> None:
    def __init__(self, transaction: Transaction, io: FileIO, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
        self._transaction = transaction
        self._io = io
        self._snapshot_properties = snapshot_properties
@@ -2823,6 +2934,14 @@ def overwrite(self) -> OverwriteFiles:
            snapshot_properties=self._snapshot_properties,
        )

    def delete(self) -> DeleteFiles:
        return DeleteFiles(
            operation=Operation.DELETE,
            transaction=self._transaction,
            io=self._io,
            snapshot_properties=self._snapshot_properties,
        )


class UpdateSpec(UpdateTableMetadata["UpdateSpec"]):
    _transaction: Transaction
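The pruning above hinges on inclusive_projection, which rewrites the row-level delete predicate into a predicate over partition values, so whole manifests (and, via _build_partition_evaluator, whole data files) can be classified without reading any data. A hedged sketch of the projection in isolation, using a schema and identity partition shaped like the test table further down:

from pyiceberg.expressions import EqualTo
from pyiceberg.expressions.visitors import inclusive_projection
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.transforms import IdentityTransform
from pyiceberg.types import IntegerType, NestedField

schema = Schema(
    NestedField(field_id=1, name="number_partitioned", field_type=IntegerType(), required=False),
    NestedField(field_id=2, name="number", field_type=IntegerType(), required=False),
)
spec = PartitionSpec(
    PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="number_partitioned")
)

project = inclusive_projection(schema, spec)
partition_filter = project(EqualTo("number_partitioned", 10))
# With an identity transform the projected predicate mirrors the row predicate,
# so the manifest evaluator can keep or drop entire manifests from partition summaries.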
2 changes: 1 addition & 1 deletion pyiceberg/table/snapshots.py
@@ -345,7 +345,7 @@ def get_prop(prop: str) -> int:
def update_snapshot_summaries(
    summary: Summary, previous_summary: Optional[Mapping[str, str]] = None, truncate_full_table: bool = False
) -> Summary:
    if summary.operation not in {Operation.APPEND, Operation.OVERWRITE}:
    if summary.operation not in {Operation.APPEND, Operation.OVERWRITE, Operation.DELETE}:
        raise ValueError(f"Operation not implemented: {summary.operation}")

    if truncate_full_table and summary.operation == Operation.OVERWRITE and previous_summary is not None:
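With DELETE added to the supported operations, summary aggregation now also runs for delete snapshots. A hedged sketch of a call this change makes legal, assuming Summary accepts the extra summary fields as keyword properties (the keys follow the standard Iceberg snapshot-summary fields):

from pyiceberg.table.snapshots import Operation, Summary, update_snapshot_summaries

summary = update_snapshot_summaries(
    summary=Summary(operation=Operation.DELETE, **{"deleted-data-files": "1", "deleted-records": "2"}),
    previous_summary={"total-data-files": "2", "total-records": "4"},
)
# Before this change, a DELETE operation raised ValueError("Operation not implemented: ...")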
76 changes: 76 additions & 0 deletions tests/integration/test_deletes.py
@@ -0,0 +1,76 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint:disable=redefined-outer-name
import pytest
from pyspark.sql import DataFrame, SparkSession

from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.expressions import EqualTo


@pytest.fixture
def test_deletes_table(spark: SparkSession) -> DataFrame:
    identifier = 'default.table_partitioned_delete'

    spark.sql(f"DROP TABLE IF EXISTS {identifier}")

    spark.sql(
        f"""
        CREATE TABLE {identifier} (
            number_partitioned int,
            number int
        )
        USING iceberg
        PARTITIONED BY (number_partitioned)
        """
    )
    spark.sql(
        f"""
        INSERT INTO {identifier} VALUES (10, 20), (10, 30)
        """
    )
    spark.sql(
        f"""
        INSERT INTO {identifier} VALUES (11, 20), (11, 30)
        """
    )

    return spark.table(identifier)


def test_partition_deletes(test_deletes_table: DataFrame, session_catalog: RestCatalog) -> None:
    identifier = 'default.table_partitioned_delete'

    tbl = session_catalog.load_table(identifier)

    with tbl.transaction() as txn:
        with txn.update_snapshot().delete() as delete:
            delete.delete(EqualTo("number_partitioned", 10))

    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [11, 11], 'number': [20, 30]}
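
As a quick cross-check of the partition delete above, a filtered scan can assert that no rows remain in the deleted partition (a sketch using the public scan API, with tbl and EqualTo as in the test):

assert tbl.scan(row_filter=EqualTo("number_partitioned", 10)).to_arrow().num_rows == 0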


def test_deletes(test_deletes_table: DataFrame, session_catalog: RestCatalog) -> None:
    identifier = 'default.table_partitioned_delete'

    tbl = session_catalog.load_table(identifier)

    # A predicate on a non-partition column matches only part of each data file,
    # which would require rewriting data files; DeleteFiles rejects that for now
    with pytest.raises(ValueError, match="Deletes do not support rewrites of data files"):
        with tbl.transaction() as txn:
            with txn.update_snapshot().delete() as delete:
                delete.delete(EqualTo("number", 30))