Python: Add PartitionSpec #4717

Merged Jun 12, 2022 (12 commits)

Changes from 10 commits
1 change: 1 addition & 0 deletions python/spellcheck-dictionary.txt
@@ -37,6 +37,7 @@ NaN
nan
NestedField
nullability
PartitionField
pragma
PrimitiveType
pyarrow
102 changes: 76 additions & 26 deletions python/src/iceberg/table/partitioning.py
@@ -14,9 +14,16 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from dataclasses import dataclass, field
from typing import Dict, List, Tuple

from iceberg.schema import Schema
from iceberg.transforms import Transform

_PARTITION_DATA_ID_START: int = 1000


@dataclass(frozen=True)
class PartitionField:
Contributor:

Instead of implementing __eq__ and __hash__ ourselves, we could leverage the dataclasses module. If we set eq=True and frozen=True (which makes it immutable, which is also nice), then we get __hash__ automatically:

If eq and frozen are both true, by default dataclass() will generate a __hash__() method for you. If eq is true and frozen is false, __hash__() will be set to None, marking it unhashable (which it is, since it is mutable). If eq is false, __hash__() will be left untouched meaning the __hash__() method of the superclass will be used (if the superclass is object, this means it will fall back to id-based hashing).

More information here: https://docs.python.org/3/library/dataclasses.html
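A quick standalone illustration of those rules (my sketch, not part of this PR):

from dataclasses import dataclass

@dataclass(frozen=True)  # eq=True by default, so __hash__ is generated
class FrozenField:
    source_id: int
    name: str

@dataclass  # eq=True, frozen=False: __hash__ is set to None
class MutableField:
    source_id: int
    name: str

assert hash(FrozenField(3, "id")) == hash(FrozenField(3, "id"))
try:
    hash(MutableField(3, "id"))
except TypeError as err:
    print(err)  # unhashable type: 'MutableField'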

@dramaticlly (Contributor, Author), May 13, 2022:

Yeah, I agree: PartitionField is an immutable class after construction, so a dataclass with both eq and frozen sounds fair to me.

For reference, this is what the immutable PartitionField would look like, with all test cases passing (a small ordering change in repr, but I think the default one is very close to what we have today in the Java impl):

@dataclass(frozen=True)
class PartitionField:
    """
    PartitionField is a single element with name and unique id,
    It represents how one partition value is derived from the source column via transformation

    Attributes:
        source_id(int): The source column id of table's schema
        field_id(int): The partition field id across all the table metadata's partition specs
        transform(Transform): The transform used to produce partition values from source column
        name(str): The name of this partition field
    """
    source_id: int
    field_id: int
    transform: Transform
    name: str

    def __str__(self):
        return f"{self.field_id}: {self.name}: {self.transform}({self.source_id})"

On the other side, I think the biggest benefit of the dataclass is the __post_init__ method, which allows for Java-like builder-pattern processing when we build the PartitionSpec. There is a collection of validations that need to happen, and I am discussing them with @samredai in #4631 (comment).

From what I can tell, we will need a PartitionSpecBuilder class as a convenient way to construct the PartitionSpec, but we also want to avoid duplicating the builder logic in an overly complex init method for PartitionSpec.
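A minimal sketch of that idea, with a hypothetical uniqueness check standing in for the real PartitionSpec validations:

from dataclasses import dataclass
from typing import Tuple

@dataclass(frozen=True)
class SpecSketch:
    field_names: Tuple[str, ...]

    def __post_init__(self):
        # Builder-style validation runs right after the generated __init__,
        # so no invalid instance can ever be constructed.
        if len(set(self.field_names)) != len(self.field_names):
            raise ValueError("partition field names must be unique")

SpecSketch(("id", "ts"))    # constructs fine
# SpecSketch(("id", "id"))  # would raise ValueError at construction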

@samredai (Collaborator), May 13, 2022:

@dramaticlly that's an interesting idea I hadn't thought of. A big argument for using the builder pattern was that we wanted PartitionSpec to be immutable, which would require us to include a ton of validation logic (everything that would be in a builder) in the __init__ method. If I understand your suggestion, using __post_init__ would allow us to have a typical init method but include the builder-type validation logic in __post_init__, which would fail the initialization of any invalid PartitionSpec. cc: @rdblue what do you think?

Contributor:

Is the plan to use @dataclass? I like that idea, but I won't hold off on reviewing if we want to get it in like this first.

Contributor:

We can also go fully Pythonic and bump the dataclass up to a Pydantic model. With Pydantic you can annotate the fields with validators: https://pydantic-docs.helpmanual.io/usage/validators/

We could use the generated OpenAPI classes as the base classes and extend from those:
https://github.com/apache/iceberg/pull/4858/files#diff-4f32e455c8da9fc5dc641048dc398741b72e928f359bfb9e5ef3640e7d32873e

This also allows us to add validation. For example, BaseUserModel is the one generated from OpenAPI, and UserModel is the one extended with all the (convenience) methods attached to it:

from pydantic import BaseModel, ValidationError, validator

class BaseUserModel(BaseModel):
    name: str
    username: str
    password1: str
    password2: str


class UserModel(BaseUserModel):
    @validator('name')
    def name_must_contain_space(cls, v):
        if ' ' not in v:
            raise ValueError('must contain a space')
        return v.title()
    @validator('password2')
    def passwords_match(cls, v, values, **kwargs):
        if 'password1' in values and v != values['password1']:
            raise ValueError('passwords do not match')
        return v
    @validator('username')
    def username_alphanumeric(cls, v):
        assert v.isalnum(), 'must be alphanumeric'
        return v

user = UserModel(
    name='samuel colvin',
    username='scolvin',
    password1='zxcvbn',
    password2='zxcvbn',
)
print(user)
#> name='Samuel Colvin' username='scolvin' password1='zxcvbn' password2='zxcvbn'

try:
    UserModel(
        name='samuel',
        username='scolvin',
        password1='zxcvbn',
        password2='zxcvbn2',
    )
except ValidationError as e:
    print(e)
    """
    2 validation errors for UserModel
    name
      must contain a space (type=value_error)
    password2
      passwords do not match (type=value_error)
    """

Collaborator:

I really like this idea of wrapping the classes generated from the OpenAPI spec. The class naming here might be tricky. Module namespacing allows us to re-use the same name if we want, something like:

from iceberg.openapi import rest_catalog

class PartitionField(rest_catalog.PartitionField):
    ...

We shouldn't expect users to import from the openapi module directly so we shouldn't need to worry about naming conflicts, right? Maybe we should name it _openapi just to be super clear about that.

Contributor:

I'll hold off on commenting too much until I have a chance to look into the pydantic project as well as look at the other PR.

My first ask would be: how many dependencies are we bringing in if we add pydantic? I know that some folks were concerned about adding too many external Python dependencies, so as not to conflict with their own dependencies, but if the benefit is very large I'm not personally opposed to it (I believe it was somebody / some group from Netflix that originally requested we keep the number of required dependencies down).

But validation, either via a library or via a common pattern we settle on, is something that would be very beneficial.

Collaborator:

Great point. I just checked the dependencies for pydantic, and the good news is that all it requires is typing-extensions, which is probably just for some Python 3.7 backports. That will probably even get dropped at some point when they no longer support 3.7.
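For anyone who wants to reproduce that check, one option (assuming a pydantic 1.x install) is importlib.metadata, available since Python 3.8:

from importlib.metadata import requires

# Declared dependencies of the installed pydantic distribution; for 1.x this is
# typing-extensions plus a few extras (e.g. email-validator, python-dotenv)
# that are only pulled in when explicitly requested.
print(requires("pydantic"))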

Contributor:

I would not consider this blocking for now.

"""
PartitionField is a single element with name and unique id,
@@ -29,38 +36,81 @@ class PartitionField:
name(str): The name of this partition field
"""

def __init__(self, source_id: int, field_id: int, transform: Transform, name: str):
self._source_id = source_id
self._field_id = field_id
self._transform = transform
self._name = name
source_id: int
field_id: int
transform: Transform
name: str

def __str__(self):
return f"{self.field_id}: {self.name}: {self.transform}({self.source_id})"


@property
def source_id(self) -> int:
return self._source_id
@dataclass(eq=False, frozen=True)
class PartitionSpec:
"""
PartitionSpec captures the transformation from table data to partition values

@property
def field_id(self) -> int:
return self._field_id
Attributes:
schema(Schema): the schema of data table
spec_id(int): any change to PartitionSpec will produce a new specId
fields(List[PartitionField]): list of partition fields to produce partition values
last_assigned_field_id(int): auto-increment partition field id starting from PARTITION_DATA_ID_START
Contributor:

Would be great if we could also add an example: https://github.com/apache/iceberg/blob/master/python/src/iceberg/types.py#L83-L87. This would also test the str method, as the examples are executed as tests as well 👍🏻

@dramaticlly (Contributor, Author), May 31, 2022:

Thanks Fokko. I think this is where I want to leave out the example, as I intend to construct the PartitionSpec via a dedicated builder (not included in this PR); that is the desired way to construct the PartitionSpec, with convenient transform helper methods and equipped with validation. I can include the example there; what do you think?
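For reference, a doctest-style example built on this PR's constructor could look like the sketch below (table_schema_simple stands in for the pytest fixture used in the tests; the eventual builder would replace this):

>>> from iceberg.table.partitioning import PartitionField, PartitionSpec
>>> from iceberg.transforms import bucket
>>> from iceberg.types import IntegerType
>>> id_field = PartitionField(3, 1001, bucket(IntegerType(), 4), "id")
>>> str(id_field)
'1001: id: bucket[4](3)'
>>> spec = PartitionSpec(table_schema_simple, 0, (id_field,), 1001)
>>> spec.is_unpartitioned()
False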

"""

@property
def name(self) -> str:
return self._name
schema: Schema
spec_id: int
fields: Tuple[PartitionField, ...]
last_assigned_field_id: int
source_id_to_fields_map: Dict[int, List[PartitionField]] = field(init=False, repr=False)

@property
def transform(self) -> Transform:
return self._transform
def __post_init__(self):
source_id_to_fields_map = dict()
for partition_field in self.fields:
source_column = self.schema.find_column_name(partition_field.source_id)
if not source_column:
raise ValueError(f"Cannot find source column: {partition_field.source_id}")
existing = source_id_to_fields_map.get(partition_field.source_id, [])
existing.append(partition_field)
source_id_to_fields_map[partition_field.source_id] = existing
object.__setattr__(self, "source_id_to_fields_map", source_id_to_fields_map)

def __eq__(self, other):
return (
self.field_id == other.field_id
and self.source_id == other.source_id
and self.name == other.name
and self.transform == other.transform
)
"""
Produce a boolean to return True if two objects are considered equal

Note:
Equality of PartitionSpec is determined by spec_id and partition fields only
"""
if not isinstance(other, PartitionSpec):
return False
return self.spec_id == other.spec_id and self.fields == other.fields

def __str__(self):
return f"{self.field_id}: {self.name}: {self.transform}({self.source_id})"
"""
Produce a human-readable string representation of PartitionSpec

def __repr__(self):
return f"PartitionField(field_id={self.field_id}, name={self.name}, transform={repr(self.transform)}, source_id={self.source_id})"
Note:
Only include list of partition fields in the PartitionSpec's string representation
"""
result_str = "["
if self.fields:
result_str += "\n " + "\n ".join([str(field) for field in self.fields]) + "\n"
result_str += "]"
return result_str

def is_unpartitioned(self) -> bool:
return len(self.fields) < 1

def fields_by_source_id(self, field_id: int) -> List[PartitionField]:
return self.source_id_to_fields_map[field_id]

def compatible_with(self, other: "PartitionSpec") -> bool:
"""
Produce a boolean to return True if two PartitionSpec are considered compatible
"""
return all(
this_field.source_id == that_field.source_id
and this_field.transform == that_field.transform
and this_field.name == that_field.name
for this_field, that_field in zip(self.fields, other.fields)
)
32 changes: 30 additions & 2 deletions python/tests/table/test_partitioning.py
@@ -15,7 +15,8 @@
# specific language governing permissions and limitations
# under the License.

from iceberg.table.partitioning import PartitionField
from iceberg.schema import Schema
from iceberg.table.partitioning import PartitionField, PartitionSpec
from iceberg.transforms import bucket
from iceberg.types import IntegerType

@@ -32,5 +33,32 @@ def test_partition_field_init():
assert str(partition_field) == "1000: id: bucket[100](3)"
assert (
repr(partition_field)
== "PartitionField(field_id=1000, name=id, transform=transforms.bucket(source_type=IntegerType(), num_buckets=100), source_id=3)"
== "PartitionField(source_id=3, field_id=1000, transform=transforms.bucket(source_type=IntegerType(), num_buckets=100), name='id')"
)


def test_partition_spec_init(table_schema_simple: Schema):
bucket_transform = bucket(IntegerType(), 4)
id_field1 = PartitionField(3, 1001, bucket_transform, "id")
partition_spec1 = PartitionSpec(table_schema_simple, 0, (id_field1,), 1001)

assert partition_spec1.spec_id == 0
assert partition_spec1.schema == table_schema_simple
assert partition_spec1 == partition_spec1
assert partition_spec1 != id_field1
assert str(partition_spec1) == f"[\n {str(id_field1)}\n]"
assert not partition_spec1.is_unpartitioned()
# only differ by PartitionField field_id
id_field2 = PartitionField(3, 1002, bucket_transform, "id")
partition_spec2 = PartitionSpec(table_schema_simple, 0, (id_field2,), 1001)
assert partition_spec1 != partition_spec2
assert partition_spec1.compatible_with(partition_spec2)
assert partition_spec1.fields_by_source_id(3) == [id_field1]


def test_unpartitioned(table_schema_simple: Schema):
unpartitioned = PartitionSpec(table_schema_simple, 1, tuple(), 1000)

assert not unpartitioned.fields
assert unpartitioned.is_unpartitioned()
assert str(unpartitioned) == "[]"