apache · slessard · May 8, 2024 · May 8, 2024 · May 8, 2024 · May 28, 2024
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java
@@ -50,7 +50,6 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) {
           "Number of rows in the vector %s didn't match expected %s ",
           numRowsInVector,
           numRowsToRead);
-      // Handle null vector for constant case
       columnVectors[i] = new ColumnVector(vectorHolders[i]);
     }
     return new ColumnarBatch(numRowsToRead, columnVectors);

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java
@@ -35,6 +35,7 @@
 import org.apache.arrow.vector.Float4Vector;
 import org.apache.arrow.vector.Float8Vector;
 import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.NullVector;
 import org.apache.arrow.vector.TimeMicroVector;
 import org.apache.arrow.vector.TimeStampMicroTZVector;
 import org.apache.arrow.vector.TimeStampMicroVector;
@@ -177,6 +178,7 @@ public ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> getVecto
   @SuppressWarnings("checkstyle:CyclomaticComplexity")
   private ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> getPlainVectorAccessor(
       FieldVector vector, PrimitiveType primitive) {
+    Preconditions.checkArgument(null != vector, "Invalid field vector: null");
     if (vector instanceof BitVector) {
       return new BooleanAccessor<>((BitVector) vector);
     } else if (vector instanceof IntVector) {
@@ -220,6 +222,8 @@ private ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> getPlai
       }
       return new FixedSizeBinaryAccessor<>(
           (FixedSizeBinaryVector) vector, stringFactorySupplier.get());
+    } else if (vector instanceof NullVector) {
+      return new NullAccessor<>((NullVector) vector);
     }
     throw new UnsupportedOperationException("Unsupported vector: " + vector.getClass());
   }
@@ -244,6 +248,15 @@ public final boolean getBoolean(int rowId) {
     }
   }
 
+  private static class NullAccessor<
+          DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable>
+      extends ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> {
+
+    NullAccessor(NullVector vector) {
+      super(vector);
+    }
+  }
+
   private static class IntAccessor<
           DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable>
       extends ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> {

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java
@@ -19,6 +19,7 @@
 package org.apache.iceberg.arrow.vectorized;
 
 import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.NullVector;
 import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.types.Type;
@@ -128,7 +129,7 @@ public static VectorHolder dummyHolder(int numRows) {
   }
 
   public boolean isDummy() {
-    return vector == null;
+    return vector == null || vector instanceof NullVector;
   }
 
   /**
@@ -140,12 +141,20 @@ public static class ConstantVectorHolder<T> extends VectorHolder {
     private final int numRows;
 
     public ConstantVectorHolder(int numRows) {
+      super(new NullVector("_dummy_", numRows), null, new NullabilityHolder(numRows));
+      nullabilityHolder().setNulls(0, numRows);
       this.numRows = numRows;
       this.constantValue = null;
     }
 
     public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) {
-      super(icebergField);
+      super(
+          (null == constantValue) ? new NullVector(icebergField.name(), numRows) : null,
+          icebergField,
+          new NullabilityHolder(numRows));
+      if (null == constantValue) {
+        nullabilityHolder().setNulls(0, numRows);
+      }
       this.numRows = numRows;
       this.constantValue = constantValue;
     }

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java
@@ -455,6 +455,10 @@ public static VectorizedArrowReader nulls() {
     return NullVectorReader.INSTANCE;
   }
 
+  public static VectorizedArrowReader nulls(Types.NestedField icebergField) {
+    return new NullVectorReader(icebergField);
+  }
+
   public static VectorizedArrowReader positions() {
     return new PositionVectorReader(false);
   }
@@ -464,11 +468,15 @@ public static VectorizedArrowReader positionsWithSetArrowValidityVector() {
   }
 
   private static final class NullVectorReader extends VectorizedArrowReader {
-    private static final NullVectorReader INSTANCE = new NullVectorReader();
+    private static final NullVectorReader INSTANCE = new NullVectorReader(null);
+
+    private NullVectorReader(Types.NestedField icebergField) {
+      super(icebergField);
+    }
 
     @Override
     public VectorHolder read(VectorHolder reuse, int numValsToRead) {
-      return VectorHolder.dummyHolder(numValsToRead);
+      return new VectorHolder.ConstantVectorHolder<>(icebergField(), numValsToRead, null);
     }
 
     @Override

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java
@@ -97,7 +97,7 @@ public VectorizedReader<?> message(
       } else if (reader != null) {
         reorderedFields.add(reader);
       } else {
-        reorderedFields.add(VectorizedArrowReader.nulls());
+        reorderedFields.add(VectorizedArrowReader.nulls(field));
       }
     }
     return vectorizedReader(reorderedFields);

diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java
@@ -49,6 +49,7 @@
 import org.apache.arrow.vector.Float4Vector;
 import org.apache.arrow.vector.Float8Vector;
 import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.NullVector;
 import org.apache.arrow.vector.TimeMicroVector;
 import org.apache.arrow.vector.TimeStampMicroTZVector;
 import org.apache.arrow.vector.TimeStampMicroVector;
@@ -59,6 +60,7 @@
 import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.iceberg.AppendFiles;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.DataFiles;
 import org.apache.iceberg.FileFormat;
@@ -70,6 +72,7 @@
 import org.apache.iceberg.StructLike;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableScan;
+import org.apache.iceberg.UpdateSchema;
 import org.apache.iceberg.data.GenericRecord;
 import org.apache.iceberg.data.Record;
 import org.apache.iceberg.data.parquet.GenericParquetWriter;
@@ -262,6 +265,142 @@ public void testReadColumnFilter2() throws Exception {
         scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ImmutableList.of("timestamp"));
   }
 
+  @Test
+  public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception {
+    rowsWritten = Lists.newArrayList();
+    tables = new HadoopTables();
+
+    List<Field> expectedFields =
+        ImmutableList.of(
+            new Field("a", new FieldType(false, MinorType.INT.getType(), null), null),
+            new Field("b", new FieldType(true, MinorType.INT.getType(), null), null),
+            new Field("z", new FieldType(true, MinorType.NULL.getType(), null), null));
+    org.apache.arrow.vector.types.pojo.Schema expectedSchema =
+        new org.apache.arrow.vector.types.pojo.Schema(expectedFields);
+
+    int batchSize = 1;
+    int expectedNumRowsPerBatch = 1;
+    int expectedTotalRows = 1;
+
+    Schema tableSchemaV1 =
+        new Schema(
+            Types.NestedField.required(1, "a", Types.IntegerType.get()),
+            Types.NestedField.optional(2, "b", Types.IntegerType.get()));
+
+    PartitionSpec spec = PartitionSpec.builderFor(tableSchemaV1).build();
+    Table table = tables.create(tableSchemaV1, spec, tableLocation);
+
+    // Add one record to the table
+    GenericRecord rec = GenericRecord.create(tableSchemaV1);
+    rec.setField("a", 1);
+    List<GenericRecord> genericRecords = Lists.newArrayList();
+    genericRecords.add(rec);
+
+    AppendFiles appendFiles = table.newAppend();
+    appendFiles.appendFile(writeParquetFile(table, genericRecords));
+    appendFiles.commit();
+
+    // Alter the table schema by adding a new, optional column.
+    // Do not add any data for this new column in the one existing row in the table
+    // and do not insert any new rows into the table.
+    UpdateSchema updateSchema = table.updateSchema().addColumn("z", Types.IntegerType.get());
+    updateSchema.apply();
+    updateSchema.commit();
+
+    // Select all columns, all rows from the table
+    TableScan scan = table.newScan().select("*");
+
+    Set<String> columns = ImmutableSet.of("a", "b", "z");
+    // Read the data and verify that the returned ColumnarBatches match expected rows.
+    try (VectorizedTableScanIterable itr =
+        new VectorizedTableScanIterable(scan, batchSize, false)) {
+      int rowIndex = 0;
+      for (ColumnarBatch batch : itr) {
+        List<GenericRecord> expectedRows =
+            rowsWritten.subList(rowIndex, rowIndex + expectedNumRowsPerBatch);
+        rowIndex++;
+
+        assertThat(batch.numRows()).isEqualTo(expectedNumRowsPerBatch);
+        assertThat(batch.numCols()).isEqualTo(columns.size());
+
+        checkColumnarArrayValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            batch,
+            0,
+            columns,
+            "a",
+            (records, i) -> records.get(i).getField("a"),
+            ColumnVector::getInt);
+        checkColumnarArrayValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            batch,
+            1,
+            columns,
+            "b",
+            (records, i) -> records.get(i).getField("b"),
+            (columnVector, i) -> columnVector.isNullAt(i) ? null : columnVector.getInt(i));
+        checkColumnarArrayValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            batch,
+            2,
+            columns,
+            "z",
+            (records, i) -> records.get(i).getField("z"),
+            (columnVector, i) -> columnVector.isNullAt(i) ? null : columnVector.getInt(i));
+      }
+    }
+
+    // Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows.
+    try (VectorizedTableScanIterable itr =
+        new VectorizedTableScanIterable(scan, batchSize, false)) {
+      int totalRows = 0;
+      int rowIndex = 0;
+      for (ColumnarBatch batch : itr) {
+        List<GenericRecord> expectedRows =
+            rowsWritten.subList(rowIndex, rowIndex + expectedNumRowsPerBatch);
+        rowIndex++;
+        VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors();
+        assertThat(root.getSchema()).isEqualTo(expectedSchema);
+
+        // check all vector types
+        assertThat(root.getVector("a").getClass()).isEqualTo(IntVector.class);
+        assertThat(root.getVector("b").getClass()).isEqualTo(IntVector.class);
+        assertThat(root.getVector("z").getClass()).isEqualTo(NullVector.class);
+
+        checkVectorValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            root,
+            columns,
+            "a",
+            (records, i) -> records.get(i).getField("a"),
+            (vector, i) -> ((IntVector) vector).get(i));
+        checkVectorValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            root,
+            columns,
+            "b",
+            (records, i) -> records.get(i).getField("b"),
+            (vector, i) -> vector.isNull(i) ? null : ((IntVector) vector).get(i));
+        checkVectorValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            root,
+            columns,
+            "z",
+            (records, i) -> records.get(i).getField("z"),
+            (vector, i) -> vector.getObject(i));
+
+        totalRows += root.getRowCount();
+        assertThat(totalRows).isEqualTo(expectedTotalRows);
+      }
+    }
+  }
+
   /**
    * The test asserts that {@link CloseableIterator#hasNext()} returned by the {@link ArrowReader}
    * is idempotent.