[SPARK-31937][SQL] Support processing ArrayType/MapType/StructType data using no-serde mode script transform #30957

Closed. Wants to merge 25 commits.

Changes from 1 commit

Commits (25)
adc9ded  [SPARK-31937][SQL] Support processing array and map type using spark … (AngersZhuuuu, Dec 29, 2020)
6a7438b  Update CatalystTypeConverters.scala (AngersZhuuuu, Dec 29, 2020)
d3b9cec  fix failed UT (AngersZhuuuu, Dec 29, 2020)
fdd5225  Update SparkScriptTransformationSuite.scala (AngersZhuuuu, Dec 29, 2020)
aa16c8f  Update BaseScriptTransformationSuite.scala (AngersZhuuuu, Dec 29, 2020)
092c927  Update BaseScriptTransformationExec.scala (AngersZhuuuu, Dec 29, 2020)
9761c0e  Merge branch 'master' into SPARK-31937 (AngersZhuuuu, Dec 29, 2020)
28ad7fa  Update BaseScriptTransformationSuite.scala (AngersZhuuuu, Dec 29, 2020)
9ac75fc  Merge branch 'master' into SPARK-31937 (AngersZhuuuu, Jan 4, 2021)
33d8b5b  Update BaseScriptTransformationExec.scala (AngersZhuuuu, Jan 4, 2021)
63f07eb  follow comment (AngersZhuuuu, Feb 4, 2021)
b631b70  Update BaseScriptTransformationExec.scala (AngersZhuuuu, Feb 4, 2021)
b7e7f92  follow comment (AngersZhuuuu, Feb 5, 2021)
8dec5a1  follow comment (AngersZhuuuu, Feb 5, 2021)
529d54d  Update BaseScriptTransformationExec.scala (AngersZhuuuu, Feb 6, 2021)
4f0e78f  Avoid construct JsonToStructs repeated (AngersZhuuuu, Feb 6, 2021)
ed8c54c  remove unused UT (AngersZhuuuu, Feb 6, 2021)
520f4b8  Update sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScr… (AngersZhuuuu, Apr 16, 2021)
97f9d58  Merge branch 'master' into SPARK-31937 (AngersZhuuuu, Apr 16, 2021)
b5a4268  [SPARK-35097][SQL] Add column name to SparkUpgradeException about anc… (AngersZhuuuu, Apr 18, 2021)
76a746e  Revert "[SPARK-35097][SQL] Add column name to SparkUpgradeException a… (AngersZhuuuu, Apr 18, 2021)
6aa05fc  fix UT (AngersZhuuuu, Apr 19, 2021)
9e3f808  Revert "fix UT" (AngersZhuuuu, Apr 19, 2021)
3f51d27  fix UT (AngersZhuuuu, Apr 19, 2021)
adf8a66  Update sql-migration-guide.md (AngersZhuuuu, Apr 19, 2021)
@@ -62,10 +62,10 @@ private[sql] class AvroDeserializer(
private lazy val decimalConversions = new DecimalConversion()

private val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
datetimeRebaseMode, "Avro")
datetimeRebaseMode, "Avro")(_, _)
Member:

@AngersZhuuuu, seems like this commit was sneaked in mistakenly in this PR

Contributor Author:

> @AngersZhuuuu, seems like this commit was sneaked in mistakenly in this PR

Yeah, while working on another PR I forgot to switch branches.

Contributor Author:

Fixed

private val timestampRebaseFunc = DataSourceUtils.creteTimestampRebaseFuncInRead(
datetimeRebaseMode, "Avro")
datetimeRebaseMode, "Avro")(_, _)

private val converter: Any => Option[Any] = try {
rootCatalystType match {
@@ -126,7 +126,8 @@ private[sql] class AvroDeserializer(
updater.setInt(ordinal, value.asInstanceOf[Int])

case (INT, DateType) => (updater, ordinal, value) =>
updater.setInt(ordinal, dateRebaseFunc(value.asInstanceOf[Int]))
updater.setInt(ordinal,
dateRebaseFunc(avroType.getName, catalystType)(value.asInstanceOf[Int]))

case (LONG, LongType) => (updater, ordinal, value) =>
updater.setLong(ordinal, value.asInstanceOf[Long])
@@ -137,10 +138,10 @@ private[sql] class AvroDeserializer(
case null | _: TimestampMillis => (updater, ordinal, value) =>
val millis = value.asInstanceOf[Long]
val micros = DateTimeUtils.millisToMicros(millis)
updater.setLong(ordinal, timestampRebaseFunc(micros))
updater.setLong(ordinal, timestampRebaseFunc(avroType.getName, catalystType)(micros))
case _: TimestampMicros => (updater, ordinal, value) =>
val micros = value.asInstanceOf[Long]
updater.setLong(ordinal, timestampRebaseFunc(micros))
updater.setLong(ordinal, timestampRebaseFunc(avroType.getName, catalystType)(micros))
case other => throw new IncompatibleSchemaException(errorPrefix +
s"Avro logical type $other cannot be converted to SQL type ${TimestampType.sql}.")
}
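For readers skimming the two hunks above: the change makes the rebase helpers curried, so the column name and Catalyst type are bound once per column and the resulting per-value function is reused inside the converter. A minimal, self-contained sketch of that shape (the object name, simplified signatures, and string-based types are assumptions for illustration, not Spark's actual DataSourceUtils API):

```scala
import java.time.LocalDate

// Simplified stand-in for Spark's rebase helpers; the real
// DataSourceUtils.creteDateRebaseFuncInRead has a different signature and
// performs an actual Julian-to-Gregorian rebase of the value.
object RebaseSketch {
  private val gregorianSwitchDay: Long = LocalDate.of(1582, 10, 15).toEpochDay

  // Curried: bind (mode, format) once, then (colName, dataType) per column,
  // then call the resulting Int => Int for every value in that column.
  def createDateRebaseFuncInRead(mode: String, format: String): (String, String) => Int => Int =
    (colName, dataType) => julianDays => {
      if (mode == "EXCEPTION" && julianDays < gregorianSwitchDay) {
        throw new IllegalStateException(
          s"Ambiguous pre-1582-10-15 date in $format column `$colName` of type `$dataType`")
      }
      julianDays // the real helper rebases the value onto the Proleptic Gregorian calendar
    }

  def main(args: Array[String]): Unit = {
    val dateRebaseFunc = createDateRebaseFuncInRead("EXCEPTION", "Avro")
    val rebaseEventDate = dateRebaseFunc("event_date", "DateType") // bound once per column
    println(rebaseEventDate(LocalDate.of(2021, 1, 1).toEpochDay.toInt)) // applied per value
  }
}
```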
@@ -394,11 +394,16 @@ object QueryExecutionErrors {
}

def sparkUpgradeInReadingDatesError(
format: String, config: String, option: String): SparkUpgradeException = {
colName: String,
dataType: DataType,
format: String,
config: String,
option: String): SparkUpgradeException = {
new SparkUpgradeException("3.0",
s"""
|reading dates before 1582-10-15 or timestamps before 1900-01-01T00:00:00Z from $format
|files can be ambiguous, as the files may be written by Spark 2.x or legacy versions of
|files can be ambiguous when read column `${colName}` of datatype `${dataType}`,
|as the files may be written by Spark 2.x or legacy versions of
|Hive, which uses a legacy hybrid calendar that is different from Spark 3.0+'s Proleptic
|Gregorian calendar. See more details in SPARK-31404. You can set the SQL config
|'$config' or the datasource option '$option' to 'LEGACY' to rebase the datetime values
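The hunk above widens sparkUpgradeInReadingDatesError so the message can point at the offending column. A rough, hypothetical sketch of the resulting message shape with the new parameters (the wording paraphrases the diff; the sample column name and placeholder config keys are made up):

```scala
// Hypothetical helper that mirrors the new parameter list; not Spark's actual method.
object UpgradeMessageSketch {
  def sparkUpgradeInReadingDatesMessage(
      colName: String,
      dataType: String,
      format: String,
      config: String,
      option: String): String =
    s"""Reading dates before 1582-10-15 or timestamps before 1900-01-01T00:00:00Z from $format
       |files can be ambiguous when reading column `$colName` of type `$dataType`, because the
       |files may have been written by Spark 2.x or legacy Hive using a hybrid calendar.
       |Set the SQL config '$config' or the datasource option '$option' to 'LEGACY' to rebase
       |the datetime values.""".stripMargin

  def main(args: Array[String]): Unit =
    println(sparkUpgradeInReadingDatesMessage(
      "event_ts", "timestamp", "Parquet", "<rebase-mode SQL config>", "<datasource option>"))
}
```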
@@ -167,15 +167,15 @@ public void initBatch(
for (int i = 0; i < requiredFields.length; i++) {
DataType dt = requiredFields[i].dataType();
if (requestedPartitionColIds[i] != -1) {
OnHeapColumnVector partitionCol = new OnHeapColumnVector(capacity, dt);
OnHeapColumnVector partitionCol = new OnHeapColumnVector(capacity, requiredFields[i].name(),dt);
ColumnVectorUtils.populate(partitionCol, partitionValues, requestedPartitionColIds[i]);
partitionCol.setIsConstant();
orcVectorWrappers[i] = partitionCol;
} else {
int colId = requestedDataColIds[i];
// Initialize the missing columns once.
if (colId == -1) {
OnHeapColumnVector missingCol = new OnHeapColumnVector(capacity, dt);
OnHeapColumnVector missingCol = new OnHeapColumnVector(capacity, requiredFields[i].name(), dt);
missingCol.putNulls(0, capacity);
missingCol.setIsConstant();
orcVectorWrappers[i] = missingCol;
@@ -190,10 +190,13 @@ private boolean isLazyDecodingSupported(PrimitiveType.PrimitiveTypeName typeName
return isSupported;
}

static int rebaseDays(int julianDays, final boolean failIfRebase) {
static int rebaseDays(
int julianDays,
final boolean failIfRebase,
WritableColumnVector c) {
if (failIfRebase) {
if (julianDays < RebaseDateTime.lastSwitchJulianDay()) {
throw DataSourceUtils.newRebaseExceptionInRead("Parquet");
throw DataSourceUtils.newRebaseExceptionInRead(c.colName, c.dataType(), "Parquet");
} else {
return julianDays;
}
@@ -205,10 +208,11 @@ static int rebaseDays(int julianDays, final boolean failIfRebase) {
private static long rebaseTimestamp(
long julianMicros,
final boolean failIfRebase,
WritableColumnVector c,
final String format) {
if (failIfRebase) {
if (julianMicros < RebaseDateTime.lastSwitchJulianTs()) {
throw DataSourceUtils.newRebaseExceptionInRead(format);
throw DataSourceUtils.newRebaseExceptionInRead(c.colName, c.dataType(), format);
} else {
return julianMicros;
}
@@ -217,12 +221,18 @@ private static long rebaseTimestamp(
}
}

static long rebaseMicros(long julianMicros, final boolean failIfRebase) {
return rebaseTimestamp(julianMicros, failIfRebase, "Parquet");
static long rebaseMicros(
long julianMicros,
final boolean failIfRebase,
WritableColumnVector c) {
return rebaseTimestamp(julianMicros, failIfRebase, c, "Parquet");
}

static long rebaseInt96(long julianMicros, final boolean failIfRebase) {
return rebaseTimestamp(julianMicros, failIfRebase, "Parquet INT96");
static long rebaseInt96(
long julianMicros,
final boolean failIfRebase,
WritableColumnVector c) {
return rebaseTimestamp(julianMicros, failIfRebase, c, "Parquet INT96");
}

/**
@@ -387,7 +397,7 @@ private void decodeDictionaryIds(
for (int i = rowId; i < rowId + num; ++i) {
if (!column.isNullAt(i)) {
int julianDays = dictionary.decodeToInt(dictionaryIds.getDictId(i));
column.putInt(i, rebaseDays(julianDays, failIfRebase));
column.putInt(i, rebaseDays(julianDays, failIfRebase, column));
}
}
} else {
@@ -432,7 +442,7 @@ private void decodeDictionaryIds(
if (!column.isNullAt(i)) {
long julianMillis = dictionary.decodeToLong(dictionaryIds.getDictId(i));
long julianMicros = DateTimeUtils.millisToMicros(julianMillis);
column.putLong(i, rebaseMicros(julianMicros, failIfRebase));
column.putLong(i, rebaseMicros(julianMicros, failIfRebase, column));
}
}
}
@@ -441,7 +451,7 @@ private void decodeDictionaryIds(
for (int i = rowId; i < rowId + num; ++i) {
if (!column.isNullAt(i)) {
long julianMicros = dictionary.decodeToLong(dictionaryIds.getDictId(i));
column.putLong(i, rebaseMicros(julianMicros, failIfRebase));
column.putLong(i, rebaseMicros(julianMicros, failIfRebase, column));
}
}
} else {
@@ -480,7 +490,7 @@ private void decodeDictionaryIds(
if (!column.isNullAt(i)) {
Binary v = dictionary.decodeToBinary(dictionaryIds.getDictId(i));
long julianMicros = ParquetRowConverter.binaryToSQLTimestamp(v);
long gregorianMicros = rebaseInt96(julianMicros, failIfRebase);
long gregorianMicros = rebaseInt96(julianMicros, failIfRebase, column);
column.putLong(i, gregorianMicros);
}
}
@@ -500,7 +510,7 @@ private void decodeDictionaryIds(
if (!column.isNullAt(i)) {
Binary v = dictionary.decodeToBinary(dictionaryIds.getDictId(i));
long julianMicros = ParquetRowConverter.binaryToSQLTimestamp(v);
long gregorianMicros = rebaseInt96(julianMicros, failIfRebase);
long gregorianMicros = rebaseInt96(julianMicros, failIfRebase, column);
long adjTime = DateTimeUtils.convertTz(gregorianMicros, convertTz, UTC);
column.putLong(i, adjTime);
}
@@ -640,7 +650,7 @@ private void readLongBatch(int rowId, int num, WritableColumnVector column) thro
for (int i = 0; i < num; i++) {
if (defColumn.readInteger() == maxDefLevel) {
long julianMicros = DateTimeUtils.millisToMicros(dataColumn.readLong());
column.putLong(rowId + i, rebaseMicros(julianMicros, failIfRebase));
column.putLong(rowId + i, rebaseMicros(julianMicros, failIfRebase, column));
} else {
column.putNull(rowId + i);
}
@@ -698,7 +708,7 @@ private void readBinaryBatch(int rowId, int num, WritableColumnVector column) th
if (defColumn.readInteger() == maxDefLevel) {
// Read 12 bytes for INT96
long julianMicros = ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12));
long gregorianMicros = rebaseInt96(julianMicros, failIfRebase);
long gregorianMicros = rebaseInt96(julianMicros, failIfRebase, column);
column.putLong(rowId + i, gregorianMicros);
} else {
column.putNull(rowId + i);
@@ -722,7 +732,7 @@ private void readBinaryBatch(int rowId, int num, WritableColumnVector column) th
if (defColumn.readInteger() == maxDefLevel) {
// Read 12 bytes for INT96
long julianMicros = ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12));
long gregorianMicros = rebaseInt96(julianMicros, failIfRebase);
long gregorianMicros = rebaseInt96(julianMicros, failIfRebase, column);
long adjTime = DateTimeUtils.convertTz(gregorianMicros, convertTz, UTC);
column.putLong(rowId + i, adjTime);
} else {
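In the vectorized reader hunks above, the static rebase helpers gain a WritableColumnVector argument solely to carry error context. A compact sketch of the idea with simplified stand-in types (ColumnCtx and the switch-day constant are assumptions; the real code uses WritableColumnVector and RebaseDateTime):

```scala
import java.time.LocalDate

object ParquetRebaseSketch {
  // Stand-in for the column vector: only the error context matters here.
  final case class ColumnCtx(colName: String, dataType: String)

  // Rough stand-in for RebaseDateTime.lastSwitchJulianDay(); assumed for illustration.
  private val lastSwitchDay: Long = LocalDate.of(1582, 10, 15).toEpochDay

  def rebaseDays(julianDays: Int, failIfRebase: Boolean, c: ColumnCtx): Int =
    if (failIfRebase && julianDays < lastSwitchDay) {
      // The threaded-through column context lets the error name the failing column.
      throw new IllegalStateException(
        s"Ancient date in Parquet column `${c.colName}` of type ${c.dataType}")
    } else {
      julianDays // the real helper rebases Julian days onto the Proleptic Gregorian calendar
    }

  def main(args: Array[String]): Unit = {
    val ctx = ColumnCtx("event_date", "date")
    // Mirrors the dictionary-decoding loop: rebase each decoded day value for this column.
    println(Seq(0, 18000, 18628).map(d => rebaseDays(d, failIfRebase = true, c = ctx)))
  }
}
```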
@@ -107,7 +107,7 @@ public final void readIntegersWithRebase(
}
if (rebase) {
if (failIfRebase) {
throw DataSourceUtils.newRebaseExceptionInRead("Parquet");
throw DataSourceUtils.newRebaseExceptionInRead(c.colName, c.dataType(), "Parquet");
} else {
for (int i = 0; i < total; i += 1) {
c.putInt(rowId + i, RebaseDateTime.rebaseJulianToGregorianDays(buffer.getInt()));
@@ -164,7 +164,7 @@ public final void readLongsWithRebase(
}
if (rebase) {
if (failIfRebase) {
throw DataSourceUtils.newRebaseExceptionInRead("Parquet");
throw DataSourceUtils.newRebaseExceptionInRead(c.colName, c.dataType(), "Parquet");
} else {
for (int i = 0; i < total; i += 1) {
c.putLong(rowId + i, RebaseDateTime.rebaseJulianToGregorianMicros(buffer.getLong()));
@@ -264,7 +264,8 @@ public void readIntegersWithRebase(
for (int i = 0; i < n; ++i) {
if (currentBuffer[currentBufferIdx++] == level) {
int julianDays = data.readInteger();
c.putInt(rowId + i, VectorizedColumnReader.rebaseDays(julianDays, failIfRebase));
c.putInt(rowId + i,
VectorizedColumnReader.rebaseDays(julianDays, failIfRebase, c));
} else {
c.putNull(rowId + i);
}
@@ -492,7 +493,7 @@ public void readLongsWithRebase(
for (int i = 0; i < n; ++i) {
if (currentBuffer[currentBufferIdx++] == level) {
long julianMicros = data.readLong();
c.putLong(rowId + i, VectorizedColumnReader.rebaseMicros(julianMicros, failIfRebase));
c.putLong(rowId + i, VectorizedColumnReader.rebaseMicros(julianMicros, failIfRebase, c));
} else {
c.putNull(rowId + i);
}
@@ -50,7 +50,7 @@ public static OffHeapColumnVector[] allocateColumns(int capacity, StructType sch
public static OffHeapColumnVector[] allocateColumns(int capacity, StructField[] fields) {
OffHeapColumnVector[] vectors = new OffHeapColumnVector[fields.length];
for (int i = 0; i < fields.length; i++) {
vectors[i] = new OffHeapColumnVector(capacity, fields[i].dataType());
vectors[i] = new OffHeapColumnVector(capacity, fields[i].name(), fields[i].dataType());
}
return vectors;
}
@@ -64,8 +64,8 @@ public static OffHeapColumnVector[] allocateColumns(int capacity, StructField[]
private long lengthData;
private long offsetData;

public OffHeapColumnVector(int capacity, DataType type) {
super(capacity, type);
public OffHeapColumnVector(int capacity, String colName, DataType type) {
super(capacity, colName, type);

nulls = 0;
data = 0;
@@ -566,7 +566,7 @@ protected void reserveInternal(int newCapacity) {
}

@Override
protected OffHeapColumnVector reserveNewColumn(int capacity, DataType type) {
return new OffHeapColumnVector(capacity, type);
protected OffHeapColumnVector reserveNewColumn(int capacity, String colName, DataType type) {
return new OffHeapColumnVector(capacity, colName, type);
}
}
@@ -50,7 +50,7 @@ public static OnHeapColumnVector[] allocateColumns(int capacity, StructType sche
public static OnHeapColumnVector[] allocateColumns(int capacity, StructField[] fields) {
OnHeapColumnVector[] vectors = new OnHeapColumnVector[fields.length];
for (int i = 0; i < fields.length; i++) {
vectors[i] = new OnHeapColumnVector(capacity, fields[i].dataType());
vectors[i] = new OnHeapColumnVector(capacity, fields[i].name(), fields[i].dataType());
}
return vectors;
}
@@ -73,8 +73,8 @@ public static OnHeapColumnVector[] allocateColumns(int capacity, StructField[] f
private int[] arrayLengths;
private int[] arrayOffsets;

public OnHeapColumnVector(int capacity, DataType type) {
super(capacity, type);
public OnHeapColumnVector(int capacity, String colName, DataType type) {
super(capacity, colName, type);

reserveInternal(capacity);
reset();
@@ -580,7 +580,7 @@ protected void reserveInternal(int newCapacity) {
}

@Override
protected OnHeapColumnVector reserveNewColumn(int capacity, DataType type) {
return new OnHeapColumnVector(capacity, type);
protected OnHeapColumnVector reserveNewColumn(int capacity, String colName, DataType type) {
return new OnHeapColumnVector(capacity, colName, type);
}
}
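Since both vector implementations now require a name, every allocation site has to pass one through. A small usage sketch against the constructors shown in this diff (the capacity and schema below are made up for illustration):

```scala
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types._

object NamedVectorSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("id", LongType),
      StructField("event_date", DateType)))

    // Mirrors allocateColumns(capacity, fields): one named vector per field.
    val vectors: Array[OnHeapColumnVector] = schema.fields.map { f =>
      new OnHeapColumnVector(4096, f.name, f.dataType)
    }

    // The name travels with the vector, so later rebase errors can report it.
    vectors.foreach(v => println(s"${v.colName}: ${v.dataType()}"))
  }
}
```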
@@ -165,7 +165,7 @@ public void setDictionary(Dictionary dictionary) {
*/
public WritableColumnVector reserveDictionaryIds(int capacity) {
if (dictionaryIds == null) {
dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType);
dictionaryIds = reserveNewColumn(capacity, colName, DataTypes.IntegerType);
} else {
dictionaryIds.reset();
dictionaryIds.reserve(capacity);
@@ -677,6 +677,11 @@ public WritableColumnVector arrayData() {
*/
public final void setIsConstant() { isConstant = true; }

/**
* Column name of this column.
*/
public String colName;

/**
* Maximum number of rows that can be stored in this column.
*/
@@ -717,7 +722,7 @@ public WritableColumnVector arrayData() {
/**
* Reserve a new column.
*/
protected abstract WritableColumnVector reserveNewColumn(int capacity, DataType type);
protected abstract WritableColumnVector reserveNewColumn(int capacity, String colName, DataType type);

protected boolean isArray() {
return type instanceof ArrayType || type instanceof BinaryType || type instanceof StringType ||
@@ -728,8 +733,9 @@ protected boolean isArray() {
* Sets up the common state and also handles creating the child columns if this is a nested
* type.
*/
protected WritableColumnVector(int capacity, DataType type) {
protected WritableColumnVector(int capacity, String colName, DataType type) {
super(type);
this.colName = colName;
this.capacity = capacity;

if (isArray()) {
Expand All @@ -742,24 +748,25 @@ protected WritableColumnVector(int capacity, DataType type) {
childCapacity *= DEFAULT_ARRAY_LENGTH;
}
this.childColumns = new WritableColumnVector[1];
this.childColumns[0] = reserveNewColumn(childCapacity, childType);
this.childColumns[0] = reserveNewColumn(childCapacity, colName + ".elem", childType);
} else if (type instanceof StructType) {
StructType st = (StructType)type;
this.childColumns = new WritableColumnVector[st.fields().length];
for (int i = 0; i < childColumns.length; ++i) {
this.childColumns[i] = reserveNewColumn(capacity, st.fields()[i].dataType());
this.childColumns[i] = reserveNewColumn(capacity, colName + "." + st.fields()[i].name(),
st.fields()[i].dataType());
}
} else if (type instanceof MapType) {
MapType mapType = (MapType) type;
this.childColumns = new WritableColumnVector[2];
this.childColumns[0] = reserveNewColumn(capacity, mapType.keyType());
this.childColumns[1] = reserveNewColumn(capacity, mapType.valueType());
this.childColumns[0] = reserveNewColumn(capacity, colName + ".key", mapType.keyType());
this.childColumns[1] = reserveNewColumn(capacity, colName + ".value", mapType.valueType());
} else if (type instanceof CalendarIntervalType) {
// Three columns. Months as int. Days as Int. Microseconds as Long.
this.childColumns = new WritableColumnVector[3];
this.childColumns[0] = reserveNewColumn(capacity, DataTypes.IntegerType);
this.childColumns[1] = reserveNewColumn(capacity, DataTypes.IntegerType);
this.childColumns[2] = reserveNewColumn(capacity, DataTypes.LongType);
this.childColumns[0] = reserveNewColumn(capacity, colName + ".months", DataTypes.IntegerType);
this.childColumns[1] = reserveNewColumn(capacity, colName + ".days", DataTypes.IntegerType);
this.childColumns[2] = reserveNewColumn(capacity, colName + ".microseconds", DataTypes.LongType);
} else {
this.childColumns = null;
}
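The constructor above derives child column names from the parent, so a nested column yields dotted names such as `m.key`, `m.value`, or `arr.elem`, which is the name the enriched rebase error would report for a nested column. A stand-alone sketch of that naming scheme (names only, no actual vectors; CalendarIntervalType children are omitted for brevity):

```scala
import org.apache.spark.sql.types._

object ChildNameSketch {
  // Mirrors the naming convention in WritableColumnVector's constructor:
  // array element -> <col>.elem, struct field -> <col>.<field>, map -> <col>.key / <col>.value.
  def childNames(colName: String, dt: DataType): Seq[String] = dt match {
    case ArrayType(et, _) =>
      s"$colName.elem" +: childNames(s"$colName.elem", et)
    case StructType(fields) =>
      fields.toSeq.flatMap { f =>
        val child = s"$colName.${f.name}"
        child +: childNames(child, f.dataType)
      }
    case MapType(kt, vt, _) =>
      (s"$colName.key" +: childNames(s"$colName.key", kt)) ++
        (s"$colName.value" +: childNames(s"$colName.value", vt))
    case _ => Seq.empty
  }

  def main(args: Array[String]): Unit = {
    // Prints: List(m.key, m.value, m.value.elem)
    println(childNames("m", MapType(StringType, ArrayType(DateType))))
  }
}
```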