[SPARK-47616][SQL] Add User Document for Mapping Spark SQL Data Types from MySQL

### What changes were proposed in this pull request?

This PR adds a user document for mapping Spark SQL data types from MySQL. The write-side document is not included yet, as it may need further verification.

### Why are the changes needed?

The conversion of data types from MySQL to Spark SQL is now solid, and end users can refer to this document for it.

It also gives maintainers an overall perspective for reviewing the changes in the coming 4.0.0 release.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

Added tests for previously uncovered MySQL data types.

![image](https://github.com/apache/spark/assets/8326978/da8eb241-1d05-4a01-b038-d7d329193674)

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#45736 from yaooqinn/SPARK-47616.

Authored-by: Kent Yao <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
yaooqinn authored and dongjoon-hyun committed Mar 27, 2024
1 parent d10dbaa commit 8d1539f
Showing 2 changed files with 262 additions and 0 deletions.
@@ -90,6 +90,10 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite {
conn.prepareStatement("CREATE TABLE collections (" +
"a SET('cap', 'hat', 'helmet'), b ENUM('S', 'M', 'L', 'XL'))").executeUpdate()
conn.prepareStatement("INSERT INTO collections VALUES ('cap,hat', 'M')").executeUpdate()

conn.prepareStatement("CREATE TABLE TBL_GEOMETRY (col0 GEOMETRY)").executeUpdate()
conn.prepareStatement("INSERT INTO TBL_GEOMETRY VALUES (ST_GeomFromText('POINT(0 0)'))")
  .executeUpdate()
}

def testConnection(): Unit = {
@@ -191,6 +195,12 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite {
assert(rows(0).getAs[Timestamp](3).equals(Timestamp.valueOf("2009-02-13 23:31:30")))
assert(rows(0).getAs[Date](4).equals(Date.valueOf("2001-01-01")))
}
val df = spark.read.format("jdbc")
  .option("url", jdbcUrl)
  .option("query", "select yr from dates")
  .option("yearIsDateType", false)
  .load()
checkAnswer(df, Row(2001))
}

test("SPARK-47406: MySQL datetime types with preferTimestampNTZ") {
@@ -318,6 +328,12 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite {
Row("cap,hat", "M") :: Row("cap,hat", "M") :: Nil)
}
}

test("SPARK-47616: Read GEOMETRY from MySQL") {
val df = spark.read.jdbc(jdbcUrl, "TBL_GEOMETRY", new Properties)
checkAnswer(df,
  Row(Array[Byte](0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)))
}
}
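The byte array asserted in the GEOMETRY test above is MySQL's internal geometry encoding: a 4-byte SRID header followed by standard WKB (byte order, geometry type, then coordinates). A minimal Python sketch that parses such a value (illustrative only; it assumes the single-POINT layout seen in the test, not a general geometry parser):

```python
import struct

def parse_mysql_point(blob: bytes):
    """Parse MySQL's internal geometry value for a POINT:
    4-byte SRID, then WKB (1-byte byte order, 4-byte type, two doubles)."""
    srid = struct.unpack_from("<I", blob, 0)[0]
    order = "<" if blob[4] == 1 else ">"   # 1 = little-endian WKB
    geom_type = struct.unpack_from(order + "I", blob, 5)[0]
    assert geom_type == 1, "not a POINT"
    x, y = struct.unpack_from(order + "dd", blob, 9)
    return srid, x, y

# The 25-byte value the test expects for ST_GeomFromText('POINT(0 0)'):
blob = bytes([0, 0, 0, 0, 1, 1, 0, 0, 0] + [0] * 16)
print(parse_mysql_point(blob))  # (0, 0.0, 0.0)
```

This also explains why the mapping table below reads GEOMETRY as BinaryType: Spark surfaces the raw bytes and leaves geometry decoding to the caller.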


246 changes: 246 additions & 0 deletions docs/sql-data-sources-jdbc.md
@@ -427,3 +427,249 @@ SELECT * FROM resultTable

</div>
</div>

## Data Type Mapping

### Mapping Spark SQL Data Types from MySQL

The table below describes the data type conversions from MySQL data types to Spark SQL data types
when reading data from a MySQL table using the built-in JDBC data source with MySQL Connector/J
as the activated JDBC driver. Note that other JDBC drivers that can also connect to MySQL, such as
MariaDB Connector/J, may apply different mapping rules.
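Several rows in the table depend on JDBC reader options. As a rough illustration, the option-sensitive rows can be modeled as follows (a simplified sketch, not Spark's actual resolution logic; the option names `tinyInt1isBit`, `preferTimestampNTZ`, and `yearIsDateType` are real, while the helper itself is hypothetical):

```python
def mysql_to_spark(mysql_type: str,
                   tiny_int1_is_bit: bool = True,
                   prefer_timestamp_ntz: bool = False,
                   year_is_date_type: bool = True) -> str:
    """Model a few option-sensitive rows of the mapping table, with
    each option at its documented default value."""
    if mysql_type in ("TINYINT(1)", "BOOLEAN"):
        return "BooleanType" if tiny_int1_is_bit else "ByteType"
    if mysql_type in ("DATETIME", "TIME"):
        return "TimestampNTZType" if prefer_timestamp_ntz else "TimestampType"
    if mysql_type == "YEAR":
        return "DateType" if year_is_date_type else "IntegerType"
    raise ValueError(f"not modeled here: {mysql_type}")

print(mysql_to_spark("YEAR", year_is_date_type=False))  # IntegerType
```

The defaults above mirror the "(default)" annotations in the Remarks column; flipping an option selects the alternative row for the same MySQL type.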

<table>
<thead>
<tr>
<th><b>MySQL Data Type</b></th>
<th><b>Spark SQL Data Type</b></th>
<th><b>Remarks</b></th>
</tr>
</thead>
<tbody>
<tr>
<td>BIT(1)</td>
<td>BooleanType</td>
<td></td>
</tr>
<tr>
<td>BIT(&gt;1)</td>
<td>LongType</td>
<td></td>
</tr>
<tr>
<td>TINYINT(1)</td>
<td>BooleanType</td>
<td></td>
</tr>
<tr>
<td>TINYINT(1)</td>
<td>ByteType</td>
<td>tinyInt1isBit=false</td>
</tr>
<tr>
<td>BOOLEAN</td>
<td>BooleanType</td>
<td></td>
</tr>
<tr>
<td>BOOLEAN</td>
<td>ByteType</td>
<td>tinyInt1isBit=false</td>
</tr>
<tr>
<td>TINYINT(&gt;1)</td>
<td>ByteType</td>
<td></td>
</tr>
<tr>
<td>TINYINT(any) UNSIGNED</td>
<td>ShortType</td>
<td></td>
</tr>
<tr>
<td>SMALLINT</td>
<td>ShortType</td>
<td></td>
</tr>
<tr>
<td>SMALLINT UNSIGNED</td>
<td>IntegerType</td>
<td></td>
</tr>
<tr>
<td>MEDIUMINT [UNSIGNED]</td>
<td>IntegerType</td>
<td></td>
</tr>
<tr>
<td>INT</td>
<td>IntegerType</td>
<td></td>
</tr>
<tr>
<td>INT UNSIGNED</td>
<td>LongType</td>
<td></td>
</tr>
<tr>
<td>BIGINT</td>
<td>LongType</td>
<td></td>
</tr>
<tr>
<td>BIGINT UNSIGNED</td>
<td>DecimalType(20,0)</td>
<td></td>
</tr>
<tr>
<td>FLOAT</td>
<td>FloatType</td>
<td></td>
</tr>
<tr>
<td>FLOAT UNSIGNED</td>
<td>DoubleType</td>
<td></td>
</tr>
<tr>
<td>DOUBLE [UNSIGNED]</td>
<td>DoubleType</td>
<td></td>
</tr>
<tr>
<td>DECIMAL(p,s) [UNSIGNED]</td>
<td>DecimalType(min(38, p), min(18, s))</td>
<td>The column type is bounded by DecimalType(38, 18); if the actual precision of a value exceeds 38, reading fails with DECIMAL_PRECISION_EXCEEDS_MAX_PRECISION.</td>
</tr>
<tr>
<td>DATE</td>
<td>DateType</td>
<td></td>
</tr>
<tr>
<td>DATETIME</td>
<td>TimestampType</td>
<td>preferTimestampNTZ=false (default) or spark.sql.timestampType=TIMESTAMP_LTZ</td>
</tr>
<tr>
<td>DATETIME</td>
<td>TimestampNTZType</td>
<td>preferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ</td>
</tr>
<tr>
<td>TIMESTAMP</td>
<td>TimestampType</td>
<td></td>
</tr>
<tr>
<td>TIME</td>
<td>TimestampType</td>
<td>preferTimestampNTZ=false (default) or spark.sql.timestampType=TIMESTAMP_LTZ</td>
</tr>
<tr>
<td>TIME</td>
<td>TimestampNTZType</td>
<td>preferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ</td>
</tr>
<tr>
<td>YEAR</td>
<td>DateType</td>
<td>yearIsDateType=true</td>
</tr>
<tr>
<td>YEAR</td>
<td>IntegerType</td>
<td>yearIsDateType=false</td>
</tr>
<tr>
<td>CHAR(n)</td>
<td>CharType(n)</td>
<td></td>
</tr>
<tr>
<td>VARCHAR(n)</td>
<td>VarcharType(n)</td>
<td></td>
</tr>
<tr>
<td>BINARY(n)</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>VARBINARY(n)</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>CHAR(n) BINARY</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>VARCHAR(n) BINARY</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>BLOB</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>TINYBLOB</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>MEDIUMBLOB</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>LONGBLOB</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>TEXT</td>
<td>StringType</td>
<td></td>
</tr>
<tr>
<td>TINYTEXT</td>
<td>StringType</td>
<td></td>
</tr>
<tr>
<td>MEDIUMTEXT</td>
<td>StringType</td>
<td></td>
</tr>
<tr>
<td>LONGTEXT</td>
<td>StringType</td>
<td></td>
</tr>
<tr>
<td>JSON</td>
<td>StringType</td>
<td></td>
</tr>
<tr>
<td>GEOMETRY</td>
<td>BinaryType</td>
<td></td>
</tr>
<tr>
<td>ENUM</td>
<td>CharType(n)</td>
<td></td>
</tr>
<tr>
<td>SET</td>
<td>CharType(n)</td>
<td></td>
</tr>
</tbody>
</table>
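The DECIMAL bounding rule in the table can be sketched as a small helper (illustrative only, not Spark's actual code; the function name is hypothetical):

```python
def bounded_decimal(p: int, s: int):
    """Model the documented mapping: MySQL DECIMAL(p, s) is read as
    Spark DecimalType(min(38, p), min(18, s))."""
    return (min(38, p), min(18, s))

# A MySQL DECIMAL(65, 30) column is therefore read as DecimalType(38, 18);
# values needing more precision than 38 digits would fail the read with
# DECIMAL_PRECISION_EXCEEDS_MAX_PRECISION.
print(bounded_decimal(65, 30))  # (38, 18)
print(bounded_decimal(10, 2))   # (10, 2)
```

Note that the bound applies to the column type as a whole, so a wide MySQL DECIMAL column can fail at read time even if only a few rows carry out-of-range values.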
