[SPARK-49139][SQL] Enable collations by default
### What changes were proposed in this pull request?
Enable collations by default by removing the `COLLATION_ENABLED` SQL conf (`spark.sql.collation.enabled`) together with the checks that gated collation features behind it.

### Why are the changes needed?
Collations are ready to be enabled by default, so the development-time feature flag `spark.sql.collation.enabled` is no longer needed.

### Does this PR introduce _any_ user-facing change?
Yes. Collations are now enabled by default; users no longer need to set `spark.sql.collation.enabled` manually (the flag itself is removed).
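
For illustration, a minimal sketch of the new out-of-the-box behavior (assuming a build that includes this change and a local `SparkSession`; previously each of these statements required `SET spark.sql.collation.enabled=true` first):

```scala
import org.apache.spark.sql.SparkSession

object CollationsByDefault extends App {
  val spark = SparkSession.builder()
    .master("local[*]")
    .appName("collations-by-default")
    .getOrCreate()

  // No feature flag needed anymore: collation expressions resolve directly.
  spark.sql("SELECT COLLATION('Spark SQL' COLLATE UTF8_LCASE)").show() // UTF8_LCASE
  spark.sql("SELECT 'a' = 'A' COLLATE UTF8_LCASE").show()              // true (case-insensitive)

  spark.stop()
}
```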

### How was this patch tested?
Existing tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #47650 from uros-db/enable-collations.

Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
uros-db authored and cloud-fan committed Aug 8, 2024
1 parent a713a66 commit 43d881a
Showing 10 changed files with 1 addition and 98 deletions.
@@ -36,24 +36,14 @@ import org.apache.spark.sql.types._
""",
examples = """
Examples:
> SET spark.sql.collation.enabled=true;
spark.sql.collation.enabled true
> SELECT COLLATION('Spark SQL' _FUNC_ UTF8_LCASE);
UTF8_LCASE
> SET spark.sql.collation.enabled=false;
spark.sql.collation.enabled false
""",
since = "4.0.0",
group = "string_funcs")
// scalastyle:on line.contains.tab
object CollateExpressionBuilder extends ExpressionBuilder {
  override def build(funcName: String, expressions: Seq[Expression]): Expression = {
    // We need to throw collationNotEnabledError before unexpectedNullError
    // and nonFoldableArgumentError, as we do not want the user to see misleading
    // messages suggesting that collation is enabled
    if (!SQLConf.get.collationEnabled) {
      throw QueryCompilationErrors.collationNotEnabledError()
    }
    expressions match {
      case Seq(e: Expression, collationExpr: Expression) =>
        (collationExpr.dataType, collationExpr.foldable) match {
@@ -107,12 +97,8 @@ case class Collate(child: Expression, collationName: String)
""",
examples = """
Examples:
> SET spark.sql.collation.enabled=true;
spark.sql.collation.enabled true
> SELECT _FUNC_('Spark SQL');
UTF8_BINARY
> SET spark.sql.collation.enabled=false;
spark.sql.collation.enabled false
""",
since = "4.0.0",
group = "string_funcs")
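With the flag gone, the doc examples above lose their `SET` statements and reduce to the plain function calls. A hedged sketch of the equivalent queries (assuming a `SparkSession` in scope as `spark`):

```scala
// collation() reports the collation of its argument; no SET statement required.
spark.sql("SELECT collation('Spark SQL')").show()                        // UTF8_BINARY (the default)
spark.sql("SELECT collation(collate('Spark SQL', 'UTF8_LCASE'))").show() // UTF8_LCASE
```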
@@ -2378,9 +2378,6 @@ class AstBuilder extends DataTypeAstBuilder
  }

  override def visitCollateClause(ctx: CollateClauseContext): String = withOrigin(ctx) {
    if (!SQLConf.get.collationEnabled) {
      throw QueryCompilationErrors.collationNotEnabledError()
    }
    ctx.identifier.getText
  }

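Since `visitCollateClause` no longer consults the conf, a `COLLATE` clause parses unconditionally. A small sketch, again assuming an ambient `spark` session:

```scala
// Column-level collation in DDL now works with no extra configuration.
spark.sql("CREATE TABLE t(col STRING COLLATE UNICODE_CI) USING parquet")
spark.sql("DESCRIBE TABLE t").show(truncate = false) // col's data type carries the collation
```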
@@ -351,12 +351,6 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat
    )
  }

  def collationNotEnabledError(): Throwable = {
    new AnalysisException(
      errorClass = "UNSUPPORTED_FEATURE.COLLATION",
      messageParameters = Map.empty)
  }

  def unresolvedUsingColForJoinError(
      colName: String, suggestion: String, side: String): Throwable = {
    new AnalysisException(
@@ -759,14 +759,6 @@ object SQLConf {
      .checkValue(_ > 0, "The initial number of partitions must be positive.")
      .createOptional

  lazy val COLLATION_ENABLED =
    buildConf("spark.sql.collation.enabled")
      .doc("Collations feature is under development and its use should be done under this " +
        "feature flag.")
      .version("4.0.0")
      .booleanConf
      .createWithDefault(Utils.isTesting)

  val DEFAULT_COLLATION =
    buildConf(SqlApiConfHelper.DEFAULT_COLLATION)
      .doc("Sets default collation to use for string literals, parameter markers or the string" +
@@ -5413,8 +5405,6 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
    }
  }

  def collationEnabled: Boolean = getConf(COLLATION_ENABLED)

  override def defaultStringType: StringType = {
    if (getConf(DEFAULT_COLLATION).toUpperCase(Locale.ROOT) == "UTF8_BINARY") {
      StringType
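Only the development flag is removed; `DEFAULT_COLLATION` (the `spark.sql.session.collation.default` key, as the test suite below spells out) remains the knob for the session default. A hedged usage sketch with an ambient `spark` session:

```scala
// Controls the collation applied to plain string literals in this session.
spark.conf.set("spark.sql.session.collation.default", "UNICODE_CI")
spark.sql("SELECT collation('abc')").show() // UNICODE_CI
spark.conf.set("spark.sql.session.collation.default", "UTF8_BINARY") // back to the default
```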
@@ -150,9 +150,6 @@ class SparkSqlAstBuilder extends AstBuilder {
   * }}}
   */
  override def visitSetCollation(ctx: SetCollationContext): LogicalPlan = withOrigin(ctx) {
    if (!SQLConf.get.collationEnabled) {
      throw QueryCompilationErrors.collationNotEnabledError()
    }
    val key = SQLConf.DEFAULT_COLLATION.key
    SetCommand(Some(key -> Some(ctx.identifier.getText.toUpperCase(Locale.ROOT))))
  }
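As the code above shows, `SET COLLATION` now parses unconditionally and rewrites to a `SetCommand` on `SQLConf.DEFAULT_COLLATION`. A brief sketch (ambient `spark` session assumed):

```scala
// Equivalent to setting spark.sql.session.collation.default directly.
spark.sql("SET COLLATION UNICODE_CI")
spark.sql("SELECT collation('abc')").show() // UNICODE_CI
```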
@@ -26,7 +26,7 @@ import org.apache.spark.SparkIllegalArgumentException
import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.catalog._
import org.apache.spark.sql.catalyst.expressions.{Collate, Collation, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering}
import org.apache.spark.sql.catalyst.expressions.{Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
@@ -37,7 +37,6 @@ import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.execution.command.ViewHelper.generateViewProperties
import org.apache.spark.sql.execution.datasources.{CreateTable => CreateTableV1}
import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.InsertableRelation
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.util.PartitioningUtils.normalizePartitionSpec
@@ -641,25 +640,6 @@ case class QualifyLocationWithWarehouse(catalog: SessionCatalog) extends Rule[Lo
  }
}

object CollationCheck extends (LogicalPlan => Unit) {
  def apply(plan: LogicalPlan): Unit = {
    plan.foreach {
      case operator: LogicalPlan =>
        operator.expressions.foreach(_.foreach(
          e =>
            if (isCollationExpression(e) && !SQLConf.get.collationEnabled) {
              throw QueryCompilationErrors.collationNotEnabledError()
            }
          )
        )
    }
  }

  private def isCollationExpression(expression: Expression): Boolean =
    expression.isInstanceOf[Collation] || expression.isInstanceOf[Collate]
}


/**
 * This rule checks for references to views WITH SCHEMA [TYPE] EVOLUTION and synchronizes the
 * catalog if evolution was detected.
@@ -223,7 +223,6 @@ abstract class BaseSessionStateBuilder(
      HiveOnlyCheck +:
      TableCapabilityCheck +:
      CommandCheck +:
      CollationCheck +:
      ViewSyncSchemaToMetaStore +:
      customCheckRules
  }
@@ -878,37 +878,6 @@ class QueryCompilationErrorsSuite
"className" -> "org.apache.spark.sql.catalyst.expressions.UnsafeRow"))
}

test("SPARK-47102: the collation feature is off without collate builder call") {
withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") {
Seq(
"CREATE TABLE t(col STRING COLLATE UNICODE_CI) USING parquet",
"CREATE TABLE t(col STRING COLLATE UNKNOWN_COLLATION_STRING) USING parquet",
"SELECT 'aaa' COLLATE UNICODE_CI",
"select collation('aaa')"
).foreach { sqlText =>
checkError(
exception = intercept[AnalysisException](sql(sqlText)),
errorClass = "UNSUPPORTED_FEATURE.COLLATION")
}
}
}

test("SPARK-47102: the collation feature is off with collate builder call") {
withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") {
Seq(
"SELECT collate('aaa', 'UNICODE_CI')",
"SELECT collate('aaa', 'UNKNOWN_COLLATION_STRING')"
).foreach { sqlText =>
checkError(
exception = intercept[AnalysisException](sql(sqlText)),
errorClass = "UNSUPPORTED_FEATURE.COLLATION",
parameters = Map.empty,
context = ExpectedContext(
fragment = sqlText.substring(7), start = 7, stop = sqlText.length - 1))
}
}
}

test("INTERNAL_ERROR: Convert unsupported data type from Spark to Parquet") {
val converter = new SparkToParquetSchemaConverter
val dummyDataType = new DataType {
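The deleted negative tests have no replacement in this diff ("Existing tests" cover the default-on path). For illustration only, a hypothetical positive counterpart, assuming a `QueryTest` suite with `SharedSparkSession` like the one above and `org.apache.spark.sql.Row` in scope:

```scala
// Hypothetical test: collation expressions analyze with no conf set at all.
test("collations are available by default") {
  checkAnswer(
    sql("SELECT collation('aaa' COLLATE UNICODE_CI)"),
    Row("UNICODE_CI"))
}
```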
@@ -517,14 +517,6 @@ class SQLConfSuite extends QueryTest with SharedSparkSession {
"confName" -> "spark.sql.session.collation.default",
"proposals" -> "UNICODE"
))

withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") {
checkError(
exception = intercept[AnalysisException](sql(s"SET COLLATION UNICODE_CI")),
errorClass = "UNSUPPORTED_FEATURE.COLLATION",
parameters = Map.empty
)
}
}

test("SPARK-43028: config not found error") {
@@ -113,7 +113,6 @@ class HiveSessionStateBuilder(
      PreReadCheck +:
      TableCapabilityCheck +:
      CommandCheck +:
      CollationCheck +:
      ViewSyncSchemaToMetaStore +:
      customCheckRules
  }