diff --git a/build.sbt b/build.sbt index ee3bb6b3f..ac0fafde7 100644 --- a/build.sbt +++ b/build.sbt @@ -127,6 +127,9 @@ ThisBuild / Test / fork := true ThisBuild / Test / javaOptions += "-Xmx1024m" +// Needed to test both non-codegen and codegen parts of expressions +ThisBuild / Test / envVars += "SPARK_TESTING" -> "1" + ThisBuild / coverageExcludedPackages := "com\\.fasterxml.*;com\\.microsoft\\.hyperspace\\.shim" /** diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/ExprMatcher.scala b/src/main/scala-spark2/com/microsoft/hyperspace/shim/FirstNullSafe.scala similarity index 67% rename from src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/ExprMatcher.scala rename to src/main/scala-spark2/com/microsoft/hyperspace/shim/FirstNullSafe.scala index 48b804ead..5d2bece53 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/ExprMatcher.scala +++ b/src/main/scala-spark2/com/microsoft/hyperspace/shim/FirstNullSafe.scala @@ -14,15 +14,11 @@ * limitations under the License. */ -package com.microsoft.hyperspace.index.dataskipping.util +package com.microsoft.hyperspace.shim -import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId} +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate.First -trait ExprMatcher { - - /** - * Returns true if the given expression matches the expression this matcher - * is associated with. - */ - def apply(e: Expression): Boolean +object FirstNullSafe { + def apply(child: Expression): First = First(child, Literal(false)) } diff --git a/src/main/scala-spark3/com/microsoft/hyperspace/shim/FirstNullSafe.scala b/src/main/scala-spark3/com/microsoft/hyperspace/shim/FirstNullSafe.scala new file mode 100644 index 000000000..26c76a3fa --- /dev/null +++ b/src/main/scala-spark3/com/microsoft/hyperspace/shim/FirstNullSafe.scala @@ -0,0 +1,24 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.microsoft.hyperspace.shim + +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.aggregate.First + +object FirstNullSafe { + def apply(child: Expression): First = First(child, false) +} diff --git a/src/main/scala/com/microsoft/hyperspace/Hyperspace.scala b/src/main/scala/com/microsoft/hyperspace/Hyperspace.scala index a6c883f88..c55191744 100644 --- a/src/main/scala/com/microsoft/hyperspace/Hyperspace.scala +++ b/src/main/scala/com/microsoft/hyperspace/Hyperspace.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import com.microsoft.hyperspace.index._ import com.microsoft.hyperspace.index.IndexConstants.{OPTIMIZE_MODE_QUICK, REFRESH_MODE_FULL} import com.microsoft.hyperspace.index.plananalysis.{CandidateIndexAnalyzer, PlanAnalyzer} -import com.microsoft.hyperspace.index.rules.ApplyHyperspace +import com.microsoft.hyperspace.index.rules.ApplyHyperspace.withHyperspaceRuleDisabled import com.microsoft.hyperspace.index.sources.FileBasedSourceProviderManager class Hyperspace(spark: SparkSession) { @@ -189,15 +189,6 @@ class Hyperspace(spark: SparkSession) { } } } - - private def withHyperspaceRuleDisabled(f: => Unit): Unit = { - try { - ApplyHyperspace.disableForIndexMaintenance.set(true) - f - } finally { - ApplyHyperspace.disableForIndexMaintenance.set(false) - } - } } object Hyperspace extends ActiveSparkSession { diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexConstants.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexConstants.scala index ff3dcea3d..b415fb548 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/IndexConstants.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/IndexConstants.scala @@ -128,4 +128,22 @@ object IndexConstants { val DATASKIPPING_TARGET_INDEX_DATA_FILE_SIZE = "spark.hyperspace.index.dataskipping.targetIndexDataFileSize" val DATASKIPPING_TARGET_INDEX_DATA_FILE_SIZE_DEFAULT = "268435456" // 256 MiB + + /** + * Maximum number of index data files. + * + * The number of index data files determined by targetIndexFileSize is + * capped by this value. + */ + val DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT = + "spark.hyperspace.index.dataskipping.maxIndexDataFileCount" + val DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT_DEFAULT = "10000" + + /** + * If set to true, partition sketches for partition columns are included when + * creating data skipping indexes. This does not affect existing indexes. + */ + val DATASKIPPING_AUTO_PARTITION_SKETCH = + "spark.hyperspace.index.dataskipping.autoPartitionSketch" + val DATASKIPPING_AUTO_PARTITION_SKETCH_DEFAULT = "true" } diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntryTags.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntryTags.scala index ec9e071bb..e73648257 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntryTags.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntryTags.scala @@ -16,7 +16,8 @@ package com.microsoft.hyperspace.index -import org.apache.spark.sql.execution.datasources.InMemoryFileIndex +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.execution.datasources.{FileIndex, InMemoryFileIndex} import com.microsoft.hyperspace.index.plananalysis.FilterReason @@ -68,4 +69,17 @@ object IndexLogEntryTags { // If it's enabled, FILTER_REASONS and APPLIED_INDEX_RULES info will be tagged. 
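For context, the two configuration keys introduced above are read at index creation time: maxIndexDataFileCount caps the file count derived from targetIndexDataFileSize, and autoPartitionSketch controls whether a PartitionSketch is added automatically for partitioned sources. A minimal usage sketch follows; it is illustrative only (not part of this diff), and the DataSkippingIndexConfig constructor shape, dataset path, and column name are assumed:

import com.microsoft.hyperspace.Hyperspace
import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndexConfig
import com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch

val hs = new Hyperspace(spark)
// Cap the number of index data files regardless of targetIndexDataFileSize.
spark.conf.set("spark.hyperspace.index.dataskipping.maxIndexDataFileCount", "1000")
// Keep the automatic partition sketch for partitioned sources (default: true).
spark.conf.set("spark.hyperspace.index.dataskipping.autoPartitionSketch", "true")
// Hypothetical dataset path and indexed column.
val events = spark.read.parquet("/data/events")
hs.createIndex(events, DataSkippingIndexConfig("eventsMinMax", MinMaxSketch("eventTime")))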
   val INDEX_PLAN_ANALYSIS_ENABLED: IndexLogEntryTag[Boolean] =
     IndexLogEntryTag[Boolean]("indexPlanAnalysisEnabled")
+
+  // DATASKIPPING_INDEX_PREDICATE stores the index predicate translated
+  // from the plan's filter or join condition.
+  val DATASKIPPING_INDEX_PREDICATE: IndexLogEntryTag[Option[Expression]] =
+    IndexLogEntryTag[Option[Expression]]("dataskippingIndexPredicate")
+
+  // DATASKIPPING_INDEX_FILEINDEX stores InMemoryFileIndex for the index data.
+  val DATASKIPPING_INDEX_FILEINDEX: IndexLogEntryTag[InMemoryFileIndex] =
+    IndexLogEntryTag[InMemoryFileIndex]("dataskippingIndexRelation")
+
+  // DATASKIPPING_SOURCE_FILEINDEX stores the FileIndex for the source data.
+  val DATASKIPPING_SOURCE_FILEINDEX: IndexLogEntryTag[FileIndex] =
+    IndexLogEntryTag[FileIndex]("dataskippingSourceRelation")
 }
diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexUtils.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexUtils.scala
index 24cdcbf27..17c68c27f 100644
--- a/src/main/scala/com/microsoft/hyperspace/index/IndexUtils.scala
+++ b/src/main/scala/com/microsoft/hyperspace/index/IndexUtils.scala
@@ -18,6 +18,7 @@ package com.microsoft.hyperspace.index
 
 import java.net.URLDecoder
 
+import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions.udf
 
@@ -64,4 +65,12 @@ object IndexUtils {
    */
   lazy val decodeInputFileName = udf(
     (p: String) => URLDecoder.decode(p.replace("+", "%2B"), "UTF-8"))
+
+  /**
+   * Returns the path part of the URI-like string.
+   *
+   * This can be used to compare the results of input_file_name() and the paths
+   * stored in FileIdTracker.
+   */
+  lazy val getPath = udf((p: String) => new Path(p).toUri.getPath)
 }
diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndex.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndex.scala
index f9db362b1..990ba73b5 100644
--- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndex.scala
+++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndex.scala
@@ -16,13 +16,20 @@
 
 package com.microsoft.hyperspace.index.dataskipping
 
-import org.apache.spark.sql.{Column, DataFrame, SaveMode}
-import org.apache.spark.sql.functions.{input_file_name, min, spark_partition_id}
+import scala.collection.mutable
+
+import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession}
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.functions.input_file_name
+import org.apache.spark.sql.hyperspace.utils.StructTypeUtils
+import org.apache.spark.sql.types.StructType
 
-import com.microsoft.hyperspace.HyperspaceException
 import com.microsoft.hyperspace.index._
-import com.microsoft.hyperspace.index.dataskipping.sketch.Sketch
-import com.microsoft.hyperspace.index.dataskipping.util.{DataFrameUtils, ExpressionUtils}
+import com.microsoft.hyperspace.index.dataskipping.expressions._
+import com.microsoft.hyperspace.index.dataskipping.sketches.{PartitionSketch, Sketch}
+import com.microsoft.hyperspace.index.dataskipping.util.DataFrameUtils
 import com.microsoft.hyperspace.util.HyperspaceConf
 
 /**
@@ -30,10 +37,12 @@ import com.microsoft.hyperspace.util.HyperspaceConf
  * DataSkippingIndex is an index that can be used to skip reading unnecessary
  * files in relations using sketches.
* * @param sketches List of sketches for this index + * @param schema Index data schema * @param properties Properties for this index; see [[Index.properties]] for details. */ case class DataSkippingIndex( sketches: Seq[Sketch], + schema: StructType, override val properties: Map[String, String] = Map.empty) extends Index { assert(sketches.nonEmpty, "At least one sketch is required.") @@ -61,7 +70,8 @@ case class DataSkippingIndex( } override def optimize(ctx: IndexerContext, indexDataFilesToOptimize: Seq[FileInfo]): Unit = { - val indexData = ctx.spark.read.parquet(indexDataFilesToOptimize.map(_.name): _*) + val indexData = + ctx.spark.read.schema(schema).parquet(indexDataFilesToOptimize.map(_.name): _*) writeImpl(ctx, indexData, SaveMode.Overwrite) } @@ -71,7 +81,10 @@ case class DataSkippingIndex( deletedSourceDataFiles: Seq[FileInfo], indexContent: Content): (Index, Index.UpdateMode) = { if (appendedSourceData.nonEmpty) { - writeImpl(ctx, index(ctx, appendedSourceData.get), SaveMode.Overwrite) + writeImpl( + ctx, + DataSkippingIndex.createIndexData(ctx, sketches, appendedSourceData.get), + SaveMode.Overwrite) } if (deletedSourceDataFiles.nonEmpty) { val spark = ctx.spark @@ -98,23 +111,188 @@ case class DataSkippingIndex( override def refreshFull( ctx: IndexerContext, sourceData: DataFrame): (DataSkippingIndex, DataFrame) = { - val updatedIndex = copy(sketches = ExpressionUtils.resolve(ctx.spark, sketches, sourceData)) - (updatedIndex, updatedIndex.index(ctx, sourceData)) + val resolvedSketches = ExpressionUtils.resolve(ctx.spark, sketches, sourceData) + val indexData = DataSkippingIndex.createIndexData(ctx, resolvedSketches, sourceData) + val updatedIndex = copy(sketches = resolvedSketches, schema = indexData.schema) + (updatedIndex, indexData) } override def equals(that: Any): Boolean = that match { - case DataSkippingIndex(thatSketches, _) => sketches.toSet == thatSketches.toSet + case DataSkippingIndex(thatSketches, thatSchema, _) => + sketches.toSet == thatSketches.toSet && schema == thatSchema case _ => false } override def hashCode: Int = sketches.map(_.hashCode).sum + /** + * Translate the given filter/join condition for the source data to a + * predicate that can be used to filter out unnecessary source data files + * when applied to index data. + * + * For example, a filter condition "A = 1" can be translated into an index + * predicate "Min_A <= 1 && Max_A >= 1" to filter out files which cannot + * satisfy the condition for any rows in the file. + * + * It is assumed that the condition is in negation normal form. If it is not, + * then it may fail to translate the condition which would have been possible + * otherwise. This is a valid assumption for Spark 2.4 and later. + */ + def translateFilterCondition( + spark: SparkSession, + condition: Expression, + source: LogicalPlan): Option[Expression] = { + val resolvedExprs = + ExpressionUtils.getResolvedExprs(spark, sketches, source).getOrElse { return None } + val predMap = buildPredicateMap(condition, source, resolvedExprs) + + // Create a single index predicate for a single source predicate node, + // by combining individual index predicates with And. + // True is returned if there are no index predicates for the source predicate node. + def toIndexPred(sourcePred: Expression): Expression = { + predMap.get(sourcePred).map(_.reduceLeft(And)).getOrElse(Literal.TrueLiteral) + } + + // Compose an index predicate visiting the source predicate tree recursively. 
+ def composeIndexPred(sourcePred: Expression): Expression = + sourcePred match { + case and: And => And(toIndexPred(and), and.mapChildren(composeIndexPred)) + case or: Or => And(toIndexPred(or), or.mapChildren(composeIndexPred)) + case leaf => toIndexPred(leaf) + } + + val indexPredicate = composeIndexPred(condition) + + // Apply constant folding to get the final predicate. + // This is a trimmed down version of the BooleanSimplification rule. + // It's just enough to determine whether the index is applicable or not. + val optimizePredicate: PartialFunction[Expression, Expression] = { + case And(Literal.TrueLiteral, right) => right + case And(left, Literal.TrueLiteral) => left + case Or(Literal.TrueLiteral, _) => Literal.TrueLiteral + case Or(_, Literal.TrueLiteral) => Literal.TrueLiteral + } + val optimizedIndexPredicate = indexPredicate.transformUp(optimizePredicate) + + // Return None if the index predicate is True - meaning no conversion can be done. + if (optimizedIndexPredicate == Literal.TrueLiteral) { + None + } else { + Some(optimizedIndexPredicate) + } + } + + private def writeImpl(ctx: IndexerContext, indexData: DataFrame, writeMode: SaveMode): Unit = { + // require instead of assert, as the condition can potentially be broken by + // code which is external to dataskipping. + require( + indexData.schema.sameType(schema), + "Schema of the index data doesn't match the index schema: " + + s"index data schema = ${indexData.schema.toDDL}, index schema = ${schema.toDDL}") + indexData.cache() + indexData.count() // force cache + val indexDataSize = DataFrameUtils.getSizeInBytes(indexData) + val targetIndexDataFileSize = HyperspaceConf.DataSkipping.targetIndexDataFileSize(ctx.spark) + val maxIndexDataFileCount = HyperspaceConf.DataSkipping.maxIndexDataFileCount(ctx.spark) + val numFiles = { + val n = indexDataSize / targetIndexDataFileSize + math.min(math.max(1, n), maxIndexDataFileCount).toInt + } + val repartitionedIndexData = indexData.repartition(numFiles) + repartitionedIndexData.write.mode(writeMode).parquet(ctx.indexDataPath.toString) + indexData.unpersist() + } + + /** + * Collects index predicates for each node in the source predicate. 
+ */ + private def buildPredicateMap( + predicate: Expression, + source: LogicalPlan, + resolvedExprs: Map[Sketch, Seq[Expression]]) + : scala.collection.Map[Expression, Seq[Expression]] = { + val predMap = mutable.Map[Expression, mutable.Buffer[Expression]]() + val sketchesWithIndex = sketches.zipWithIndex + val nameMap = source.output.map(attr => attr.exprId -> attr.name).toMap + val attrMap = buildAttrMap(predicate, resolvedExprs, nameMap) + val valueExtractor = AttrValueExtractor(attrMap) + def updatePredMap(sourcePred: Expression): Unit = { + val indexPreds = sketchesWithIndex.flatMap { + case (sketch, idx) => + sketch.convertPredicate( + sourcePred, + resolvedExprs(sketch), + aggrNames(idx).map(UnresolvedAttribute.quoted), + nameMap, + valueExtractor) + } + if (indexPreds.nonEmpty) { + predMap.getOrElseUpdate(sourcePred, mutable.Buffer.empty) ++= indexPreds + } + } + def forEachTerm(p: Expression, f: Expression => Unit): Unit = { + f(p) + p match { + case And(_, _) | Or(_, _) => p.children.foreach(forEachTerm(_, f)) + case _ => + } + } + forEachTerm(predicate, updatePredMap) + predMap + } + + private def buildAttrMap( + predicate: Expression, + resolvedExprs: Map[Sketch, Seq[Expression]], + nameMap: Map[ExprId, String]): Map[Attribute, Expression] = { + val partitionSketchIdx = sketches.indexWhere(_.isInstanceOf[PartitionSketch]) + if (partitionSketchIdx != -1) { + val partitionSketch = sketches(partitionSketchIdx) + val sketchValues = aggrNames(partitionSketchIdx).map(UnresolvedAttribute.quoted) + val exprExtractors = resolvedExprs(partitionSketch).map(NormalizedExprExtractor(_, nameMap)) + val exprsAndValues = exprExtractors.zip(sketchValues) + predicate.references + .flatMap(a => exprsAndValues.find(_._1.unapply(a).isDefined).map(a -> _._2)) + .toMap + } else { + Map.empty + } + } + + private def aggrNames(i: Int): Seq[String] = { + aggregateFunctions + .slice(sketchOffsets(i), sketchOffsets(i + 1)) + .map(_.expr.asInstanceOf[NamedExpression].name) + } + + /** + * Sketch offsets are used to map each sketch to its corresponding columns + * in the dataframe. + */ + @transient + private lazy val sketchOffsets: Seq[Int] = + sketches.map(_.aggregateFunctions.length).scanLeft(0)(_ + _) + + @transient + private lazy val aggregateFunctions = DataSkippingIndex.getNamedAggregateFunctions(sketches) +} + +object DataSkippingIndex { + // $COVERAGE-OFF$ https://github.com/scoverage/scalac-scoverage-plugin/issues/125 + final val kind = "DataSkippingIndex" + final val kindAbbr = "DS" + // $COVERAGE-ON$ + /** * Creates index data for the given source data. 
*/ - def index(ctx: IndexerContext, sourceData: DataFrame): DataFrame = { + def createIndexData( + ctx: IndexerContext, + sketches: Seq[Sketch], + sourceData: DataFrame): DataFrame = { val fileNameCol = "input_file_name" + val aggregateFunctions = getNamedAggregateFunctions(sketches) val indexDataWithFileName = sourceData .groupBy(input_file_name().as(fileNameCol)) .agg(aggregateFunctions.head, aggregateFunctions.tail: _*) @@ -124,33 +302,28 @@ case class DataSkippingIndex( val relation = RelationUtils.getRelation(spark, sourceData.queryExecution.optimizedPlan) import spark.implicits._ val fileIdDf = ctx.fileIdTracker - .getIdToFileMapping(relation.pathNormalizer) + .getIdToFileMapping() .toDF(IndexConstants.DATA_FILE_NAME_ID, fileNameCol) indexDataWithFileName .join( fileIdDf.hint("broadcast"), - IndexUtils.decodeInputFileName(indexDataWithFileName(fileNameCol)) === - fileIdDf(fileNameCol)) + IndexUtils.getPath(IndexUtils.decodeInputFileName(indexDataWithFileName(fileNameCol))) === + IndexUtils.getPath(fileIdDf(fileNameCol))) .select( IndexConstants.DATA_FILE_NAME_ID, indexDataWithFileName.columns.filterNot(_ == fileNameCol).map(c => s"`$c`"): _*) } - private def writeImpl(ctx: IndexerContext, indexData: DataFrame, writeMode: SaveMode): Unit = { - indexData.cache() - indexData.count() // force cache - val indexDataSize = DataFrameUtils.getSizeInBytes(indexData) - val targetIndexDataFileSize = HyperspaceConf.DataSkipping.targetIndexDataFileSize(ctx.spark) - val numFiles = indexDataSize / targetIndexDataFileSize - if (!numFiles.isValidInt) { - throw HyperspaceException( - "Could not create index data files due to too many files: " + - s"indexDataSize=$indexDataSize, targetIndexDataFileSize=$targetIndexDataFileSize") + def getNamedAggregateFunctions(sketches: Seq[Sketch]): Seq[Column] = { + sketches.flatMap { s => + val aggrs = s.aggregateFunctions + assert(aggrs.nonEmpty) + aggrs.zipWithIndex.map { + case (aggr, idx) => + new Column(aggr).as(getNormalizeColumnName(s"${s}_$idx")) + } } - val repartitionedIndexData = indexData.repartition(math.max(1, numFiles.toInt)) - repartitionedIndexData.write.mode(writeMode).parquet(ctx.indexDataPath.toString) - indexData.unpersist() } /** @@ -159,21 +332,4 @@ case class DataSkippingIndex( private def getNormalizeColumnName(name: String): String = { name.replaceAll("[ ,;{}()\n\t=]", "_") } - - @transient - private lazy val aggregateFunctions = sketches.flatMap { s => - val aggrs = s.aggregateFunctions - assert(aggrs.nonEmpty) - aggrs.zipWithIndex.map { - case (aggr, idx) => - new Column(aggr).as(getNormalizeColumnName(s"${s}_$idx")) - } - } -} - -object DataSkippingIndex { - // $COVERAGE-OFF$ https://github.com/scoverage/scalac-scoverage-plugin/issues/125 - final val kind = "DataSkippingIndex" - final val kindAbbr = "DS" - // $COVERAGE-ON$ } diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfig.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfig.scala index d10372461..558cafecc 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfig.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfig.scala @@ -16,14 +16,13 @@ package com.microsoft.hyperspace.index.dataskipping -import scala.collection.mutable +import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.DataFrame - -import com.microsoft.hyperspace.HyperspaceException +import com.microsoft.hyperspace.{Hyperspace, 
HyperspaceException} import com.microsoft.hyperspace.index.{IndexConfigTrait, IndexerContext} -import com.microsoft.hyperspace.index.dataskipping.sketch.Sketch -import com.microsoft.hyperspace.index.dataskipping.util.ExpressionUtils +import com.microsoft.hyperspace.index.dataskipping.expressions.ExpressionUtils +import com.microsoft.hyperspace.index.dataskipping.sketches.{PartitionSketch, Sketch} +import com.microsoft.hyperspace.util.HyperspaceConf /** * DataSkippingIndexConfig is used to create a [[DataSkippingIndex]] via @@ -59,9 +58,29 @@ case class DataSkippingIndexConfig( sourceData: DataFrame, properties: Map[String, String]): (DataSkippingIndex, DataFrame) = { val resolvedSketches = ExpressionUtils.resolve(ctx.spark, sketches, sourceData) - checkDuplicateSketches(resolvedSketches) - val index = DataSkippingIndex(resolvedSketches, properties) - (index, index.index(ctx, sourceData)) + val autoPartitionSketch = HyperspaceConf.DataSkipping.autoPartitionSketch(ctx.spark) + val partitionSketchOpt = + if (autoPartitionSketch) getPartitionSketch(ctx.spark, sourceData) + else None + val finalSketches = partitionSketchOpt.toSeq ++ resolvedSketches + checkDuplicateSketches(finalSketches) + val indexData = DataSkippingIndex.createIndexData(ctx, finalSketches, sourceData) + val index = DataSkippingIndex(finalSketches, indexData.schema, properties) + (index, indexData) + } + + private def getPartitionSketch( + spark: SparkSession, + sourceData: DataFrame): Option[PartitionSketch] = { + val relation = Hyperspace + .getContext(spark) + .sourceProviderManager + .getRelation(sourceData.queryExecution.optimizedPlan) + if (relation.partitionSchema.nonEmpty) { + Some(PartitionSketch(relation.partitionSchema.map(f => (f.name, Some(f.dataType))))) + } else { + None + } } private def checkDuplicateSketches(sketches: Seq[Sketch]): Unit = { diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/execution/DataSkippingFileIndex.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/execution/DataSkippingFileIndex.scala new file mode 100644 index 000000000..d70bd1b5a --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/execution/DataSkippingFileIndex.scala @@ -0,0 +1,74 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.execution + +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.functions.isnull +import org.apache.spark.sql.types.StructType + +import com.microsoft.hyperspace.index.{FileIdTracker, IndexConstants} + +/** + * FileIndex implementation that filters out files using the index data and + * index predicates generated by applying a data skipping index to the source. 
+ */ +class DataSkippingFileIndex( + sparkSession: SparkSession, + fileIdTracker: FileIdTracker, + indexData: DataFrame, + private[dataskipping] val indexDataPred: Expression, // exposed for test + private[dataskipping] val baseFileIndex: FileIndex) + extends FileIndex { + + override def listFiles( + partitionFilters: Seq[Expression], + dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { + val partitions = baseFileIndex.listFiles(partitionFilters, dataFilters) + import sparkSession.implicits._ + val pathCol = "__path" + val filesWithId = partitions + .flatMap(_.files.map(f => (f.getPath.toString, fileIdTracker.addFile(f)))) + .toDF(pathCol, IndexConstants.DATA_FILE_NAME_ID) + val selectedFiles = filesWithId + .hint("broadcast") + .join(indexData, Seq(IndexConstants.DATA_FILE_NAME_ID), "left") + .filter(isnull(indexData(IndexConstants.DATA_FILE_NAME_ID)) || new Column(indexDataPred)) + .select(pathCol) + .collect + .map(_.getString(0)) + .toSet + val selectedPartitions = partitions + .map(p => p.copy(files = p.files.filter(f => selectedFiles.contains(f.getPath.toString)))) + .filter(_.files.nonEmpty) + selectedPartitions + } + + override def rootPaths: Seq[Path] = baseFileIndex.rootPaths + + override def inputFiles: Array[String] = baseFileIndex.inputFiles + + override def refresh(): Unit = baseFileIndex.refresh() + + override def sizeInBytes: Long = baseFileIndex.sizeInBytes + + override def partitionSchema: StructType = baseFileIndex.partitionSchema + + override def metadataOpsTimeNs: Option[Long] = baseFileIndex.metadataOpsTimeNs +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/AttrValueExtractor.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/AttrValueExtractor.scala new file mode 100644 index 000000000..622ad0e2c --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/AttrValueExtractor.scala @@ -0,0 +1,35 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.microsoft.hyperspace.index.dataskipping.expressions + +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, SubqueryExpression} + +case class AttrValueExtractor(attrMap: Map[Attribute, Expression]) extends ExpressionExtractor { + override def unapply(e: Expression): Option[Expression] = { + if (canTransform(e)) Some(transform(e)) else None + } + + private def canTransform(e: Expression): Boolean = { + e.deterministic && + e.references.forall(attrMap.contains) && + !SubqueryExpression.hasSubquery(e) + } + + private def transform(e: Expression): Expression = { + e.transform { case a: Attribute => attrMap(a) } + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionExtractor.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionExtractor.scala new file mode 100644 index 000000000..2a0d8ba07 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionExtractor.scala @@ -0,0 +1,23 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.expressions + +import org.apache.spark.sql.catalyst.expressions.Expression + +trait ExpressionExtractor { + def unapply(e: Expression): Option[Expression] +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/ExpressionUtils.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionUtils.scala similarity index 57% rename from src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/ExpressionUtils.scala rename to src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionUtils.scala index 82504d74d..470677332 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/ExpressionUtils.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionUtils.scala @@ -14,26 +14,23 @@ * limitations under the License. 
*/ -package com.microsoft.hyperspace.index.dataskipping.util +package com.microsoft.hyperspace.index.dataskipping.expressions import java.util.UUID import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, LogicalPlan, Project, Window} -import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.types.{BooleanType, DataType} import com.microsoft.hyperspace.HyperspaceException import com.microsoft.hyperspace.index.IndexUtils -import com.microsoft.hyperspace.index.dataskipping.sketch.Sketch +import com.microsoft.hyperspace.index.dataskipping.sketches.Sketch +import com.microsoft.hyperspace.index.rules.ApplyHyperspace.withHyperspaceRuleDisabled object ExpressionUtils { - val nullExprId = ExprId(0, new UUID(0, 0)) - /** * Returns copies of the given sketches with the indexed columns replaced by * resolved column names and data types. @@ -60,9 +57,8 @@ object ExpressionUtils { private def checkExprs( exprWithExpectedDataTypes: Seq[(String, Option[DataType])], sourceData: DataFrame): Seq[DataType] = { - val (exprs, expectedDataTypes) = - (exprWithExpectedDataTypes.map(_._1), exprWithExpectedDataTypes.map(_._2)) - def throwNotSupportedIf(cond: Boolean, msg: => String) = { + val exprs = exprWithExpectedDataTypes.map(_._1) + def throwNotSupportedIf(cond: Boolean, msg: => String): Unit = { if (cond) { throw HyperspaceException(s"DataSkippingIndex does not support indexing $msg") } @@ -87,7 +83,7 @@ object ExpressionUtils { e.find(_.isInstanceOf[SubqueryExpression]).nonEmpty, s"an expression which has a subquery: $expr") throwNotSupportedIf( - e.find(_.isInstanceOf[AttributeReference]).isEmpty, + e.find(_.isInstanceOf[Attribute]).isEmpty, s"an expression which does not reference source columns: $expr") if (expectedDataType.nonEmpty && expectedDataType.get != analyzedExpr.dataType) { throw HyperspaceException( @@ -101,14 +97,11 @@ object ExpressionUtils { /** * Used to workaround the issue where UnresolvedAttribute.sql() doesn't work as expected. */ - private case class QuotedAttribute(name: String) extends LeafExpression { + private case class QuotedAttribute(name: String) extends LeafExpression with Unevaluable { override def sql: String = name // $COVERAGE-OFF$ code never used override def nullable: Boolean = throw new NotImplementedError - override def eval(input: InternalRow): Any = throw new NotImplementedError - override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = - throw new NotImplementedError override def dataType: DataType = throw new NotImplementedError // $COVERAGE-ON$ } @@ -121,12 +114,33 @@ object ExpressionUtils { */ def normalize(expr: Expression): Expression = { expr.transformUp { - case a: AttributeReference => a.withExprId(nullExprId).withQualifier(Nil) + case a: Attribute => a.withExprId(nullExprId).withQualifier(Nil) case g @ GetStructField(child, ordinal, _) => g.copy(child, ordinal, None) + // Undo HandleNullInputsForUDF and ReplaceNullWithFalseInPredicate so + // that we can match scala UDF invocations. HandleNullInputsForUDF rule + // transforms ScalaUDF(args...) into If(Or(IsNull(args)...), null, + // ScalaUDF(KnownNotNull(args)...)), and ReplaceNullWithFalseInPredicate + // rule transforms null into false. 
ReplaceNullWithFalseInPredicate is + // sensitive to the tree shape. + // This is a safe operation because we ignore null values when computing + // sketch values. For example, MinMaxSketch("f(a)") will only collect + // min/max values of non-null results of "f(a)". Then we can use those + // sketch values to translate filter conditions like "f(a) = 1". Here, + // we are only interested in whether those "f(a)" refers to the same + // thing, not how they should be evaluated. Normalized expressions are + // only meant to be compared, not evaluated. + case If( + ExtractIsNullDisjunction(args1), + Literal(null | false, dataType1), + udf @ ExtractScalaUDF(dataType2, ExtractKnownNotNullArgs(args2))) + if args1 == args2 && dataType1 == dataType2 => + udf.copy(children = args2) } } - // Needed because ScalaUDF has a different number of arguments depending on Spark versions. + // Exposed for test + private[dataskipping] val nullExprId: ExprId = ExprId(0, new UUID(0, 0)) + private[dataskipping] object ExtractScalaUDF { def unapply(e: ScalaUDF): Option[(DataType, Seq[Expression])] = { Some((e.dataType, e.children)) @@ -151,4 +165,53 @@ object ExpressionUtils { } } } + + /** + * Returns sketch expressions that can be used to match indexed expressions + * and expressions in the filter condition. For example, when a user creates + * an index with MinMaxSketch("A"), we create an expression corresponding to + * "A" here, and later we try to match expression nodes in a filter condition, + * say, EqualTo(AttributeReference("A"), Literal(1)), to the expression for + * "A". + * + * We need this step as the filter/join conditions are given to us as a tree + * of expressions in the Spark's optimizer, whereas the indexed expressions + * are provided and stored as strings. 
+ */ + def getResolvedExprs( + spark: SparkSession, + sketches: Seq[Sketch], + source: LogicalPlan): Option[Map[Sketch, Seq[Expression]]] = { + val resolvedExprs = sketches.map { s => + s -> s.expressions.map { + case (expr, dataTypeOpt) => + val parsedExpr = spark.sessionState.sqlParser.parseExpression(expr) + val dataType = dataTypeOpt.get + val filter = dataType match { + case BooleanType => Filter(parsedExpr, source) + case _ => Filter(PredicateWrapper(parsedExpr), source) + } + val optimizedFilter = withHyperspaceRuleDisabled { + spark.sessionState.optimizer + .execute(spark.sessionState.analyzer.execute(filter)) + .asInstanceOf[Filter] + } + val resolvedExpr = dataType match { + case BooleanType => optimizedFilter.condition + case _ => optimizedFilter.condition.asInstanceOf[PredicateWrapper].child + } + if (resolvedExpr.dataType != dataType) { + return None + } + normalize(resolvedExpr) + } + }.toMap + Some(resolvedExprs) + } + + // Used to preserve sketch expressions during optimization + private case class PredicateWrapper(override val child: Expression) + extends UnaryExpression + with Unevaluable + with Predicate } diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/NormalizedExprMatcher.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/NormalizedExprExtractor.scala similarity index 60% rename from src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/NormalizedExprMatcher.scala rename to src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/NormalizedExprExtractor.scala index 1f0169f57..8bfb3d8d3 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/NormalizedExprMatcher.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/NormalizedExprExtractor.scala @@ -14,17 +14,15 @@ * limitations under the License. */ -package com.microsoft.hyperspace.index.dataskipping.util +package com.microsoft.hyperspace.index.dataskipping.expressions -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, ExprId} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, ExprId} -case class NormalizedExprMatcher(expr: Expression, nameMap: Map[ExprId, String]) - extends ExprMatcher { - def apply(e: Expression): Boolean = { - val renamed = e.transformUp { - case a: AttributeReference => a.withName(nameMap(a.exprId)) - } +case class NormalizedExprExtractor(expr: Expression, nameMap: Map[ExprId, String]) + extends ExpressionExtractor { + def unapply(e: Expression): Option[Expression] = { + val renamed = e.transformUp { case a: Attribute => a.withName(nameMap(a.exprId)) } val normalized = ExpressionUtils.normalize(renamed) - expr == normalized + if (expr == normalized) Some(expr) else None } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayLowerBound.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayLowerBound.scala similarity index 73% rename from src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayLowerBound.scala rename to src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayLowerBound.scala index 09e6dabcd..48ed15ff9 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayLowerBound.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayLowerBound.scala @@ -14,11 +14,11 @@ * limitations under the License. 
*/ -package com.microsoft.hyperspace.index.dataskipping.util +package com.microsoft.hyperspace.index.dataskipping.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression} -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util.{ArrayData, TypeUtils} import org.apache.spark.sql.types.IntegerType @@ -28,12 +28,13 @@ import org.apache.spark.sql.types.IntegerType * than (greater than or equal to) the value (right), or null if there is no such * element. * + * If the value (right) is null, null is returned. + * * Preconditions (unchecked): * - The array must not be null. * - Elements in the array must be in ascending order. * - The array must not contain null elements. * - The array must not contain duplicate elements. - * - The value must not be null. */ private[dataskipping] case class SortedArrayLowerBound(left: Expression, right: Expression) extends BinaryExpression { @@ -47,15 +48,17 @@ private[dataskipping] case class SortedArrayLowerBound(left: Expression, right: override def eval(input: InternalRow): Any = { val arr = left.eval(input).asInstanceOf[ArrayData] val value = right.eval(input) - val dt = right.dataType - val n = arr.numElements() - if (n > 0) { - if (ordering.lteq(value, arr.get(0, dt))) { - return 1 - } - if (ordering.lteq(value, arr.get(n - 1, dt))) { - val (_, index) = SortedArrayUtils.binarySearch(arr, dt, ordering, 0, n, value) - return index + 1 + if (value != null) { + val dt = right.dataType + val n = arr.numElements() + if (n > 0) { + if (ordering.lteq(value, arr.get(0, dt))) { + return 1 + } + if (ordering.lteq(value, arr.get(n - 1, dt))) { + val (_, index) = SortedArrayUtils.binarySearch(arr, dt, ordering, 0, n, value) + return index + 1 + } } } null @@ -73,14 +76,16 @@ private[dataskipping] case class SortedArrayLowerBound(left: Expression, right: val binarySearch = SortedArrayUtils.binarySearchCodeGen(ctx, dt) val resultCode = s""" - |int $n = $arr.numElements(); - |if ($n > 0) { - | if (!(${ctx.genGreater(dt, value, firstValueInArr)})) { - | ${ev.isNull} = false; - | ${ev.value} = 1; - | } else if (!(${ctx.genGreater(dt, value, lastValueInArr)})) { - | ${ev.isNull} = false; - | ${ev.value} = $binarySearch($arr, 0, $n, $value).index() + 1; + |if (!(${rightGen.isNull})) { + | int $n = $arr.numElements(); + | if ($n > 0) { + | if (!(${ctx.genGreater(dt, value, firstValueInArr)})) { + | ${ev.isNull} = false; + | ${ev.value} = 1; + | } else if (!(${ctx.genGreater(dt, value, lastValueInArr)})) { + | ${ev.isNull} = false; + | ${ev.value} = $binarySearch($arr, 0, $n, $value).index() + 1; + | } | } |} """.stripMargin diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayUtils.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayUtils.scala similarity index 97% rename from src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayUtils.scala rename to src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayUtils.scala index 35e318059..ff31ac5d5 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayUtils.scala +++ 
b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayUtils.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.microsoft.hyperspace.index.dataskipping.util +package com.microsoft.hyperspace.index.dataskipping.expressions import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator} import org.apache.spark.sql.catalyst.util.ArrayData diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/extractors.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/extractors.scala new file mode 100644 index 000000000..f097fc97d --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/expressions/extractors.scala @@ -0,0 +1,123 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.expressions + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.BooleanType + +case class EqualToExtractor(left: ExpressionExtractor, right: ExpressionExtractor) { + def unapply(p: Expression): Option[(Expression, Expression)] = + p match { + case EqualTo(left(l), right(r)) => Some((l, r)) + case EqualTo(right(r), left(l)) => Some((l, r)) + case _ => None + } +} + +case class EqualNullSafeExtractor(left: ExpressionExtractor, right: ExpressionExtractor) { + def unapply(p: Expression): Option[(Expression, Expression)] = + p match { + case EqualNullSafe(left(l), right(r)) => Some((l, r)) + case EqualNullSafe(right(r), left(l)) => Some((l, r)) + case _ => None + } +} + +case class LessThanExtractor(left: ExpressionExtractor, right: ExpressionExtractor) { + def unapply(p: Expression): Option[(Expression, Expression)] = + p match { + case LessThan(left(l), right(r)) => Some((l, r)) + case GreaterThan(right(r), left(l)) => Some((l, r)) + case _ => None + } +} + +case class LessThanOrEqualExtractor(left: ExpressionExtractor, right: ExpressionExtractor) { + def unapply(p: Expression): Option[(Expression, Expression)] = + p match { + case LessThanOrEqual(left(l), right(r)) => Some((l, r)) + case GreaterThanOrEqual(right(r), left(l)) => Some((l, r)) + case _ => None + } +} + +case class IsNullExtractor(expr: ExpressionExtractor) { + def unapply(p: Expression): Option[Expression] = + p match { + case IsNull(expr(e)) => Some(e) + case EqualNullSafe(expr(e), v: Literal) if v.value == null => Some(e) + case EqualNullSafe(v: Literal, expr(e)) if v.value == null => Some(e) + case _ => None + } +} + +case class IsNotNullExtractor(expr: ExpressionExtractor) { + def unapply(p: Expression): Option[Expression] = + p match { + case IsNotNull(expr(e)) => Some(e) + // Spark 2.4 lacks a rule transforming Not(IsNull) to IsNotNull + case Not(IsNull(expr(e))) => Some(e) + case Not(EqualNullSafe(expr(e), v: Literal)) if v.value == null => Some(e) + case Not(EqualNullSafe(v: Literal, expr(e))) if v.value == null => Some(e) + case _ => None + } +} + 
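The extractors in this file are the building blocks sketches use in convertPredicate: the "left" extractor matches the indexed expression (for example a NormalizedExprExtractor) and the "right" extractor matches values known at skipping time (for example an AttrValueExtractor or a literal matcher). A minimal sketch of how they compose into an index predicate for an equality filter; the helper below is illustrative only and not code from this PR:

import org.apache.spark.sql.catalyst.expressions.{And, Expression, GreaterThanOrEqual, LessThanOrEqual}

import com.microsoft.hyperspace.index.dataskipping.expressions.{EqualToExtractor, ExpressionExtractor}

object EqualToConversionExample {
  // Translates "A = v" (or "v = A") into "Min_A <= v && Max_A >= v" over the
  // index data, the MinMax-style conversion described in
  // DataSkippingIndex.translateFilterCondition.
  def convert(
      predicate: Expression,
      indexedExpr: ExpressionExtractor, // e.g. a NormalizedExprExtractor for column "A"
      value: ExpressionExtractor, // e.g. an AttrValueExtractor or a literal matcher
      min: Expression, // index data column "Min_A"
      max: Expression // index data column "Max_A"
  ): Option[Expression] = {
    val ExprEqualTo = EqualToExtractor(indexedExpr, value)
    predicate match {
      case ExprEqualTo(_, v) => Some(And(LessThanOrEqual(min, v), GreaterThanOrEqual(max, v)))
      case _ => None
    }
  }
}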
+case class IsTrueExtractor(expr: ExpressionExtractor) { + def unapply(p: Expression): Option[Expression] = + p.dataType match { + case BooleanType => + p match { + case expr(e) => Some(e) + case EqualTo(expr(e), Literal(true, BooleanType)) => Some(e) + case EqualTo(Literal(true, BooleanType), expr(e)) => Some(e) + case _ => None + } + case _ => None + } +} + +case class IsFalseExtractor(expr: ExpressionExtractor) { + def unapply(p: Expression): Option[Expression] = + p.dataType match { + case BooleanType => + p match { + case Not(expr(e)) => Some(e) + case EqualTo(expr(e), Literal(false, BooleanType)) => Some(e) + case EqualTo(Literal(false, BooleanType), expr(e)) => Some(e) + case _ => None + } + case _ => None + } +} + +case class InExtractor(arg: ExpressionExtractor, element: ExpressionExtractor) { + def unapply(p: Expression): Option[(Expression, Seq[Expression])] = + p match { + case In(arg(a), vs) => + Some((a, vs.map(element.unapply(_).getOrElse { return None }))) + case _ => None + } +} + +case class InSetExtractor(arg: ExpressionExtractor) { + def unapply(p: Expression): Option[(Expression, Set[Any])] = + p match { + case InSet(arg(a), vs) => Some((a, vs)) + case _ => None + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/ApplyDataSkippingIndex.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/ApplyDataSkippingIndex.scala new file mode 100644 index 000000000..3c219d7d2 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/ApplyDataSkippingIndex.scala @@ -0,0 +1,105 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.microsoft.hyperspace.index.dataskipping.rules + +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan} +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.hyperspace.utils.logicalPlanToDataFrame +import org.apache.spark.sql.types.StructType + +import com.microsoft.hyperspace.index.{IndexLogEntry, IndexLogEntryTags} +import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex +import com.microsoft.hyperspace.index.dataskipping.execution.DataSkippingFileIndex +import com.microsoft.hyperspace.index.plans.logical.IndexHadoopFsRelation +import com.microsoft.hyperspace.index.rules._ +import com.microsoft.hyperspace.index.rules.ApplyHyperspace.PlanToSelectedIndexMap + +object ApplyDataSkippingIndex extends HyperspaceRule { + protected override val filtersOnQueryPlan: Seq[QueryPlanIndexFilter] = + IndexTypeFilter[DataSkippingIndex]() :: FilterPlanNodeFilter :: FilterConditionFilter :: Nil + + protected override val indexRanker: IndexRankFilter = DataSkippingIndexRanker + + override def applyIndex(plan: LogicalPlan, indexes: PlanToSelectedIndexMap): LogicalPlan = { + if (indexes.isEmpty) { + return plan + } + plan match { + case filter @ Filter(_, ExtractRelation(relation)) => + val indexLogEntry = indexes(relation.plan) + val indexDataPred = indexLogEntry + .getTagValue(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE) + .get + .getOrElse { return plan } + val indexData = logicalPlanToDataFrame(spark, getIndexDataRelation(indexLogEntry)) + val originalFileIndex = indexLogEntry.withCachedTag( + relation.plan, + IndexLogEntryTags.DATASKIPPING_SOURCE_FILEINDEX) { + relation.getOrCreateFileIndex(spark) + } + val dataSkippingFileIndex = new DataSkippingFileIndex( + spark, + indexLogEntry.fileIdTracker, + indexData, + indexDataPred, + originalFileIndex) + val newRelation = relation.createLogicalRelation( + IndexHadoopFsRelation( + relation.createHadoopFsRelation( + dataSkippingFileIndex, + relation.schema, + relation.options), + spark, + indexLogEntry), + relation.output.map(_.asInstanceOf[AttributeReference])) + filter.copy(child = newRelation) + case _ => plan + } + } + + override def score(plan: LogicalPlan, indexes: PlanToSelectedIndexMap): Int = { + if (indexes.isEmpty) { + return 0 + } + // Return the lowest score so that covering indexes take precedence over + // data skipping indexes. 
+ 1 + } + + private def getIndexDataRelation(indexLogEntry: IndexLogEntry): LogicalRelation = { + val indexDataSchema = indexLogEntry.derivedDataset.asInstanceOf[DataSkippingIndex].schema + val indexDataLoc = + indexLogEntry.withCachedTag(IndexLogEntryTags.DATASKIPPING_INDEX_FILEINDEX) { + new InMemoryFileIndex( + spark, + indexLogEntry.content.files, + Map.empty, + Some(indexDataSchema), + FileStatusCache.getOrCreate(spark)) + } + LogicalRelation( + HadoopFsRelation( + indexDataLoc, + StructType(Nil), + indexDataSchema, + None, + new ParquetFileFormat, + Map.empty)(spark)) + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/DataSkippingIndexRanker.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/DataSkippingIndexRanker.scala new file mode 100644 index 000000000..40642cc2f --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/DataSkippingIndexRanker.scala @@ -0,0 +1,38 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.rules + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan + +import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex +import com.microsoft.hyperspace.index.rules.ApplyHyperspace.{PlanToIndexesMap, PlanToSelectedIndexMap} +import com.microsoft.hyperspace.index.rules.IndexRankFilter + +object DataSkippingIndexRanker extends IndexRankFilter { + override def apply( + plan: LogicalPlan, + applicableIndexes: PlanToIndexesMap): PlanToSelectedIndexMap = { + // TODO: Multiple data skipping index can be applied to the same plan node, + // although the effectiveness decreases as more indexes are applied. + // The framework should be updated to allow multiple indexes. + // For now, simply choose the index with most sketches. + applicableIndexes.collect { + case (plan, indexes) if indexes.nonEmpty => + plan -> indexes.maxBy(_.derivedDataset.asInstanceOf[DataSkippingIndex].sketches.length) + } + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterConditionFilter.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterConditionFilter.scala new file mode 100644 index 000000000..a28272363 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterConditionFilter.scala @@ -0,0 +1,63 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.rules + +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan} + +import com.microsoft.hyperspace.index.IndexLogEntryTags +import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex +import com.microsoft.hyperspace.index.plananalysis.FilterReasons +import com.microsoft.hyperspace.index.rules.{ExtractRelation, QueryPlanIndexFilter} +import com.microsoft.hyperspace.index.rules.ApplyHyperspace.PlanToIndexesMap + +/** + * FilterConditionFilter filters indexes out if + * 1) an index cannot be applied to the filter condition. + */ +object FilterConditionFilter extends QueryPlanIndexFilter { + override def apply(plan: LogicalPlan, candidateIndexes: PlanToIndexesMap): PlanToIndexesMap = { + if (candidateIndexes.isEmpty) { + return Map.empty + } + plan match { + case Filter(condition: Expression, ExtractRelation(relation)) => + val applicableIndexes = candidateIndexes(relation.plan).flatMap { indexLogEntry => + val indexDataPredOpt = + indexLogEntry.withCachedTag(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE) { + val index = indexLogEntry.derivedDataset.asInstanceOf[DataSkippingIndex] + index.translateFilterCondition(spark, condition, relation.plan) + } + if (withFilterReasonTag( + plan, + indexLogEntry, + FilterReasons.IneligibleFilterCondition(condition.sql))( + indexDataPredOpt.nonEmpty)) { + Some(indexLogEntry) + } else { + None + } + } + if (applicableIndexes.nonEmpty) { + Map(relation.plan -> applicableIndexes) + } else { + Map.empty + } + case _ => Map.empty + } + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterPlanNodeFilter.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterPlanNodeFilter.scala new file mode 100644 index 000000000..e252e8d0d --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterPlanNodeFilter.scala @@ -0,0 +1,40 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.rules + +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan} + +import com.microsoft.hyperspace.index.rules.{ExtractRelation, QueryPlanIndexFilter, RuleUtils} +import com.microsoft.hyperspace.index.rules.ApplyHyperspace.PlanToIndexesMap + +/** + * FilterPlanNodeFilter filters indexes out if + * 1) the given plan is not eligible filter plan node. + * 2) the source plan of index is not part of the filter plan. 
+ */ +object FilterPlanNodeFilter extends QueryPlanIndexFilter { + override def apply(plan: LogicalPlan, candidateIndexes: PlanToIndexesMap): PlanToIndexesMap = { + if (candidateIndexes.isEmpty) { + return Map.empty + } + plan match { + case Filter(_, ExtractRelation(relation)) if !RuleUtils.isIndexApplied(relation) => + candidateIndexes.filterKeys(relation.plan.equals) + case _ => Map.empty + } + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/MinMaxSketch.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/MinMaxSketch.scala deleted file mode 100644 index 474069dc5..000000000 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/MinMaxSketch.scala +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (2021) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.index.dataskipping.sketch - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.expressions.aggregate.{Max, Min} -import org.apache.spark.sql.types.DataType - -/** - * Sketch based on minimum and maximum values for a given expression. - * - * @param expr Expression from which min/max values are calculated - * @param dataType Optional data type to specify the expected data type of the - * expression. If not specified, it is deduced automatically. - * If the actual data type of the expression is different from this, - * an error is thrown. Users are recommended to leave this parameter to - * None. - */ -case class MinMaxSketch(override val expr: String, override val dataType: Option[DataType] = None) - extends SingleExprSketch[MinMaxSketch](expr, dataType) { - override def name: String = "MinMax" - - override def withNewExpression(newExpr: (String, Option[DataType])): MinMaxSketch = { - copy(expr = newExpr._1, dataType = newExpr._2) - } - - override def aggregateFunctions: Seq[Expression] = - Min(parsedExpr).toAggregateExpression() :: Max(parsedExpr).toAggregateExpression() :: Nil -} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/MinMaxSketch.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/MinMaxSketch.scala new file mode 100644 index 000000000..6d8a143dc --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/MinMaxSketch.scala @@ -0,0 +1,100 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.sketches + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate.{Max, Min} +import org.apache.spark.sql.catalyst.util.{ArrayData, TypeUtils} +import org.apache.spark.sql.types.{ArrayType, DataType} + +import com.microsoft.hyperspace.index.dataskipping.expressions._ +import com.microsoft.hyperspace.index.dataskipping.util.ArrayUtils + +/** + * Sketch based on minimum and maximum values for a given expression. + * + * @param expr Expression from which min/max values are calculated + * @param dataType Optional data type to specify the expected data type of the + * expression. If not specified, it is deduced automatically. + * If the actual data type of the expression is different from this, + * an error is thrown. Users are recommended to leave this parameter to + * None. + */ +case class MinMaxSketch(override val expr: String, override val dataType: Option[DataType] = None) + extends SingleExprSketch[MinMaxSketch](expr, dataType) { + override def name: String = "MinMax" + + override def withNewExpression(newExpr: (String, Option[DataType])): MinMaxSketch = { + copy(expr = newExpr._1, dataType = newExpr._2) + } + + override def aggregateFunctions: Seq[Expression] = + Min(parsedExpr).toAggregateExpression() :: Max(parsedExpr).toAggregateExpression() :: Nil + + override def convertPredicate( + predicate: Expression, + resolvedExprs: Seq[Expression], + sketchValues: Seq[Expression], + nameMap: Map[ExprId, String], + valueExtractor: ExpressionExtractor): Option[Expression] = { + val min = sketchValues(0) + val max = sketchValues(1) + // TODO: Add third sketch value "hasNull" of type bool + // true if the expr can be null in the file, false if otherwise + // to optimize IsNull (can skip files with hasNull = false) + // This can be also done as a separate sketch, e.g. 
HasNullSketch + // Should evaluate which way is better + val resolvedExpr = resolvedExprs.head + val dataType = resolvedExpr.dataType + val exprExtractor = NormalizedExprExtractor(resolvedExpr, nameMap) + val ExprIsTrue = IsTrueExtractor(exprExtractor) + val ExprIsFalse = IsFalseExtractor(exprExtractor) + val ExprIsNotNull = IsNotNullExtractor(exprExtractor) + val ExprEqualTo = EqualToExtractor(exprExtractor, valueExtractor) + val ExprEqualNullSafe = EqualNullSafeExtractor(exprExtractor, valueExtractor) + val ExprLessThan = LessThanExtractor(exprExtractor, valueExtractor) + val ExprLessThanOrEqualTo = LessThanOrEqualExtractor(exprExtractor, valueExtractor) + val ExprGreaterThan = LessThanExtractor(valueExtractor, exprExtractor) + val ExprGreaterThanOrEqualTo = LessThanOrEqualExtractor(valueExtractor, exprExtractor) + val ExprIn = InExtractor(exprExtractor, valueExtractor) + val ExprInSet = InSetExtractor(exprExtractor) + Option(predicate) + .collect { + case ExprIsTrue(_) => max + case ExprIsFalse(_) => Not(min) + case ExprIsNotNull(_) => IsNotNull(min) + case ExprEqualTo(_, v) => And(LessThanOrEqual(min, v), GreaterThanOrEqual(max, v)) + case ExprEqualNullSafe(_, v) => + Or(IsNull(v), And(LessThanOrEqual(min, v), GreaterThanOrEqual(max, v))) + case ExprLessThan(_, v) => LessThan(min, v) + case ExprLessThanOrEqualTo(_, v) => LessThanOrEqual(min, v) + case ExprGreaterThan(v, _) => GreaterThan(max, v) + case ExprGreaterThanOrEqualTo(v, _) => GreaterThanOrEqual(max, v) + case ExprIn(_, vs) => + vs.map(v => And(LessThanOrEqual(min, v), GreaterThanOrEqual(max, v))).reduceLeft(Or) + case ExprInSet(_, vs) => + val sortedValues = Literal( + ArrayData.toArrayData( + ArrayUtils.toArray( + vs.filter(_ != null).toArray.sorted(TypeUtils.getInterpretedOrdering(dataType)), + dataType)), + ArrayType(dataType, containsNull = false)) + LessThanOrEqual(ElementAt(sortedValues, SortedArrayLowerBound(sortedValues, min)), max) + // TODO: StartsWith, Like with constant prefix + } + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/PartitionSketch.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/PartitionSketch.scala new file mode 100644 index 000000000..71db379ac --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/PartitionSketch.scala @@ -0,0 +1,73 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.sketches + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.DataType + +import com.microsoft.hyperspace.index.dataskipping.expressions._ +import com.microsoft.hyperspace.shim.FirstNullSafe + +/** + * Internal implementation for partitioning column. + * + * This sketch is automatically created for each partitioning column if the + * source data has partitioning columns. 
This enables data skipping indexes to + * be usable with filter conditions having a disjunction involving partitioning + * columns and indexed columns. For example, a filter condition like "A = 1 or + * B = 1" will be translated into an index predicate "(Min_A <= 1 and Max_A >= + * 1) or B = 1" where A is an indexed column of MinMaxSketch and B is a + * partitioning column. + */ +private[dataskipping] case class PartitionSketch( + override val expressions: Seq[(String, Option[DataType])]) + extends Sketch { + + override def indexedColumns: Seq[String] = exprStrings + + override def referencedColumns: Seq[String] = exprStrings + + override def withNewExpressions( + newExpressions: Seq[(String, Option[DataType])]): PartitionSketch = { + copy(expressions = newExpressions) + } + + override def aggregateFunctions: Seq[Expression] = { + val parser = SparkSession.getActiveSession.get.sessionState.sqlParser + exprStrings.map { e => + FirstNullSafe(parser.parseExpression(e)).toAggregateExpression() + } + } + + override def toString: String = s"Partition(${exprStrings.mkString(", ")})" + + override def convertPredicate( + predicate: Expression, + resolvedExprs: Seq[Expression], + sketchValues: Seq[Expression], + nameMap: Map[ExprId, String], + valueExtractor: ExpressionExtractor): Option[Expression] = { + predicate match { + case And(_, _) | Or(_, _) => None + case valueExtractor(v) if (predicate.references.nonEmpty) => Some(v) + case _ => None + } + } + + private def exprStrings: Seq[String] = expressions.map(_._1) +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/SingleExprSketch.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/SingleExprSketch.scala similarity index 97% rename from src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/SingleExprSketch.scala rename to src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/SingleExprSketch.scala index 75d4a523a..e08b0d6cb 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/SingleExprSketch.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/SingleExprSketch.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.microsoft.hyperspace.index.dataskipping.sketch +package com.microsoft.hyperspace.index.dataskipping.sketches import scala.reflect.ClassTag diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/Sketch.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/Sketch.scala similarity index 51% rename from src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/Sketch.scala rename to src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/Sketch.scala index 505dc4b27..2bab62dee 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketch/Sketch.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/sketches/Sketch.scala @@ -14,12 +14,14 @@ * limitations under the License. */ -package com.microsoft.hyperspace.index.dataskipping.sketch +package com.microsoft.hyperspace.index.dataskipping.sketches import com.fasterxml.jackson.annotation.JsonTypeInfo -import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, ExprId} import org.apache.spark.sql.types.DataType +import com.microsoft.hyperspace.index.dataskipping.expressions.ExpressionExtractor + /** * Represents a sketch specification for data skipping indexes. 
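To make the sketches above concrete, here is a minimal usage sketch in Scala. It mirrors the integration tests later in this change; the SparkSession setup, the parquet path, and the index name are assumptions made purely for illustration.

import org.apache.spark.sql.SparkSession

import com.microsoft.hyperspace._
import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndexConfig
import com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch

object DataSkippingUsageSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("ds-usage").getOrCreate()
  val hs = new Hyperspace(spark)

  // Hypothetical source data partitioned by "A": a partition sketch for "A" is
  // added automatically unless IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH
  // is set to "false", and MinMaxSketch("B") stores Min(B)/Max(B) per source file.
  val df = spark.read.parquet("/data/partitioned")
  hs.createIndex(df, DataSkippingIndexConfig("myIndex", MinMaxSketch("B")))

  // With Hyperspace enabled, a filter with a disjunction over the partition
  // column and the indexed column can still skip files, as described above.
  spark.enableHyperspace
  df.filter("A = 1 or B = 1").show()

  spark.stop()
}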
* @@ -75,4 +77,42 @@ trait Sketch { * Returns the hash code for this sketch. */ def hashCode: Int + + /** + * Converts the given predicate node for source data to an index predicate + * that can be used to filter out unnecessary source files when applied to + * index data. + * + * The returned predicate should evaluate to true for an index data row + * if the corresponding source data file cannot be excluded. The returned + * predicate should be used only to find files that cannot be skipped. In + * other words, it must not be negated to find files that can be skipped, + * because it can evaluate to null. + * + * The implementation should consider the given predicate as a single node, + * not a tree that must be traversed recursively, because that part is + * handled by the framework. + * + * @param predicate Source predicate node + * @param resolvedExprs Sketch expressions that can be used to match + * expressions in the source predicate; for example, + * MinMaxSketch("A") will be given an expression corresponding to + * "A". + * @param sketchValues Sketch value references in index data; for example, + * MinMaxSketch("A") will be given two expressions corresponding to + * Min(A) and Max(A) in the index data. If the predicate is + * convertible, the implementation should return a predicate + * composed of these sketch values. + * @param nameMap Map used to normalize attributes in the source predicate by + * looking up the attribute name with ExprId; this is needed because + * the attribute name in the predicate may have different cases + * (lower/upper cases). + * @return Converted predicate for index data + */ + def convertPredicate( + predicate: Expression, + resolvedExprs: Seq[Expression], + sketchValues: Seq[Expression], + nameMap: Map[ExprId, String], + valueExtractor: ExpressionExtractor): Option[Expression] } diff --git a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/extractors.scala b/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/extractors.scala deleted file mode 100644 index 5acdd1456..000000000 --- a/src/main/scala/com/microsoft/hyperspace/index/dataskipping/util/extractors.scala +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (2021) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.index.dataskipping.util - -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.{BooleanType, DataType} - -// Value extractors returning Option[Literal] check if the value is not null -// because we're only interested in non-null values. -// -// Also note that we don't go overboard to match every pattern because -// we assume that Catalyst optimizer will give us an optimized predicate in NNF. -// It means in general we don't have to deal with Not, or worry about -// foldable expressions because they will be optimized to literals. 
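The comment above assumes that predicates reach the extractors in negation normal form. A small, illustrative check of that assumption (a local SparkSession and the column names are assumed for this sketch; it is not part of the change):

import org.apache.spark.sql.SparkSession

object NnfCheck extends App {
  val spark = SparkSession.builder().master("local[1]").appName("nnf-check").getOrCreate()
  import spark.implicits._

  val df = Seq((1L, 2L)).toDF("A", "B")
  // Catalyst's BooleanSimplification rewrites NOT (A > 1 OR B > 2) into
  // (A <= 1 AND B <= 2), so the extractors only need to match the positive
  // comparison forms rather than handling Not explicitly.
  println(df.filter("NOT (A > 1 OR B > 2)").queryExecution.optimizedPlan)

  spark.stop()
}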
-// -// There are some differences between Spark versions and that's why we include -// some patterns that are no longer needed in newer Spark versions. - -/** - * Extracts the non-null literal value in the predicate if it's equivalent to - * <expr> = <lit>. - * - * For the purpose of data skipping, we don't extract the value if it's null. - * If the literal is not null, then the only way to make the predicate - * <expr> = <lit> or <expr> <=> <lit> is when the expression is not null - * and its value is equal to the literal value. - */ -case class EqualToExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Option[Literal] = - p match { - case EqualTo(e, v: Literal) if v.value != null && matcher(e) => Some(v) - case EqualTo(v: Literal, e) if v.value != null && matcher(e) => Some(v) - case EqualNullSafe(e, v: Literal) if v.value != null && matcher(e) => Some(v) - case EqualNullSafe(v: Literal, e) if v.value != null && matcher(e) => Some(v) - case _ => None - } -} - -/** - * Extracts the non-null literal value in the predicate if it's equivalent to - * <expr> < <lit>. - */ -case class LessThanExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Option[Literal] = - p match { - case LessThan(e, v: Literal) if v.value != null && matcher(e) => Some(v) - case GreaterThan(v: Literal, e) if v.value != null && matcher(e) => Some(v) - case _ => None - } -} - -/** - * Extracts the non-null literal value in the predicate if it's equivalent to - * <expr> <= <lit>. - */ -case class LessThanOrEqualToExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Option[Literal] = - p match { - case LessThanOrEqual(e, v: Literal) if v.value != null && matcher(e) => Some(v) - case GreaterThanOrEqual(v: Literal, e) if v.value != null && matcher(e) => Some(v) - case _ => None - } -} - -/** - * Extracts the non-null literal value in the predicate if it's equivalent to - * <expr> > <lit>. - */ -case class GreaterThanExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Option[Literal] = - p match { - case GreaterThan(e, v: Literal) if v.value != null && matcher(e) => Some(v) - case LessThan(v: Literal, e) if v.value != null && matcher(e) => Some(v) - case _ => None - } -} - -/** - * Extracts the non-null literal value in the predicate if it's equivalent to - * <expr> >= <lit>. - */ -case class GreaterThanOrEqualToExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Option[Literal] = - p match { - case GreaterThanOrEqual(e, v: Literal) if v.value != null && matcher(e) => Some(v) - case LessThanOrEqual(v: Literal, e) if v.value != null && matcher(e) => Some(v) - case _ => None - } -} - -/** - * Matches the predicate if it's equivalent to <expr> IS NULL. - */ -case class IsNullExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Boolean = - p match { - case IsNull(e) if matcher(e) => true - case EqualNullSafe(e, v: Literal) if v.value == null && matcher(e) => true - case EqualNullSafe(v: Literal, e) if v.value == null && matcher(e) => true - case _ => false - } -} - -/** - * Matches the predicate if it's equivalent to <expr> IS NOT NULL. - * - * Not(IsNull(<expr>)) is also matched because it can be in the predicate in - * Spark 2.4. Since Spark 3.0, this is optimized to IsNotNull(<expr>).
- */ -case class IsNotNullExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Boolean = - p match { - case IsNotNull(e) if matcher(e) => true - case Not(IsNull(e)) if matcher(e) => true // for Spark 2.4 - case Not(EqualNullSafe(e, v: Literal)) if v.value == null && matcher(e) => true - case Not(EqualNullSafe(v: Literal, e)) if v.value == null && matcher(e) => true - case _ => false - } -} - -/** - * Matches the predicate if it's equivalent to <expr> = true. - * - * Note that boolean expressions can be a predicate on their own, not needing - * EqualTo with true. To avoid false matches, we check that the type of the - * expression is BooleanType, although it's not strictly necessary because our - * predicate conversion does not go down the predicate tree unless it's And/Or - * and Spark has already checked the expression is Boolean if it's a direct - * child of And/Or. - */ -case class IsTrueExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Boolean = { - val EqualTo = EqualToExtractor(matcher) - p.dataType == BooleanType && (p match { - case EqualTo(Literal(true, BooleanType)) => true - case e if matcher(e) => true - case _ => false - }) - } -} - -/** - * Matches the predicate if it's equivalent to <expr> = false. - */ -case class IsFalseExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Boolean = { - val EqualTo = EqualToExtractor(matcher) - p.dataType == BooleanType && (p match { - case EqualTo(Literal(false, BooleanType)) => true - case Not(e) if matcher(e) => true - case _ => false - }) - } -} - -/** - * Extracts non-null values in the predicate if it's equivalent to - * <expr> IN (<lit>*). - * - * In Spark, In is created if the number of values in the list does not exceed - * spark.sql.optimizer.inSetConversionThreshold. - */ -case class InExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Option[(Seq[Literal])] = - p match { - case In(e, vs) if vs.nonEmpty && vs.forall(v => v.isInstanceOf[Literal]) && matcher(e) => - Some(vs.map(_.asInstanceOf[Literal]).filter(_.value != null)) - case _ => None - } -} - -/** - * Extracts non-null values in the predicate if it's equivalent to - * <expr> IN (<lit>*). - * - * InSet is created instead of In if the list size is larger than - * spark.sql.optimizer.inSetConversionThreshold.
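As a quick, illustrative demonstration of the In/InSet split described above (the threshold value chosen here is arbitrary and only lowered to force the conversion):

import org.apache.spark.sql.SparkSession

object InSetThresholdDemo extends App {
  val spark = SparkSession.builder().master("local[1]").appName("inset-demo").getOrCreate()

  // With the conversion threshold lowered to 2, a three-value IN list is
  // optimized into InSet instead of In, which is why both forms are matched.
  spark.conf.set("spark.sql.optimizer.inSetConversionThreshold", "2")
  println(spark.range(10).toDF("A").filter("A IN (1, 2, 3)").queryExecution.optimizedPlan)

  spark.stop()
}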
- */ -case class InSetExtractor(matcher: ExprMatcher) { - def unapply(p: Expression): Option[(Set[Any])] = - p match { - case InSet(e, vs) if matcher(e) => Some(vs) - case _ => None - } -} diff --git a/src/main/scala/com/microsoft/hyperspace/index/plananalysis/FilterReason.scala b/src/main/scala/com/microsoft/hyperspace/index/plananalysis/FilterReason.scala index 88dbfd1df..519400207 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/plananalysis/FilterReason.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/plananalysis/FilterReason.scala @@ -148,4 +148,11 @@ object FilterReasons { override def verboseStr: String = s"Another candidate index is applied: $appliedIndex" } + + case class IneligibleFilterCondition(condition: String) extends FilterReason { + override final def codeStr: String = "INELIGIBLE_FILTER_CONDITION" + override val args = Seq("condition" -> condition) + override def verboseStr: String = + s"Ineligible filter condition: $condition" + } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/plananalysis/PlanAnalyzer.scala b/src/main/scala/com/microsoft/hyperspace/index/plananalysis/PlanAnalyzer.scala index a9336bf96..81cbfa035 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/plananalysis/PlanAnalyzer.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/plananalysis/PlanAnalyzer.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.hyperspace.utils.logicalPlanToDataFrame import com.microsoft.hyperspace.{HyperspaceException, Implicits} import com.microsoft.hyperspace.index.IndexConstants +import com.microsoft.hyperspace.index.plans.logical.IndexHadoopFsRelation import com.microsoft.hyperspace.shim.ExtractFileSourceScanExecRelation /** @@ -212,7 +213,13 @@ object PlanAnalyzer { plan: SparkPlan, indexes: DataFrame, bufferStream: BufferStream): Unit = { - val usedIndexes = indexes.filter(indexes("indexLocation").isin(getPaths(plan): _*)) + val usedIndexNames = plan.collect { + case ExtractFileSourceScanExecRelation(rel: IndexHadoopFsRelation) => + rel.indexName + } + val usedIndexes = indexes.filter( + indexes("indexLocation").isin(getPaths(plan): _*) || + indexes("name").isin(usedIndexNames: _*)) usedIndexes.collect().foreach { row => bufferStream .write(row.getAs("name").toString) diff --git a/src/main/scala/com/microsoft/hyperspace/index/plans/logical/IndexHadoopFsRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/plans/logical/IndexHadoopFsRelation.scala index d46b25fb6..0936695b1 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/plans/logical/IndexHadoopFsRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/plans/logical/IndexHadoopFsRelation.scala @@ -45,6 +45,24 @@ class IndexHadoopFsRelation( s"Hyperspace(Type: ${index.derivedDataset.kindAbbr}, " + s"Name: ${index.name}, LogVersion: ${index.id})" } + + def indexName: String = index.name + override def toString(): String = indexPlanStr } + +object IndexHadoopFsRelation { + def apply( + rel: HadoopFsRelation, + spark: SparkSession, + index: IndexLogEntry): IndexHadoopFsRelation = { + new IndexHadoopFsRelation( + rel.location, + rel.partitionSchema, + rel.dataSchema, + rel.bucketSpec, + rel.fileFormat, + rel.options)(spark, index) + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/index/rules/ApplyHyperspace.scala b/src/main/scala/com/microsoft/hyperspace/index/rules/ApplyHyperspace.scala index df87202e0..a9ae97acd 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/rules/ApplyHyperspace.scala +++ 
b/src/main/scala/com/microsoft/hyperspace/index/rules/ApplyHyperspace.scala @@ -63,4 +63,13 @@ object ApplyHyperspace } } } + + def withHyperspaceRuleDisabled[T](f: => T): T = { + try { + disableForIndexMaintenance.set(true) + f + } finally { + disableForIndexMaintenance.set(false) + } + } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/rules/ScoreBasedIndexPlanOptimizer.scala b/src/main/scala/com/microsoft/hyperspace/index/rules/ScoreBasedIndexPlanOptimizer.scala index ed083e1cb..b2faa7427 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/rules/ScoreBasedIndexPlanOptimizer.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/rules/ScoreBasedIndexPlanOptimizer.scala @@ -21,13 +21,15 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import com.microsoft.hyperspace.index.covering.{FilterIndexRule, JoinIndexRule} +import com.microsoft.hyperspace.index.dataskipping.rules.ApplyDataSkippingIndex import com.microsoft.hyperspace.index.rules.ApplyHyperspace.PlanToIndexesMap /** * Apply Hyperspace indexes based on the score of each index application. */ class ScoreBasedIndexPlanOptimizer { - private val rules: Seq[HyperspaceRule] = FilterIndexRule :: JoinIndexRule :: NoOpRule :: Nil + private val rules: Seq[HyperspaceRule] = + Seq(FilterIndexRule, JoinIndexRule, ApplyDataSkippingIndex, NoOpRule) // Map for memoization. The key is the logical plan before applying [[HyperspaceRule]]s // and its value is a pair of best transformed plan and its score. diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala index fe9b18b7b..7bc0a7a94 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala @@ -88,6 +88,13 @@ class DefaultFileBasedRelation(spark: SparkSession, override val plan: LogicalRe case _ => None } + override def getOrCreateFileIndex(spark: SparkSession): FileIndex = { + plan.relation match { + case HadoopFsRelation(location: FileIndex, _, _, _, _, _) => + location + } + } + /** * Creates [[HadoopFsRelation]] based on the current relation. 
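The withHyperspaceRuleDisabled helper added above is generic in its result type, so callers can return a value from the guarded block. A minimal sketch of how maintenance-side code might use it (the helper name and the parquet path are hypothetical, for illustration only):

import org.apache.spark.sql.{DataFrame, SparkSession}

import com.microsoft.hyperspace.index.rules.ApplyHyperspace.withHyperspaceRuleDisabled

object MaintenanceReadSketch {
  // Hypothetical helper: read index data without letting ApplyHyperspace rewrite
  // the plan again while an index maintenance operation is in progress.
  def readIndexData(spark: SparkSession, indexDataPath: String): DataFrame =
    withHyperspaceRuleDisabled {
      spark.read.parquet(indexDataPath)
    }
}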
* diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala index 0c9d0e48d..ee69140e8 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala @@ -18,12 +18,12 @@ package com.microsoft.hyperspace.index.sources.iceberg import collection.JavaConverters._ import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.iceberg.{FileScanTask, Schema, Table} +import org.apache.iceberg.{FileScanTask, Schema, Table, TableScan} import org.apache.iceberg.spark.SparkSchemaUtil import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.types.StructType @@ -70,7 +70,7 @@ class IcebergRelation( * All the files that the current Iceberg table uses for read. */ override lazy val allFiles: Seq[FileStatus] = { - table.newScan().planFiles().iterator().asScala.toSeq.map(toFileStatus) + fileScanTasks.map(toFileStatus) } /** @@ -160,6 +160,33 @@ class IcebergRelation( SparkSchemaUtil.convert(schema) } + override def getOrCreateFileIndex(spark: SparkSession): InMemoryFileIndex = { + val rootPaths = if (snapshotId.isEmpty) { + if (table.spec().isUnpartitioned) { + Seq(PathUtils.makeAbsolute(table.location(), spark.sessionState.newHadoopConf())) + } else { + // We'd like to use a single root path here, but + // then the partition discovery fails + // because there is a directory called "metadata" + // under the root path. + fileScanTasks.map(t => toPath(t).getParent).distinct + } + } else { + // Listing all files with InMemoryFileIndex should be avoided, + // but there is no other way as there is no FileIndex implementation + // for Iceberg that supports snapshots. + fileScanTasks.map(toPath) + } + new InMemoryFileIndex( + spark, + rootPaths, + partitionBasePath + .map(PartitioningAwareFileIndex.BASE_PATH_PARAM -> _) + .toMap, + Some(schema), + FileStatusCache.getOrCreate(spark)) + } + /** * Creates [[HadoopFsRelation]] based on the current relation. 
* @@ -191,10 +218,26 @@ class IcebergRelation( new LogicalRelation(hadoopFsRelation, updatedOutput, None, false) } - private def toFileStatus(fileScanTask: FileScanTask): FileStatus = { - val path = PathUtils.makeAbsolute( + private def newScan(): TableScan = { + if (snapshotId.isDefined) { + table.newScan().useSnapshot(snapshotId.get) + } else { + table.newScan() + } + } + + private def fileScanTasks: Seq[FileScanTask] = { + newScan().planFiles().iterator().asScala.toSeq + } + + private def toPath(fileScanTask: FileScanTask): Path = { + PathUtils.makeAbsolute( new Path(fileScanTask.file().path().toString), spark.sessionState.newHadoopConf()) + } + + private def toFileStatus(fileScanTask: FileScanTask): FileStatus = { + val path = toPath(fileScanTask) val fs = path.getFileSystem(spark.sessionState.newHadoopConf()) val fullPath = if (!path.isAbsolute) { new Path(s"${fs.getWorkingDirectory.toString}/${path.toString}") diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala index 8b01adbf0..eaba1f5c3 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala @@ -105,6 +105,11 @@ trait FileBasedRelation extends SourceRelation { */ def partitionBasePath: Option[String] + /** + * Returns [[FileIndex]] for the current relation. + */ + def getOrCreateFileIndex(spark: SparkSession): FileIndex + /** * Creates [[HadoopFsRelation]] based on the current relation. * diff --git a/src/main/scala/com/microsoft/hyperspace/util/HyperspaceConf.scala b/src/main/scala/com/microsoft/hyperspace/util/HyperspaceConf.scala index 6be9dd731..2fcc56519 100644 --- a/src/main/scala/com/microsoft/hyperspace/util/HyperspaceConf.scala +++ b/src/main/scala/com/microsoft/hyperspace/util/HyperspaceConf.scala @@ -130,6 +130,49 @@ object HyperspaceConf { } longValue } + + def maxIndexDataFileCount(spark: SparkSession): Int = { + // TODO: Consider using a systematic way to validate the config value + // like Spark's ConfigBuilder + val value = spark.conf + .get( + IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT, + IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT_DEFAULT) + val intValue = + try { + value.toInt + } catch { + case e: NumberFormatException => + throw HyperspaceException( + s"${IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT} " + + s"should be int, but was $value") + } + if (intValue <= 0) { + throw HyperspaceException( + s"${IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT} " + + s"should be a positive number.") + } + intValue + } + + def autoPartitionSketch(spark: SparkSession): Boolean = { + // TODO: Consider using a systematic way to validate the config value + // like Spark's ConfigBuilder + val value = spark.conf + .get( + IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH, + IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH_DEFAULT) + val booleanValue = + try { + value.toBoolean + } catch { + case e: IllegalArgumentException => + throw HyperspaceException( + s"${IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH} " + + s"should be boolean, but was $value") + } + booleanValue + } } /** diff --git a/src/main/scala/org/apache/spark/sql/hyperspace/utils/package.scala b/src/main/scala/org/apache/spark/sql/hyperspace/utils/package.scala index c01c6350e..836844fe2 100644 --- a/src/main/scala/org/apache/spark/sql/hyperspace/utils/package.scala +++ 
b/src/main/scala/org/apache/spark/sql/hyperspace/utils/package.scala @@ -34,5 +34,6 @@ package object utils { implicit class StructTypeUtils(st: StructType) { // Expose package-private method def merge(that: StructType): StructType = st.merge(that) + def sameType(that: StructType): Boolean = st.sameType(that) } } diff --git a/src/test/scala/com/microsoft/hyperspace/IcebergTestUtils.scala b/src/test/scala/com/microsoft/hyperspace/IcebergTestUtils.scala new file mode 100644 index 000000000..bdd509924 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/IcebergTestUtils.scala @@ -0,0 +1,43 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace + +import scala.collection.JavaConverters._ + +import org.apache.iceberg.{PartitionSpec => IcebergPartitionSpec, Table, TableProperties} +import org.apache.iceberg.hadoop.HadoopTables +import org.apache.iceberg.spark.SparkSchemaUtil +import org.apache.spark.sql.DataFrame + +trait IcebergTestUtils { + def createIcebergTable(dataPath: String, sourceDf: DataFrame): Table = { + val props = Map(TableProperties.WRITE_NEW_DATA_LOCATION -> dataPath).asJava + val schema = SparkSchemaUtil.convert(sourceDf.schema) + val part = IcebergPartitionSpec.builderFor(schema).build() + new HadoopTables().create(schema, part, props, dataPath) + } + + def createIcebergTableWithPartitions( + dataPath: String, + sourceDf: DataFrame, + partCol: String): Table = { + val props = Map(TableProperties.WRITE_NEW_DATA_LOCATION -> dataPath).asJava + val schema = SparkSchemaUtil.convert(sourceDf.schema) + val part = IcebergPartitionSpec.builderFor(schema).identity(partCol).build() + new HadoopTables().create(schema, part, props, dataPath) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/index/E2EHyperspaceRulesTest.scala b/src/test/scala/com/microsoft/hyperspace/index/E2EHyperspaceRulesTest.scala index db3203aae..ec1c2841f 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/E2EHyperspaceRulesTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/E2EHyperspaceRulesTest.scala @@ -30,7 +30,7 @@ import com.microsoft.hyperspace.index.IndexConstants.{GLOBBING_PATTERN_KEY, REFR import com.microsoft.hyperspace.index.IndexLogEntryTags._ import com.microsoft.hyperspace.index.covering.JoinIndexRule import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndexConfig -import com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch +import com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch import com.microsoft.hyperspace.index.execution.BucketUnionStrategy import com.microsoft.hyperspace.index.rules.{ApplyHyperspace, CandidateIndexCollector} import com.microsoft.hyperspace.util.PathUtils diff --git a/src/test/scala/com/microsoft/hyperspace/index/IcebergIntegrationTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IcebergIntegrationTest.scala index ff1d4ea0f..bd431dff2 100644 --- 
a/src/test/scala/com/microsoft/hyperspace/index/IcebergIntegrationTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IcebergIntegrationTest.scala @@ -19,22 +19,19 @@ package com.microsoft.hyperspace.index import org.apache.commons.lang.StringUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.iceberg.{PartitionSpec => IcebergPartitionSpec, Table, TableProperties} -import org.apache.iceberg.hadoop.HadoopTables -import org.apache.iceberg.spark.SparkSchemaUtil import org.apache.spark.sql.{DataFrame, QueryTest} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ import scala.collection.JavaConverters._ -import com.microsoft.hyperspace.{Hyperspace, Implicits, SampleData, TestConfig} +import com.microsoft.hyperspace.{Hyperspace, IcebergTestUtils, Implicits, SampleData, TestConfig} import com.microsoft.hyperspace.TestUtils.latestIndexLogEntry import com.microsoft.hyperspace.index.IndexConstants.REFRESH_MODE_QUICK import com.microsoft.hyperspace.index.plananalysis.{PlainTextMode, PlanAnalyzer} import com.microsoft.hyperspace.util.PathUtils import com.microsoft.hyperspace.util.PathUtils.DataPathFilter -class IcebergIntegrationTest extends QueryTest with HyperspaceSuite { +class IcebergIntegrationTest extends QueryTest with HyperspaceSuite with IcebergTestUtils { override val indexLocationDirName = "icebergIntegrationTest" private val sampleData = SampleData.testData @@ -131,7 +128,7 @@ class IcebergIntegrationTest extends QueryTest with HyperspaceSuite { val dfFromSample = testData .toDF("Date", "RGUID", "Query", "imprs", "clicks") - createIcebergTableWithDayPartition(testPath, dfFromSample) + createIcebergTableWithPartitions(testPath, dfFromSample, "Date") dfFromSample.write .format("iceberg") @@ -387,20 +384,6 @@ class IcebergIntegrationTest extends QueryTest with HyperspaceSuite { rootPaths.exists(_.toString.contains(p))) } - def createIcebergTable(dataPath: String, sourceDf: DataFrame): Table = { - val props = Map(TableProperties.WRITE_NEW_DATA_LOCATION -> dataPath).asJava - val schema = SparkSchemaUtil.convert(sourceDf.schema) - val part = IcebergPartitionSpec.builderFor(schema).build() - new HadoopTables().create(schema, part, props, dataPath) - } - - def createIcebergTableWithDayPartition(dataPath: String, sourceDf: DataFrame): Table = { - val props = Map(TableProperties.WRITE_NEW_DATA_LOCATION -> dataPath).asJava - val schema = SparkSchemaUtil.convert(sourceDf.schema) - val part = IcebergPartitionSpec.builderFor(schema).identity("Date").build() - new HadoopTables().create(schema, part, props, dataPath) - } - private def truncate(s: String): String = { StringUtils.abbreviate(s, 100) } diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfigTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfigTest.scala index 09699522d..9596310fc 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfigTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfigTest.scala @@ -18,11 +18,11 @@ package com.microsoft.hyperspace.index.dataskipping import org.apache.hadoop.fs.Path import org.apache.spark.sql.functions.{input_file_name, max, min} -import org.apache.spark.sql.types.{LongType, StringType} +import org.apache.spark.sql.types.{IntegerType, LongType, StringType} import com.microsoft.hyperspace.HyperspaceException import 
com.microsoft.hyperspace.index.IndexConstants -import com.microsoft.hyperspace.index.dataskipping.sketch._ +import com.microsoft.hyperspace.index.dataskipping.sketches._ class DataSkippingIndexConfigTest extends DataSkippingSuite { test("indexName returns the index name.") { @@ -92,6 +92,46 @@ class DataSkippingIndexConfigTest extends DataSkippingSuite { assert(index.sketches === Seq(MinMaxSketch("Foo", Some(LongType)))) } + test("createIndex creates partition sketches for partitioned source data.") { + val sourceData = + createPartitionedSourceData(spark.range(10).selectExpr("id as A", "id * 2 as B"), Seq("A")) + val indexConfig = DataSkippingIndexConfig("MyIndex", MinMaxSketch("B")) + val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map()) + assert( + index.sketches === Seq( + PartitionSketch(Seq(("A", Some(IntegerType)))), + MinMaxSketch("B", Some(LongType)))) + } + + test( + "createIndex creates partition sketches for partitioned source data " + + "with multiple partition columns.") { + val sourceData = + createPartitionedSourceData( + spark.range(10).selectExpr("id as A", "id as B", "id * 2 as C"), + Seq("A", "B")) + val indexConfig = DataSkippingIndexConfig("MyIndex", MinMaxSketch("C")) + val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map()) + assert( + index.sketches === Seq( + PartitionSketch(Seq(("A", Some(IntegerType)), ("B", Some(IntegerType)))), + MinMaxSketch("C", Some(LongType)))) + } + + test( + "createIndex does not create partition sketches for partitioned source data " + + "if the config is turned off.") { + withSQLConf(IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH -> "false") { + val sourceData = + createPartitionedSourceData( + spark.range(10).selectExpr("id as A", "id * 2 as B"), + Seq("A")) + val indexConfig = DataSkippingIndexConfig("MyIndex", MinMaxSketch("B")) + val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map()) + assert(index.sketches === Seq(MinMaxSketch("B", Some(LongType)))) + } + } + test("createIndex throws an error if the data type is wrong.") { val sourceData = createSourceData(spark.range(10).toDF("Foo")) val indexConfig = DataSkippingIndexConfig("MyIndex", MinMaxSketch("foO", Some(StringType))) diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexIntegrationTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexIntegrationTest.scala index adbb456f2..11133a7cf 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexIntegrationTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexIntegrationTest.scala @@ -16,14 +16,45 @@ package com.microsoft.hyperspace.index.dataskipping -import com.microsoft.hyperspace.HyperspaceException -import com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch +import org.apache.hadoop.fs.Path +import org.apache.spark.SparkException +import org.apache.spark.sql.{DataFrame, SaveMode} +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.types._ -class DataSkippingIndexIntegrationTest extends DataSkippingSuite { +import com.microsoft.hyperspace._ +import com.microsoft.hyperspace.index.IndexConstants +import com.microsoft.hyperspace.index.covering.CoveringIndexConfig +import com.microsoft.hyperspace.index.dataskipping.sketches._ +import com.microsoft.hyperspace.index.plans.logical.IndexHadoopFsRelation +import com.microsoft.hyperspace.shim.ExtractFileSourceScanExecRelation + +class 
DataSkippingIndexIntegrationTest extends DataSkippingSuite with IcebergTestUtils { import spark.implicits._ override val numParallelism: Int = 10 + test("MinMax index is applied for a filter query (EqualTo).") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1") + checkIndexApplied(query, 1) + } + + test("Empty relation is returned if no files match the index predicate.") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = -1") + checkIndexApplied(query, 0) + } + + test("MinMax index is applied for a filter query (EqualTo) with expression.") { + val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A + B"))) + def query: DataFrame = df.filter("A+B < 40") + checkIndexApplied(query, 2) + } + test("Non-deterministic expression is blocked.") { val df = createSourceData(spark.range(100).toDF("A")) val ex = intercept[HyperspaceException]( @@ -88,4 +119,554 @@ class DataSkippingIndexIntegrationTest extends DataSkippingSuite { "DataSkippingIndex does not support indexing an expression which does not " + "reference source columns: myfunc()")) } + + test("MinMax index is applied for a filter query (EqualTo) with UDF.") { + val df = createSourceData(spark.range(100).toDF("A")) + spark.udf.register("myfunc", (a: Int) => a * 2) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("myfunc(A)"))) + def query: DataFrame = df.filter("myfunc(A) = 10") + checkIndexApplied(query, 1) + } + + test("UDF matching is based on the name, not the actual lambda object.") { + val df = createSourceData(spark.range(100).toDF("A")) + spark.udf.register("myfunc", (a: Int) => a * 2) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("myfunc(A)"))) + // Register a new function with the same semantics. 
+ spark.udf.register("myfunc", (a: Int) => 2 * a) + def query: DataFrame = df.filter("myfunc(A) = 10") + checkIndexApplied(query, 1) + } + + test("MinMax index is not applied for a filter query if it is not applicable.") { + val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("B"))) + def query: DataFrame = df.filter("A = 1") + checkIndexApplied(query, numParallelism) + } + + test("MinMax index is not applied for a filter query if the filter condition is unsuitable.") { + val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1 or B = 2") + checkIndexApplied(query, numParallelism) + } + + test("MinMax index is not applied for a filter query if the filter condition is IsNull.") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A is null") + checkIndexApplied(query, numParallelism) + } + + test("Multiple indexes are applied to multiple filters.") { + val df = createSourceData(spark.range(100).toDF("A"), path = "TA") + val df2 = createSourceData(spark.range(100, 200).toDF("B"), path = "TB") + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + hs.createIndex(df2, DataSkippingIndexConfig("myind2", MinMaxSketch("B"))) + def query: DataFrame = df.filter("A = 10").union(df2.filter("B = 110")) + checkIndexApplied(query, 2) + } + + test("Single index is applied to multiple filters.") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 10").union(df.filter("A = 20")) + checkIndexApplied(query, 2) + } + + test("Single index is applied to a single filter.") { + val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 10").union(df.filter("B = 120")) + checkIndexApplied(query, numParallelism + 1) + } + + test( + "DataSkippingIndex works correctly for CSV where the same source data files can be " + + "interpreted differently.") { + // String order: 1 < 10 < 2 + // Int order: 1 < 2 < 10 + createFile(dataPath("1.csv"), Seq("a", "1", "2", "10").mkString("\n").getBytes()) + createFile(dataPath("2.csv"), Seq("a", "3", "4", "5").mkString("\n").getBytes()) + val paths = Seq(dataPath("1.csv").toString, dataPath("2.csv").toString) + val dfString = spark.read.option("header", "true").csv(paths: _*) + assert(dfString.schema.head.dataType === StringType) + val dfInt = spark.read.option("header", "true").option("inferSchema", "true").csv(paths: _*) + assert(dfInt.schema.head.dataType === IntegerType) + + withIndex("myind") { + hs.createIndex(dfString, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + checkIndexApplied(dfString.filter("A = 3"), 2) + checkIndexApplied(dfString.filter("A = 10"), 2) + checkIndexApplied(dfString.filter("A = '3'"), 1) + checkIndexApplied(dfString.filter("A = '10'"), 1) + checkIndexApplied(dfInt.filter("A = 3"), 2) + checkIndexApplied(dfInt.filter("A = 10"), 2) + checkIndexApplied(dfInt.filter("A = '3'"), 2) + checkIndexApplied(dfInt.filter("A = '10'"), 2) + } + withIndex("myind") { + hs.createIndex(dfInt, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + 
checkIndexApplied(dfString.filter("A = 3"), 2) + checkIndexApplied(dfString.filter("A = 10"), 2) + checkIndexApplied(dfString.filter("A = '3'"), 2) + checkIndexApplied(dfString.filter("A = '10'"), 2) + checkIndexApplied(dfInt.filter("A = 3"), 2) + checkIndexApplied(dfInt.filter("A = 10"), 1) + checkIndexApplied(dfInt.filter("A = '3'"), 2) + checkIndexApplied(dfInt.filter("A = '10'"), 1) + } + } + + test("MinMax index is applied for a filter query (EqualTo) with selection.") { + val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1").select("B") + checkIndexApplied(query, 1) + } + + test("MinMax index can be refreshed (mode = incremental).") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + createSourceData(spark.range(100, 200).toDF("A"), saveMode = SaveMode.Append) + hs.refreshIndex("myind", "incremental") + def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123") + checkIndexApplied(query, 2) + assert(numIndexDataFiles("myind") === 2) + } + + test("MinMax index can be refreshed (mode = full).") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + createSourceData(spark.range(100, 200).toDF("A"), saveMode = SaveMode.Append) + hs.refreshIndex("myind", "full") + def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123") + checkIndexApplied(query, 2) + assert(numIndexDataFiles("myind") === 1) + } + + test("MinMax index can be refreshed (mode = full) for partitioned data.") { + val df = createPartitionedSourceData( + spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B"), + Seq("B")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + createPartitionedSourceData( + spark.range(100, 200).selectExpr("id as A", "cast(id / 15 as int) as B"), + Seq("B"), + saveMode = SaveMode.Append) + hs.refreshIndex("myind", "full") + def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123") + checkIndexApplied(query, 2) + assert(numIndexDataFiles("myind") === 1) + } + + test( + "MinMax index can be applied without refresh when source files are added " + + "if hybrid scan is enabled.") { + withSQLConf( + IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true", + IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "1") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + createSourceData(spark.range(100, 200).toDF("A"), saveMode = SaveMode.Append) + def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123") + checkIndexApplied(query, 11) + } + } + + test("Empty source data does not cause an error.") { + val df = createSourceData(spark.range(0).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1") + checkIndexApplied(query, 1) + } + + test("Empty source data followed by refresh incremental works as expected.") { + val df = createSourceData(spark.range(0).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + createSourceData(spark.range(100).toDF("A"), saveMode = SaveMode.Append) + hs.refreshIndex("myind", "incremental") + def query: DataFrame = 
spark.read.parquet(dataPath().toString).filter("A = 1") + checkIndexApplied(query, 2) + } + + test("MinMax index can be optimized.") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + createSourceData(spark.range(100, 200).toDF("A"), saveMode = SaveMode.Append) + hs.refreshIndex("myind", "incremental") + assert(numIndexDataFiles("myind") === 2) + hs.optimizeIndex("myind") + assert(numIndexDataFiles("myind") === 1) + def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123") + checkIndexApplied(query, 2) + } + + test("CoveringIndex is applied if both CoveringIndex and DataSkippingIndex are applicable.") { + val df = createSourceData(spark.range(100).toDF("A")) + hs.createIndex(df, DataSkippingIndexConfig("ds", MinMaxSketch("A"))) + hs.createIndex(df, CoveringIndexConfig("ci", Seq("A"), Nil)) + spark.enableHyperspace + def query: DataFrame = df.filter("A = 1 or A = 50") + val rel = query.queryExecution.optimizedPlan.collect { + case LogicalRelation(rel: IndexHadoopFsRelation, _, _, _) => rel + } + assert(rel.map(_.indexName) === Seq("ci")) + checkAnswer(query, Seq(1, 50).toDF("A")) + } + + test("DataSkippingIndex is applied if CoveringIndex is not applicable.") { + val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")) + hs.createIndex(df, DataSkippingIndexConfig("ds", MinMaxSketch("A"))) + hs.createIndex(df, CoveringIndexConfig("ci", Seq("A"), Nil)) + spark.enableHyperspace + def query: DataFrame = df.filter("A = 1 or A = 50") + val rel = query.queryExecution.optimizedPlan.collect { + case LogicalRelation(rel: IndexHadoopFsRelation, _, _, _) => rel + } + assert(rel.map(_.indexName) === Seq("ds")) + checkAnswer(query, Seq((1, 2), (50, 100)).toDF("A", "B")) + } + + test("Both CoveringIndex and DataSkippnigIndex can be applied.") { + val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")) + hs.createIndex(df, CoveringIndexConfig("ci", Seq("A"), Nil)) + hs.createIndex(df, DataSkippingIndexConfig("ds", MinMaxSketch("B"))) + spark.enableHyperspace + def query: DataFrame = df.filter("A = 1").select("A").union(df.filter("B = 100").select("A")) + val rel = query.queryExecution.optimizedPlan.collect { + case LogicalRelation(rel: IndexHadoopFsRelation, _, _, _) => rel + } + assert(rel.map(_.indexName).sorted === Seq("ci", "ds")) + checkAnswer(query, Seq(1, 50).toDF("A")) + } + + test("DataSkippingIndex works correctly with files having special characters in their name.") { + assume(!Path.WINDOWS) + val df = createSourceData(spark.range(100).toDF("A"), "table ,.;'`~!@#$%^&()_+|\"<>") + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1") + checkIndexApplied(query, 1) + } + + test("DataSkippingIndex works correctly with catalog tables") { + withTable("T") { + spark.range(100).toDF("A").write.saveAsTable("T") + val df = spark.read.table("T") + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1") + checkIndexApplied(query, 1) + } + } + + test("DataSkippingIndex works correctly with partitioned data.") { + val df = createPartitionedSourceData( + spark.range(1000).selectExpr("cast(id/10 as int) as A", "id as B"), + Seq("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("B"))) + def query: DataFrame = df.filter("A = 1 or B = 100") + checkIndexApplied(query, 2) + } + + test( + "DataSkippingIndex works 
correctly with partitioned data " + + "with multiple partition columns.") { + val df = createPartitionedSourceData( + spark + .range(1000) + .selectExpr("cast(id/100 as int) as A", "cast(id/10 as int) as B", "id as C"), + Seq("A", "B")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("C"))) + def query: DataFrame = df.filter("A = 1 or B = 1 or C = 1") + checkIndexApplied(query, 12) + } + + test( + "DataSkippingIndex works correctly with partitioned data " + + "with a different filter condition.") { + val df = createPartitionedSourceData( + spark.range(1000).selectExpr("cast(id/200 as int)*200 as A", "id as B"), + Seq("A")) + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("B"))) + def query: DataFrame = df.filter("A = B") + checkIndexApplied(query, 5) + } + + test("DataSkippingIndex works correctly with Delta Lake tables.") { + withSQLConf( + "spark.hyperspace.index.sources.fileBasedBuilders" -> + "com.microsoft.hyperspace.index.sources.delta.DeltaLakeFileBasedSourceBuilder") { + val df = createSourceData(spark.range(100).toDF("A"), format = "delta") + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1") + checkIndexApplied(query, 1) + } + } + + test("DataSkippingIndex works correctly with partitioned Delta Lake tables.") { + withSQLConf( + "spark.hyperspace.index.sources.fileBasedBuilders" -> + "com.microsoft.hyperspace.index.sources.delta.DeltaLakeFileBasedSourceBuilder") { + val df = createPartitionedSourceData( + spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B"), + Seq("B"), + format = "delta") + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1 or B = 5") + checkIndexApplied(query, 2) + } + } + + test("DataSkippingIndex works correctly with Delta time travel.") { + withTable("T") { + withSQLConf( + "spark.hyperspace.index.sources.fileBasedBuilders" -> + "com.microsoft.hyperspace.index.sources.delta.DeltaLakeFileBasedSourceBuilder", + IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true", + IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "10", + IndexConstants.INDEX_HYBRID_SCAN_DELETED_RATIO_THRESHOLD -> "10") { + + // version 0 + spark.range(100).toDF("A").write.format("delta").save(dataPath("T").toString) + + // version 1 + spark + .range(100, 200) + .toDF("A") + .write + .format("delta") + .mode("append") + .save(dataPath("T").toString) + + // version 2 + spark + .range(200, 300) + .toDF("A") + .write + .format("delta") + .mode("append") + .save(dataPath("T").toString) + + val df = (v: Int) => + spark.read.format("delta").option("versionAsOf", v).load(dataPath("T").toString) + + // Create an index with version 1 data + hs.createIndex(df(1), DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + + def query0: DataFrame = df(0).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query0, 1) + + def query1: DataFrame = df(1).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query1, 2) + + def query2: DataFrame = df(2).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query2, 12) + } + } + } + + test("DataSkippingIndex works correctly with Delta time travel with partitions.") { + withTable("T") { + withSQLConf( + "spark.hyperspace.index.sources.fileBasedBuilders" -> + "com.microsoft.hyperspace.index.sources.delta.DeltaLakeFileBasedSourceBuilder", + IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true", + IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "10", + 
IndexConstants.INDEX_HYBRID_SCAN_DELETED_RATIO_THRESHOLD -> "10") { + + // version 0 + spark + .range(100) + .selectExpr("id as A", "cast(id / 10 as int) as B") + .write + .format("delta") + .save(dataPath("T").toString) + + // version 1 + spark + .range(100, 200) + .selectExpr("id as A", "cast(id / 15 as int) as B") + .write + .format("delta") + .mode("append") + .save(dataPath("T").toString) + + // version 2 + spark + .range(200, 300) + .selectExpr("id as A", "cast(id / 20 as int) as B") + .write + .format("delta") + .mode("append") + .save(dataPath("T").toString) + + val df = (v: Int) => + spark.read.format("delta").option("versionAsOf", v).load(dataPath("T").toString) + + // Create an index with version 1 data + hs.createIndex(df(1), DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + + def query0: DataFrame = df(0).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query0, 1) + + def query1: DataFrame = df(1).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query1, 2) + + def query2: DataFrame = df(2).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query2, 12) + } + } + } + + test("DataSkippingIndex works correctly with Iceberg tables.") { + withSQLConf( + "spark.hyperspace.index.sources.fileBasedBuilders" -> + "com.microsoft.hyperspace.index.sources.iceberg.IcebergFileBasedSourceBuilder") { + val data = spark.range(100).toDF("A") + createIcebergTable(dataPath("T").toString, data) + val df = createSourceData(data, format = "iceberg") + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1") + checkIndexApplied(query, 1) + } + } + + test("DataSkippingIndex works correctly with partitioned Iceberg tables.") { + withSQLConf( + "spark.hyperspace.index.sources.fileBasedBuilders" -> + "com.microsoft.hyperspace.index.sources.iceberg.IcebergFileBasedSourceBuilder") { + val data = spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B") + createIcebergTableWithPartitions(dataPath("T").toString, data, "B") + val df = createPartitionedSourceData(data, Seq("B"), format = "iceberg") + hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + def query: DataFrame = df.filter("A = 1") + checkIndexApplied(query, 1) + } + } + + test("DataSkippingIndex works correctly with Iceberg time travel.") { + withSQLConf( + "spark.hyperspace.index.sources.fileBasedBuilders" -> + "com.microsoft.hyperspace.index.sources.iceberg.IcebergFileBasedSourceBuilder", + IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true", + IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "10", + IndexConstants.INDEX_HYBRID_SCAN_DELETED_RATIO_THRESHOLD -> "10") { + + // version 0 + val data = spark.range(100).toDF("A") + val table = createIcebergTable(dataPath("T").toString, data) + data.write.format("iceberg").mode("overwrite").save(dataPath("T").toString) + val v0 = table.currentSnapshot.snapshotId + + // version 1 + spark + .range(100, 200) + .toDF("A") + .write + .format("iceberg") + .mode("append") + .save(dataPath("T").toString) + table.newTransaction().commitTransaction() + val v1 = table.currentSnapshot.snapshotId + + // version 2 + spark + .range(200, 300) + .toDF("A") + .write + .format("iceberg") + .mode("append") + .save(dataPath("T").toString) + table.newTransaction().commitTransaction() + val v2 = table.currentSnapshot.snapshotId + + val df = (v: Long) => + spark.read.format("iceberg").option("snapshot-id", v).load(dataPath("T").toString) + + // Create an index with version 1 data + 
hs.createIndex(df(v1), DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + + def query0: DataFrame = df(v0).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query0, 1) + + def query1: DataFrame = df(v1).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query1, 2) + + def query2: DataFrame = df(v2).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query2, 12) + } + } + + test("DataSkippingIndex works correctly with Iceberg time travel with partitions.") { + withSQLConf( + "spark.hyperspace.index.sources.fileBasedBuilders" -> + "com.microsoft.hyperspace.index.sources.iceberg.IcebergFileBasedSourceBuilder", + IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true", + IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "10", + IndexConstants.INDEX_HYBRID_SCAN_DELETED_RATIO_THRESHOLD -> "10") { + + // version 0 + val data = spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B") + val table = createIcebergTable(dataPath("T").toString, data) + data.write.format("iceberg").mode("overwrite").save(dataPath("T").toString) + val v0 = table.currentSnapshot.snapshotId + + // version 1 + spark + .range(100, 200) + .selectExpr("id as A", "cast(id / 15 as int) as B") + .write + .format("iceberg") + .mode("append") + .save(dataPath("T").toString) + table.newTransaction().commitTransaction() + val v1 = table.currentSnapshot.snapshotId + + // version 2 + spark + .range(200, 300) + .selectExpr("id as A", "cast(id / 20 as int) as B") + .write + .format("iceberg") + .mode("append") + .save(dataPath("T").toString) + table.newTransaction().commitTransaction() + val v2 = table.currentSnapshot.snapshotId + + val df = (v: Long) => + spark.read.format("iceberg").option("snapshot-id", v).load(dataPath("T").toString) + + // Create an index with version 1 data + hs.createIndex(df(v1), DataSkippingIndexConfig("myind", MinMaxSketch("A"))) + + def query0: DataFrame = df(v0).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query0, 1) + + def query1: DataFrame = df(v1).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query1, 2) + + def query2: DataFrame = df(v2).filter("A = 1 or A = 101 or A = 201") + checkIndexApplied(query2, 12) + } + } + + def checkIndexApplied(query: => DataFrame, numExpectedFiles: Int): Unit = { + withClue(s"query = ${query.queryExecution.logical}numExpectedFiles = $numExpectedFiles\n") { + spark.disableHyperspace + val queryWithoutIndex = query + queryWithoutIndex.collect() + spark.enableHyperspace + val queryWithIndex = query + queryWithIndex.collect() + checkAnswer(queryWithIndex, queryWithoutIndex) + assert(numAccessedFiles(queryWithIndex) === numExpectedFiles) + } + } + + def numIndexDataFiles(name: String): Int = { + val manager = Hyperspace.getContext(spark).indexCollectionManager + val latestVersion = manager.getIndexVersions(name, Seq("ACTIVE")).max + manager.getIndex(name, latestVersion).get.content.files.length + } } diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexTest.scala index 880757481..414e3c786 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexTest.scala @@ -16,56 +16,60 @@ package com.microsoft.hyperspace.index.dataskipping +import scala.util.parsing.json.JSON + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import 
org.apache.spark.sql.SaveMode import org.apache.spark.sql.functions.{input_file_name, max, min} -import org.apache.spark.sql.types.IntegerType +import org.apache.spark.sql.types.{IntegerType, StructType} import com.microsoft.hyperspace.HyperspaceException import com.microsoft.hyperspace.index.{Content, FileInfo, Index, IndexConstants} -import com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch +import com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch import com.microsoft.hyperspace.util.JsonUtils class DataSkippingIndexTest extends DataSkippingSuite { - override val numParallelism: Int = 3 + override val numParallelism: Int = 10 + + val emptyStructType = new StructType() test("""kind returns "DataSkippingIndex".""") { - val index = DataSkippingIndex(Seq(MinMaxSketch("A"))) + val index = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType) assert(index.kind === "DataSkippingIndex") } test("""kindAbbr returns "DS".""") { - val index = DataSkippingIndex(Seq(MinMaxSketch("A"))) + val index = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType) assert(index.kindAbbr === "DS") } test("indexedColumns returns indexed columns of sketches.") { - val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B"))) + val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")), emptyStructType) assert(index.indexedColumns === Seq("A", "B")) } test("referencedColumns returns indexed columns of sketches.") { - val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B"))) + val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")), emptyStructType) assert(index.referencedColumns === Seq("A", "B")) } test( "withNewProperties returns a new index which copies the original index except the " + "properties.") { - val index = DataSkippingIndex(Seq(MinMaxSketch("A"))) + val index = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType) val newIndex = index.withNewProperties(Map("foo" -> "bar")) assert(newIndex.properties === Map("foo" -> "bar")) assert(newIndex.sketches === index.sketches) } test("statistics returns a string-formatted list of sketches.") { - val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B"))) + val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")), emptyStructType) assert(index.statistics() === Map("sketches" -> "MinMax(A), MinMax(B)")) } test("canHandleDeletedFiles returns true.") { - val index = DataSkippingIndex(Seq(MinMaxSketch("A"))) + val index = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType) assert(index.canHandleDeletedFiles === true) } @@ -78,6 +82,14 @@ class DataSkippingIndexTest extends DataSkippingSuite { checkAnswer(writtenIndexData, indexData) } + test("write throws an exception if the index data schema doesn't match.") { + val sourceData = createSourceData(spark.range(100).toDF("A")) + val indexConfig = DataSkippingIndexConfig("myIndex", MinMaxSketch("A")) + val (index, _) = indexConfig.createIndex(ctx, sourceData, Map()) + val ex = intercept[IllegalArgumentException](index.write(ctx, spark.range(10).toDF("B"))) + assert(ex.getMessage.contains("Schema of the index data doesn't match the index schema")) + } + test("optimize reduces the number of index data files.") { val targetIndexDataFileSize = 100000 val expectedNumIndexDataFiles = 1 @@ -112,16 +124,15 @@ class DataSkippingIndexTest extends DataSkippingSuite { } } - test("write throws an exception if target index data file size is too small.") { - 
withSQLConf(IndexConstants.DATASKIPPING_TARGET_INDEX_DATA_FILE_SIZE -> "1") { + test("write does not create more files than maxIndexDataFileCount.") { + withSQLConf( + IndexConstants.DATASKIPPING_TARGET_INDEX_DATA_FILE_SIZE -> "1", + IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT -> "3") { val indexConfig = DataSkippingIndexConfig("myIndex", MinMaxSketch("A")) - val sourceData = createSourceData(spark.range(100).toDF("A")) + val sourceData = createSourceData(spark.range(10000).toDF("A")) val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map()) - val mockIndexData = RDDTestUtils.getMockDataFrameWithFakeSize(spark, 4000000000L) - val ex = intercept[HyperspaceException](index.write(ctx, mockIndexData)) - assert( - ex.getMessage.contains("Could not create index data files due to too many files: " + - "indexDataSize=4000000000, targetIndexDataFileSize=1")) + index.write(ctx, indexData) + assert(listFiles(indexDataPath).filter(isParquet).length === 3) } } @@ -231,38 +242,45 @@ class DataSkippingIndexTest extends DataSkippingSuite { } test("At least one sketch must be specified.") { - val ex = intercept[AssertionError](DataSkippingIndex(Nil)) + val ex = intercept[AssertionError](DataSkippingIndex(Nil, emptyStructType)) assert(ex.getMessage().contains("At least one sketch is required")) } test("Indexes are equal if they have the same sketches and data types.") { - val ds1 = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B"))) - val ds2 = DataSkippingIndex(Seq(MinMaxSketch("B"), MinMaxSketch("A"))) + val ds1 = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")), emptyStructType) + val ds2 = DataSkippingIndex(Seq(MinMaxSketch("B"), MinMaxSketch("A")), emptyStructType) assert(ds1 === ds2) assert(ds1.hashCode === ds2.hashCode) } test("Indexes are not equal to objects which are not indexes.") { - val ds = DataSkippingIndex(Seq(MinMaxSketch("A"))) + val ds = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType) assert(ds !== "ds") } test("Index can be serialized.") { - val ds = DataSkippingIndex(Seq(MinMaxSketch("A", Some(IntegerType))), Map("a" -> "b")) + val ds = DataSkippingIndex( + Seq(MinMaxSketch("A", Some(IntegerType))), + emptyStructType, + Map("a" -> "b")) val json = JsonUtils.toJson(ds) - assert( - json === - """|{ - | "type" : "com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex", - | "sketches" : [ { - | "type" : "com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch", - | "expr" : "A", - | "dataType" : "integer" - | } ], - | "properties" : { - | "a" : "b" - | } - |}""".stripMargin) + val expectedJson = + """|{ + | "type" : "com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex", + | "sketches" : [ { + | "type" : "com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch", + | "expr" : "A", + | "dataType" : "integer" + | } ], + | "schema" : { + | "type" : "struct", + | "fields" : [ ] + | }, + | "properties" : { + | "a" : "b" + | } + |}""".stripMargin + assert(JSON.parseFull(json) === JSON.parseFull(expectedJson)) } test("Index can be deserialized.") { @@ -270,16 +288,20 @@ class DataSkippingIndexTest extends DataSkippingSuite { """|{ | "type" : "com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex", | "sketches" : [ { - | "type" : "com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch", + | "type" : "com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch", | "expr" : "A", | "dataType" : "integer" | } ], + | "schema" : { + | "type" : "struct", + | "fields" : [ ] + | }, | "properties" 
: { | "a" : "b" | } |}""".stripMargin val ds = JsonUtils.fromJson[DataSkippingIndex](json) - assert(ds === DataSkippingIndex(Seq(MinMaxSketch("A", Some(IntegerType))))) + assert(ds === DataSkippingIndex(Seq(MinMaxSketch("A", Some(IntegerType))), emptyStructType)) assert(ds.properties === Map("a" -> "b")) } } diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingSuite.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingSuite.scala index 61750670c..f3b18a4e4 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingSuite.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingSuite.scala @@ -21,6 +21,9 @@ import scala.collection.AbstractIterator import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path, RemoteIterator} import org.apache.spark.sql.{DataFrame, QueryTest, SaveMode, SparkSession} +import org.apache.spark.sql.execution.DataSourceScanExec +import org.apache.spark.sql.execution.datasources.FilePartition +import org.apache.spark.sql.internal.SQLConf import com.microsoft.hyperspace.Hyperspace import com.microsoft.hyperspace.index._ @@ -52,6 +55,7 @@ trait DataSkippingSuite extends QueryTest with HyperspaceSuite { after { FileUtils.delete(tempDir) + spark.catalog.clearCache() } def dataPath(path: String = "T"): Path = new Path(dataPathRoot, path) @@ -60,16 +64,43 @@ trait DataSkippingSuite extends QueryTest with HyperspaceSuite { originalData: DataFrame, path: String = "T", saveMode: SaveMode = SaveMode.Overwrite, - appendedDataOnly: Boolean = false): DataFrame = { + appendedDataOnly: Boolean = false, + format: String = "parquet"): DataFrame = { val p = dataPath(path) val oldFiles = listFiles(p).toSet - originalData.write.mode(saveMode).parquet(p.toString) + originalData.write.mode(saveMode).format(format).save(p.toString) updateFileIdTracker(p) if (appendedDataOnly) { val newFiles = listFiles(p).filterNot(oldFiles.contains) - spark.read.parquet(newFiles.map(_.getPath.toString): _*) + spark.read.format(format).load(newFiles.map(_.getPath.toString): _*) } else { - spark.read.parquet(p.toString) + spark.read.format(format).load(p.toString) + } + } + + def createPartitionedSourceData( + originalData: DataFrame, + partitioningColumns: Seq[String], + path: String = "T", + saveMode: SaveMode = SaveMode.Overwrite, + appendedDataOnly: Boolean = false, + format: String = "parquet"): DataFrame = { + val p = dataPath(path) + val oldFiles = listFiles(p).toSet + originalData.write + .partitionBy(partitioningColumns: _*) + .mode(saveMode) + .format(format) + .save(p.toString) + updateFileIdTracker(p) + if (appendedDataOnly) { + val newFiles = listFiles(p).filterNot(oldFiles.contains) + spark.read + .option("basePath", p.toString) + .format(format) + .load(newFiles.map(_.getPath.toString): _*) + } else { + spark.read.format(format).load(p.toString) } } @@ -110,5 +141,55 @@ trait DataSkippingSuite extends QueryTest with HyperspaceSuite { fs.delete(path, true) } + def createFile(path: Path, data: Array[Byte]): Unit = { + val fs = path.getFileSystem(new Configuration) + val out = fs.create(path) + out.write(data) + out.close() + } + def isParquet: FileStatus => Boolean = _.getPath.getName.endsWith(".parquet") + + def withAndWithoutCodegen(testFun: => Unit): Unit = { + import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode._ + Seq(false, true).foreach { codegenEnabled => + withClue(s"codegenEnabled = $codegenEnabled") { + val mode = if 
(codegenEnabled) CODEGEN_ONLY else NO_CODEGEN + withSQLConf( + SQLConf.CODEGEN_FACTORY_MODE.key -> mode.toString, + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled.toString) { + testFun + } + } + } + } + + def createIndexLogEntry(indexConfig: IndexConfigTrait, sourceData: DataFrame): IndexLogEntry = { + val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map()) + index.write(ctx, indexData) + IndexLogEntry( + indexConfig.indexName, + index, + Content.fromDirectory(indexDataPath, fileIdTracker, new Configuration), + Source( + SparkPlan(SparkPlan.Properties( + Seq(RelationUtils + .getRelation(spark, sourceData.queryExecution.optimizedPlan) + .createRelationMetadata(fileIdTracker)), + null, + null, + LogicalPlanFingerprint( + LogicalPlanFingerprint.Properties(Seq(Signature("sp", "sig"))))))), + Map.empty) + } + + def numAccessedFiles(df: DataFrame): Int = { + df.queryExecution.executedPlan.collect { + case scan: DataSourceScanExec => + val files = scan.inputRDDs + .flatMap( + _.partitions.flatMap(_.asInstanceOf[FilePartition].files.map(_.filePath).toSet)) + files.length + }.sum + } } diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/RDDTestUtils.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/RDDTestUtils.scala deleted file mode 100644 index 1a7c548cc..000000000 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/RDDTestUtils.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (2021) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.index.dataskipping - -import org.apache.spark.SparkContext -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.storage.RDDInfo -import org.mockito.Mockito.{mock, spy, when} - -object RDDTestUtils { - def getMockDataFrameWithFakeSize(spark: SparkSession, size: Long): DataFrame = { - val df = spy(spark.emptyDataFrame) - val rdd = spy(df.rdd) - val mockSparkContext = mock(classOf[SparkContext]) - val mockRddStorageInfo = mock(classOf[RDDInfo]) - when(df.rdd).thenReturn(rdd) - when(rdd.id).thenReturn(42) - when(rdd.context).thenReturn(mockSparkContext) - when(mockSparkContext.getRDDStorageInfo).thenReturn(Array[RDDInfo](mockRddStorageInfo)) - when(mockRddStorageInfo.id).thenReturn(42) - when(mockRddStorageInfo.memSize).thenReturn(size) - df - } -} diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/execution/DataSkippingFileIndexTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/execution/DataSkippingFileIndexTest.scala new file mode 100644 index 000000000..3e2d32915 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/execution/DataSkippingFileIndexTest.scala @@ -0,0 +1,91 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.execution + +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.types._ +import org.mockito.Mockito.{mock, verify, when} + +import com.microsoft.hyperspace.index.dataskipping._ +import com.microsoft.hyperspace.index.dataskipping.sketches._ + +class DataSkippingFileIndexTest extends DataSkippingSuite { + test("DataSkippingFileIndex delegates methods to the FileIndex it is based on.") { + val baseFileIndex = mock(classOf[FileIndex]) + val dsFileIndex = new DataSkippingFileIndex( + spark, + fileIdTracker, + spark.emptyDataFrame, + Literal.TrueLiteral, + baseFileIndex) + when(baseFileIndex.rootPaths).thenReturn(Seq(new Path("x"))) + assert(dsFileIndex.rootPaths === Seq(new Path("x"))) + when(baseFileIndex.inputFiles).thenReturn(Array("x/a", "x/b")) + assert(dsFileIndex.inputFiles === Array("x/a", "x/b")) + dsFileIndex.refresh() + verify(baseFileIndex).refresh() + when(baseFileIndex.sizeInBytes).thenReturn(12345) + assert(dsFileIndex.sizeInBytes === 12345) + val structType = StructType(StructField("A", IntegerType) :: Nil) + when(baseFileIndex.partitionSchema).thenReturn(structType) + assert(dsFileIndex.partitionSchema === structType) + when(baseFileIndex.metadataOpsTimeNs).thenReturn(Some(100L)) + assert(dsFileIndex.metadataOpsTimeNs === Some(100L)) + } + + test("listFiles returns partition directories with filtered files.") { + val dsFileIndex = createDataSkippingFileIndex( + spark.range(100).toDF("A"), + MinMaxSketch("A"), + LessThanOrEqual(UnresolvedAttribute("MinMax_A__0"), Literal(1))) + val selectedPartitions = dsFileIndex.listFiles(Nil, Nil) + val allPartitions = dsFileIndex.baseFileIndex.listFiles(Nil, Nil) + assert(partitionsSize(selectedPartitions) < partitionsSize(allPartitions)) + assert(partitionsContain(allPartitions, selectedPartitions)) + } + + def createDataSkippingFileIndex( + df: DataFrame, + sketch: Sketch, + indexDataPred: Expression): DataSkippingFileIndex = { + val sourceData = createSourceData(df) + val baseFileIndex = sourceData.queryExecution.optimizedPlan.collectFirst { + case LogicalRelation(HadoopFsRelation(location, _, _, _, _, _), _, _, _) => location + }.get + val (index, indexData) = + DataSkippingIndexConfig("myind", sketch).createIndex(ctx, sourceData, Map.empty) + new DataSkippingFileIndex(spark, fileIdTracker, indexData, indexDataPred, baseFileIndex) + } + + def partitionsSize(partitions: Seq[PartitionDirectory]): Long = { + partitions.flatMap(_.files.map(_.getLen)).sum + } + + def partitionsContain( + partitions: Seq[PartitionDirectory], + others: Seq[PartitionDirectory]): Boolean = { + val partitionsMap = partitions.map(pd => (pd.values, pd.files)).toMap + others.forall { pd => + val files = partitionsMap.getOrElse(pd.values, Nil).toSet + pd.files.forall(f => files.contains(f)) + } + } +} diff --git 
a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/ExpressionUtilsTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionUtilsTest.scala similarity index 83% rename from src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/ExpressionUtilsTest.scala rename to src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionUtilsTest.scala index 7346da331..1de963331 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/ExpressionUtilsTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExpressionUtilsTest.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.microsoft.hyperspace.index.dataskipping.util +package com.microsoft.hyperspace.index.dataskipping.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ @@ -37,6 +37,22 @@ class ExpressionUtilsTest extends HyperspaceSuite { assert(ExpressionUtils.normalize(expr) === expected) } + test("normalize removes expressions inserted for UDF.") { + val arg = AttributeReference("A", IntegerType)(ExprId(42), Seq("t")) + val func = (x: Int) => x + 1 + val expr = If( + IsNull(arg), + Literal(null, IntegerType), + ScalaUDF(func, IntegerType, Seq(KnownNotNull(arg)), Nil)) + val expected = + ScalaUDF( + func, + IntegerType, + Seq(arg.withExprId(ExpressionUtils.nullExprId).withQualifier(Nil)), + Nil) + assert(ExpressionUtils.normalize(expr) === expected) + } + test("ExtractIsNullDisjunction matches IsNull.") { val expr = IsNull(Literal(null)) val args = expr match { diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExtractorsTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExtractorsTest.scala new file mode 100644 index 000000000..3973e7fda --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/ExtractorsTest.scala @@ -0,0 +1,205 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.microsoft.hyperspace.index.dataskipping.expressions + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.{BooleanType, IntegerType} +import org.mockito.ArgumentMatchers.any +import org.mockito.Mockito.{mock, when} + +import com.microsoft.hyperspace.index.HyperspaceSuite + +class ExtractorsTest extends HyperspaceSuite { + val a = AttributeReference("a", IntegerType)() + val b = AttributeReference("b", IntegerType)() + val c = AttributeReference("c", IntegerType)() + val d = AttributeReference("d", BooleanType)() + val e = AttributeReference("e", BooleanType)() + + val aa = Literal(0) + val bb = Literal(1) + val dd = Literal(true) + + def getExtractor(x: Expression, y: Expression): ExpressionExtractor = { + val extractor = mock(classOf[ExpressionExtractor]) + when(extractor.unapply(any())).thenReturn(None) + when(extractor.unapply(x)).thenReturn(Some(y)) + extractor + } + + val aExtractor = getExtractor(a, aa) + val bExtractor = getExtractor(b, bb) + val dExtractor = getExtractor(d, dd) + + test("EqualToExtractor matches EqualTo(a, b).") { + assert(EqualToExtractor(aExtractor, bExtractor).unapply(EqualTo(a, b)) === Some((aa, bb))) + } + + test("EqualToExtractor matches EqualTo(b, a).") { + assert(EqualToExtractor(aExtractor, bExtractor).unapply(EqualTo(b, a)) === Some((aa, bb))) + } + + test("EqualToExtractor does not match EqualTo(a, c).") { + assert(EqualToExtractor(aExtractor, bExtractor).unapply(EqualTo(a, c)) === None) + } + + test("LessThanExtractor matches LessThan(a, b).") { + assert(LessThanExtractor(aExtractor, bExtractor).unapply(LessThan(a, b)) === Some((aa, bb))) + } + + test("LessThanExtractor matches GreaterThan(b, a).") { + assert( + LessThanExtractor(aExtractor, bExtractor).unapply(GreaterThan(b, a)) === + Some((aa, bb))) + } + + test("LessThanExtractor does not match LessThan(b, a).") { + assert(LessThanExtractor(aExtractor, bExtractor).unapply(LessThan(b, a)) === None) + } + + test("LessThanOrEqualExtractor matches LessThanOrEqual(a, b).") { + assert( + LessThanOrEqualExtractor(aExtractor, bExtractor).unapply(LessThanOrEqual(a, b)) === + Some((aa, bb))) + } + + test("LessThanOrEqualExtractor matches GreaterThanOrEqual(b, a).") { + assert( + LessThanOrEqualExtractor(aExtractor, bExtractor).unapply(GreaterThanOrEqual(b, a)) === + Some((aa, bb))) + } + + test("LessThanOrEqualExtractor does not match LessThanOrEqual(b, a).") { + assert( + LessThanOrEqualExtractor(aExtractor, bExtractor).unapply(LessThanOrEqual(b, a)) === + None) + } + + test("IsNullExtractor matches IsNull(a).") { + assert(IsNullExtractor(aExtractor).unapply(IsNull(a)) === Some(aa)) + } + + test("IsNullExtractor matches EqualNullSafe(a, null).") { + assert(IsNullExtractor(aExtractor).unapply(EqualNullSafe(a, Literal(null))) === Some(aa)) + } + + test("IsNullExtractor matches EqualNullSafe(null, a).") { + assert(IsNullExtractor(aExtractor).unapply(EqualNullSafe(Literal(null), a)) === Some(aa)) + } + + test("IsNullExtractor does not match IsNull(c).") { + assert(IsNullExtractor(aExtractor).unapply(IsNull(c)) === None) + } + + test("IsNotNullExtractor matches IsNotNull(a).") { + assert(IsNotNullExtractor(aExtractor).unapply(IsNotNull(a)) === Some(aa)) + } + + test("IsNotNullExtractor matches Not(IsNull(a)).") { + assert(IsNotNullExtractor(aExtractor).unapply(Not(IsNull(a))) === Some(aa)) + } + + test("IsNotNullExtractor matches Not(EqualNullSafe(a, null)).") { + assert( + IsNotNullExtractor(aExtractor).unapply(Not(EqualNullSafe(a, Literal(null)))) === + 
Some(aa)) + } + + test("IsNotNullExtractor matches Not(EqualNullSafe(null, a)).") { + assert( + IsNotNullExtractor(aExtractor).unapply(Not(EqualNullSafe(Literal(null), a))) === + Some(aa)) + } + + test("IsNotNullExtractor does not match IsNotNull(c).") { + assert(IsNotNullExtractor(aExtractor).unapply(IsNotNull(c)) === None) + } + + test("IsTrueExtractor matches d.") { + assert(IsTrueExtractor(dExtractor).unapply(d) === Some(dd)) + } + + test("IsTrueExtractor matches EqualTo(d, true).") { + assert(IsTrueExtractor(dExtractor).unapply(EqualTo(d, Literal(true))) === Some(dd)) + } + + test("IsTrueExtractor matches EqualTo(true, d).") { + assert(IsTrueExtractor(dExtractor).unapply(EqualTo(Literal(true), d)) === Some(dd)) + } + + test("IsTrueExtractor does not match e.") { + assert(IsTrueExtractor(dExtractor).unapply(e) === None) + } + + test("IsTrueExtractor does not match a.") { + assert(IsTrueExtractor(dExtractor).unapply(a) === None) + } + + test("IsFalseExtractor matches Not(d).") { + assert(IsFalseExtractor(dExtractor).unapply(Not(d)) === Some(dd)) + } + + test("IsFalseExtractor matches EqualTo(d, false).") { + assert(IsFalseExtractor(dExtractor).unapply(EqualTo(d, Literal(false))) === Some(dd)) + } + + test("IsFalseExtractor matches EqualTo(false, d).") { + assert(IsFalseExtractor(dExtractor).unapply(EqualTo(Literal(false), d)) === Some(dd)) + } + + test("IsFalseExtractor does not match Not(e).") { + assert(IsFalseExtractor(dExtractor).unapply(Not(e)) === None) + } + + test("IsFalseExtractor does not match a.") { + assert(IsFalseExtractor(dExtractor).unapply(a) === None) + } + + test("InExtractor matches In(a, Seq()).") { + assert(InExtractor(aExtractor, bExtractor).unapply(In(a, Seq())) === Some((aa, Seq()))) + } + + test("InExtractor matches In(a, Seq(b)).") { + assert(InExtractor(aExtractor, bExtractor).unapply(In(a, Seq(b))) === Some((aa, Seq(bb)))) + } + + test("InExtractor matches In(a, Seq(b, b)).") { + assert( + InExtractor(aExtractor, bExtractor).unapply(In(a, Seq(b, b))) === + Some((aa, Seq(bb, bb)))) + } + + test("InExtractor does not match In(a, Seq(b, c)).") { + assert(InExtractor(aExtractor, bExtractor).unapply(In(a, Seq(b, c))) === None) + } + + test("InExtractor does not match In(c, Seq(b, b)).") { + assert(InExtractor(aExtractor, bExtractor).unapply(In(c, Seq(b, b))) === None) + } + + test("InSetExtractor matches InSet(a, Set()).") { + assert(InSetExtractor(aExtractor).unapply(InSet(a, Set[Any]())) === Some((aa, Set[Any]()))) + } + + test("InSetExtractor matches InSet(a, Set(1)).") { + assert(InSetExtractor(aExtractor).unapply(InSet(a, Set[Any](1))) === Some((aa, Set[Any](1)))) + } + + test("InSetExtractor does not match InSet(c, Set(1)).") { + assert(InSetExtractor(aExtractor).unapply(InSet(c, Set[Any](1))) === None) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/NormalizedExprMatcherTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/NormalizedExprExtractorTest.scala similarity index 74% rename from src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/NormalizedExprMatcherTest.scala rename to src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/NormalizedExprExtractorTest.scala index 4a251c078..b5109e592 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/NormalizedExprMatcherTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/NormalizedExprExtractorTest.scala @@ -14,23 +14,25 @@ * limitations under the License. 
*/ -package com.microsoft.hyperspace.index.dataskipping.util +package com.microsoft.hyperspace.index.dataskipping.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ import com.microsoft.hyperspace.index.HyperspaceSuite -class NormalizedExprMatcherTest extends HyperspaceSuite { - val matcher = NormalizedExprMatcher( +class NormalizedExprExtractorTest extends HyperspaceSuite { + val extractor = NormalizedExprExtractor( AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId, Nil), Map(ExprId(42) -> "A")) test("apply returns true if the expression matches.") { - assert(matcher(AttributeReference("a", IntegerType)(ExprId(42), Nil)) === true) + assert( + extractor.unapply(AttributeReference("a", IntegerType)(ExprId(42), Nil)) === + Some(extractor.expr)) } test("apply returns false if the expression does not match") { - assert(matcher(Literal(42)) === false) + assert(extractor.unapply(Literal(42)) === None) } } diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayLowerBoundTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayLowerBoundTest.scala similarity index 91% rename from src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayLowerBoundTest.scala rename to src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayLowerBoundTest.scala index 36aa5e528..baeb68da6 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/SortedArrayLowerBoundTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/expressions/SortedArrayLowerBoundTest.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.microsoft.hyperspace.index.dataskipping.util +package com.microsoft.hyperspace.index.dataskipping.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen._ @@ -84,4 +84,10 @@ class SortedArrayLowerBoundTest checkEvaluation(expr(IntegerType, Seq(1, 3), 4), null) checkEvaluation(expr(IntegerType, Seq(1, 3, 5), 6), null) } + + test("SortedArrayLowerBound returns null if the value is null.") { + checkEvaluation(expr(IntegerType, Seq(1), null), null) + checkEvaluation(expr(IntegerType, Seq(1, 3), null), null) + checkEvaluation(expr(IntegerType, Seq(1, 3, 5), null), null) + } } diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/ApplyDataSkippingIndexTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/ApplyDataSkippingIndexTest.scala new file mode 100644 index 000000000..98dca4f18 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/ApplyDataSkippingIndexTest.scala @@ -0,0 +1,372 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.microsoft.hyperspace.index.dataskipping.rules + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.hyperspace.utils.logicalPlanToDataFrame + +import com.microsoft.hyperspace.index._ +import com.microsoft.hyperspace.index.dataskipping._ +import com.microsoft.hyperspace.index.dataskipping.execution.DataSkippingFileIndex +import com.microsoft.hyperspace.index.dataskipping.sketches._ + +class ApplyDataSkippingIndexTest extends DataSkippingSuite { + import spark.implicits._ + + override val numParallelism: Int = 10 + + test("applyIndex returns the unmodified plan if no index is given.") { + val sourceData = createSourceData(spark.range(100).toDF("A")) + val query = sourceData.filter("A = 1") + val plan = query.queryExecution.optimizedPlan + assert(ApplyDataSkippingIndex.applyIndex(plan, Map.empty) === plan) + } + + test("score returns 0 if no index is given.") { + val sourceData = createSourceData(spark.range(100).toDF("A")) + val query = sourceData.filter("A = 1") + val plan = query.queryExecution.optimizedPlan + assert(ApplyDataSkippingIndex.score(plan, Map.empty) === 0) + } + + case class SourceData(df: () => DataFrame, description: String) + + case class Param( + sourceData: SourceData, + filter: String, + sketches: Seq[Sketch], + numExpectedFiles: Int, + setup: Option[() => _]) + + object Param { + def apply( + sourceData: SourceData, + filter: String, + sketch: Sketch, + numExpectedFiles: Int): Param = { + Param(sourceData, filter, Seq(sketch), numExpectedFiles, None) + } + + def apply( + sourceData: SourceData, + filter: String, + sketches: Seq[Sketch], + numExpectedFiles: Int): Param = { + Param(sourceData, filter, sketches, numExpectedFiles, None) + } + + def apply( + sourceData: SourceData, + filter: String, + sketch: Sketch, + numExpectedFiles: Int, + setup: () => _): Param = { + Param(sourceData, filter, Seq(sketch), numExpectedFiles, Some(setup)) + } + + def apply( + sourceData: SourceData, + filter: String, + sketches: Seq[Sketch], + numExpectedFiles: Int, + setup: () => _): Param = { + Param(sourceData, filter, sketches, numExpectedFiles, Some(setup)) + } + } + + def dataI: SourceData = + SourceData(() => createSourceData(spark.range(100).toDF("A")), "source [A:Int]") + + def dataII: SourceData = + SourceData( + () => createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")), + "source [A:Int, B:Int]") + + def dataIN: SourceData = + SourceData( + () => + createSourceData( + Seq[Integer](1, 2, null, null, null, null, 7, 8, 9, null, 11, 12, null, 14, null, null, + 17, null, 19, 20).toDF("A")), + "source [A:Int] with nulls") + + def dataIIP: SourceData = + SourceData( + () => + createPartitionedSourceData( + spark.range(100).selectExpr("cast(id / 10 as int) as A", "id as B"), + Seq("A")), + "source [A:Int, B:Int] partitioned") + + def dataD: SourceData = + SourceData( + () => createSourceData(spark.range(100).map(_.toDouble).toDF("A")), + "source [A:Double]") + + def dataDS: SourceData = + SourceData( + () => + createSourceData( + Seq( + 0.0, + 1.0, + 1.5, + Double.NegativeInfinity, + Double.PositiveInfinity, + Double.NaN, + 3.14, + 2.718, + -1.1, + -0.0).toDF("A")), + "source [A:Double] small") + + def dataN2: SourceData = + SourceData( + () => + createSourceData( + spark.read.json(Seq( + """{"a": 1, "b": {"a": 0, "c": 2, "d": "x"}}""", + """{"a": 2, "b": {"a": 0, "c": 3, "d": "y"}}""", + """{"a": 3, "b": {"a": 1, "c": 4, "d": 
"x"}}""", + """{"a": 4, "b": {"a": 2, "c": null, "d": "x"}}""", + """{"a": 2, "b": {"a": 2, "c": 6, "d": "x"}}""", + """{"a": 2, "b": {"a": 1, "c": 7, "d": "x"}}""", + """{"b": {"c": 8, "d": "x"}}""", + """{"b": {"d": "y"}}""", + """{"a": 3}""", + """{"b": {"c": 11}}""").toDS)), + "source [A:Int, B:[A:Int, C:Int, D: String]]") + + def dataN3: SourceData = + SourceData( + () => + createSourceData( + spark.read.json(Seq( + """{"a": {"b": {"c": 1}}}""", + """{"a": {"b": {"c": 2}}}""", + """{"a": {"b": {"c": 3}}}""", + """{"a": {"b": {"c": null}}}""", + """{"a": {"b": {"c": 5}}}""", + """{"a": {"b": {"c": 6}}}""", + """{"a": {"b": {"c": 7}}}""", + """{"a": {"b": {"c": 8}}}""", + """{"a": null}""", + """{"a": {"b": {"c": 0}}}""").toDS)), + "source [A:[B:[C:Int]]]") + + def dataB: SourceData = + SourceData( + () => + createSourceData( + Seq( + Array[Byte](0, 0, 0, 0), + Array[Byte](0, 1, 0, 1), + Array[Byte](1, 2, 3, 4), + Array[Byte](5, 6, 7, 8), + Array[Byte](32, 32, 32, 32), + Array[Byte](64, 64, 64, 64), + Array[Byte](1, 1, 1, 1), + Array[Byte](-128, -128, -128, -128), + Array[Byte](127, 127, 127, 127), + Array[Byte](-1, 1, 0, 0)).toDF("A")), + "source [A:Binary]") + + def dataS: SourceData = + SourceData( + () => + createSourceData( + Seq( + "foo1", + "foo2000", + "foo3", + "foo4", + "foo5", + null, + "foo7", + "foo8", + "foo9", + "baar", + null) + .toDF("A")), + "source [A:String]") + + Seq( + Param(dataI, "A = 10", MinMaxSketch("A"), 1), + Param(dataI, "50 = a", MinMaxSketch("A"), 1), + Param(dataI, "A = -10", MinMaxSketch("a"), 0), + Param(dataI, "A = 5 + 5", MinMaxSketch("A"), 1), + Param(dataI, "A = 10 or A = 30", MinMaxSketch("A"), 2), + Param(dataI, "A is null", MinMaxSketch("A"), 10), + Param(dataI, "!(A is null)", MinMaxSketch("A"), 10), + Param(dataI, "A is not null", MinMaxSketch("A"), 10), + Param(dataI, "!(A is not null)", MinMaxSketch("A"), 10), + Param(dataI, "A <=> 10", MinMaxSketch("A"), 1), + Param(dataI, "10 <=> A", MinMaxSketch("A"), 1), + Param(dataI, "A <=> null", MinMaxSketch("A"), 10), + Param(dataI, "A <25", MinMaxSketch("A"), 3), + Param(dataI, "30>A", MinMaxSketch("A"), 3), + Param(dataI, "31 > A", MinMaxSketch("a"), 4), + Param(dataI, "A > 25", MinMaxSketch("a"), 8), + Param(dataI, "28 < A", MinMaxSketch("a"), 8), + Param(dataI, "29< A", MinMaxSketch("A"), 7), + Param(dataI, "A <= 25", MinMaxSketch("A"), 3), + Param(dataI, "29 >= A", MinMaxSketch("A"), 3), + Param(dataI, "30>=A", MinMaxSketch("A"), 4), + Param(dataI, "A >= 25", MinMaxSketch("A"), 8), + Param(dataI, "29 <= A", MinMaxSketch("A"), 8), + Param(dataI, "30 <= A", MinMaxSketch("A"), 7), + Param(dataI, "A != 1", MinMaxSketch("A"), 10), + Param(dataI, "not (A != 1 and A != 10)", MinMaxSketch("A"), 2), + Param(dataI, "!(!(A = 1))", MinMaxSketch("A"), 1), + Param(dataI, "!(A < 20)", MinMaxSketch("A"), 8), + Param(dataI, "not (A not in (1, 2, 3))", MinMaxSketch("A"), 1), + Param(dataS, "A < 'foo'", MinMaxSketch("A"), 1), + Param(dataI, "a = 10", MinMaxSketch("A"), 1), + Param(dataI, "A = 10", MinMaxSketch("a"), 1), + Param(dataI, "A in (1, 2, 3, null, 10)", MinMaxSketch("A"), 2), + Param(dataI, "A in (10,9,8,7,6,5,4,3,2,1,50,49,48,47,46,45)", MinMaxSketch("A"), 4), + Param(dataS, "A in ('foo1', 'foo5', 'foo9')", MinMaxSketch("A"), 3), + Param( + dataS, + "A in ('foo1','a','b','c','d','e','f','g','h','i','j','k')", + MinMaxSketch("A"), + 1), + Param(dataD, "A in (1,2,3,15,16,17)", MinMaxSketch("A"), 2), + Param(dataD, "A in (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)", MinMaxSketch("A"), 2), + Param(dataB, "A in 
(x'00000000', x'0001', x'0002', x'05060708')", MinMaxSketch("A"), 2), + Param( + dataB, + "A in (x'00',x'01',x'02',x'03',x'04',x'05',x'06',x'07',x'08',x'09',x'0a',x'20202020')", + MinMaxSketch("A"), + 1), + Param(dataI, "A BETWEEN 27 AND 51", MinMaxSketch("A"), 4), + Param(dataI, "IF(A=1,2,3)=2", MinMaxSketch("A"), 10), + Param(dataII, "A = 10 OR B = 50", Seq(MinMaxSketch("A"), MinMaxSketch("B")), 2), + Param(dataII, "A = 10 or B = 50", Seq(MinMaxSketch("A")), 10), + Param(dataII, "B = 50 or A = 10", Seq(MinMaxSketch("A")), 10), + Param(dataII, "A = 10 and B = 20", MinMaxSketch("A"), 1), + Param(dataII, "a = 10 AND b = 20", Seq(MinMaxSketch("A"), MinMaxSketch("B")), 1), + Param(dataII, "A < 30 and B > 20", MinMaxSketch("A"), 3), + Param(dataII, "A < 30 and b > 40", Seq(MinMaxSketch("a"), MinMaxSketch("B")), 1), + Param(dataII, "A = 10 and B = 90", Seq(MinMaxSketch("A"), MinMaxSketch("B")), 0), + Param(dataIN, "A is not null", MinMaxSketch("A"), 7), + Param(dataIN, "!(A <=> null)", MinMaxSketch("A"), 7), + Param(dataIN, "A = 2", MinMaxSketch("A"), 1), + Param(dataIN, "A is null", MinMaxSketch("A"), 10), + Param(dataIIP, "B = 10", MinMaxSketch("B"), 1), + Param(dataIIP, "A = 5 and B = 20", MinMaxSketch("B"), 0), + Param(dataIIP, "A < 5 and B = 20", MinMaxSketch("B"), 1), + Param(dataN2, "B.C = 2", MinMaxSketch("B.C"), 1), + Param(dataN2, "B.c = 2", MinMaxSketch("b.C"), 1), + Param(dataN2, "b.c < 5", MinMaxSketch("b.c"), 3), + Param(dataN3, "A.B.C = 2", MinMaxSketch("a.B.C"), 1), + Param(dataDS, "A = 1.0", MinMaxSketch("A"), 1), + Param(dataDS, "A <= 1.5", MinMaxSketch("A"), 6), + Param(dataDS, "A >= 1.5", MinMaxSketch("A"), 5), + Param(dataD, "A in (1, 2, 3, 10)", MinMaxSketch("A"), 2), + Param(dataII, "A + B < 100", MinMaxSketch("a+b"), 4), + Param( + dataI, + "F(A) = 10", + MinMaxSketch("F(A)"), + 1, + () => spark.udf.register("F", (a: Int) => a * 2)), + Param( + dataI, + "is_less_than_23(A)", + MinMaxSketch("is_less_than_23(A)"), + 3, + () => spark.udf.register("is_less_than_23", (a: Int) => a < 23)), + Param( + dataI, + "!is_less_than_23(A)", + MinMaxSketch("is_less_than_23(A)"), + 8, + () => spark.udf.register("is_less_than_23", (a: Int) => a < 23)), + Param( + dataII, + "A < 50 and F(A,B) < 20", + Seq(MinMaxSketch("A"), MinMaxSketch("F(A,B)")), + 2, + () => spark.udf.register("F", (a: Int, b: Int) => b - a)), + Param( + dataI, + "f(a) < 30", + MinMaxSketch("F(a)"), + 2, + () => spark.udf.register("F", (a: Int) => a * 2)), + Param( + dataI, + "IF(A IS NULL,NULL,F(A))=2", + MinMaxSketch("A"), + 10, + () => spark.udf.register("F", (a: Int) => a * 2))).foreach { + case Param(sourceData, filter, sketches, numExpectedFiles, setup) => + test( + s"applyIndex works as expected for ${sourceData.description}: " + + s"filter=[$filter], sketches=[${sketches.mkString(", ")}], " + + s"numExpectedFiles=[$numExpectedFiles]") { + val indexConfig = DataSkippingIndexConfig("ind1", sketches.head, sketches.tail: _*) + if (setup.nonEmpty) { + setup.get.apply() + } + testApplyIndex(sourceData.df(), filter, indexConfig, numExpectedFiles) + } + } + + def testApplyIndex( + sourceData: DataFrame, + filter: String, + indexConfig: DataSkippingIndexConfig, + numExpectedFiles: Int): Unit = { + val originalNumFiles = listFiles(dataPath()).filter(isParquet).length + val query = sourceData.filter(filter) + val plan = query.queryExecution.optimizedPlan + val indexLogEntry = createIndexLogEntry(indexConfig, sourceData) + val indexDataPred = indexLogEntry.derivedDataset + .asInstanceOf[DataSkippingIndex] + 
.translateFilterCondition( + spark, + plan.asInstanceOf[Filter].condition, + sourceData.queryExecution.optimizedPlan) + indexLogEntry.setTagValue(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE, indexDataPred) + val optimizedPlan = ApplyDataSkippingIndex.applyIndex( + plan, + Map(sourceData.queryExecution.optimizedPlan -> indexLogEntry)) + if (indexDataPred.isEmpty) { + assert(optimizedPlan === plan) + } else { + assert(optimizedPlan !== plan) + optimizedPlan match { + case Filter( + _, + LogicalRelation( + HadoopFsRelation(location: DataSkippingFileIndex, _, _, _, _, _), + _, + _, + _)) => + assert(location.indexDataPred === indexDataPred.get) + case _ => fail(s"unexpected optimizedPlan: $optimizedPlan") + } + } + val optimizedDf = logicalPlanToDataFrame(spark, optimizedPlan) + checkAnswer(optimizedDf, query) + assert(numAccessedFiles(optimizedDf) === numExpectedFiles) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterConditionFilterTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterConditionFilterTest.scala new file mode 100644 index 000000000..19c219e41 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterConditionFilterTest.scala @@ -0,0 +1,83 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.microsoft.hyperspace.index.dataskipping.rules + +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions._ + +import com.microsoft.hyperspace.index.IndexLogEntryTags +import com.microsoft.hyperspace.index.dataskipping._ +import com.microsoft.hyperspace.index.dataskipping.sketches._ +import com.microsoft.hyperspace.index.plananalysis.FilterReasons.IneligibleFilterCondition + +class FilterConditionFilterTest extends DataSkippingSuite { + test("apply returns an empty map if there are no candidate indexes.") { + val df = spark.range(10).toDF("A") + assert(FilterConditionFilter(df.queryExecution.optimizedPlan, Map.empty) === Map.empty) + } + + test("apply returns an empty map if the plan is not a filter.") { + val df = createSourceData(spark.range(10).toDF("A")) + val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A")) + val indexLogEntry = createIndexLogEntry(indexConfig, df) + val candidateIndexes = Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry)) + assert(FilterConditionFilter(df.queryExecution.optimizedPlan, candidateIndexes) === Map.empty) + } + + test("apply creates an index data predicate if the index can be applied to the plan.") { + val df = createSourceData(spark.range(10).toDF("A")) + val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A")) + val indexLogEntry = createIndexLogEntry(indexConfig, df) + val candidateIndexes = Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry)) + val plan = df.filter("A = 1").queryExecution.optimizedPlan + assert(FilterConditionFilter(plan, candidateIndexes) === candidateIndexes) + val indexDataPredOpt = + indexLogEntry.getTagValue(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE) + assert( + indexDataPredOpt === Some(Some(And( + IsNotNull(UnresolvedAttribute("MinMax_A__0")), + And( + LessThanOrEqual(UnresolvedAttribute("MinMax_A__0"), Literal(1L)), + GreaterThanOrEqual(UnresolvedAttribute("MinMax_A__1"), Literal(1L))))))) + } + + test("apply returns an empty map if the filter condition is not suitable.") { + val df = createSourceData(spark.range(10).selectExpr("id as A", "id * 2 as B")) + val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A")) + val indexLogEntry = createIndexLogEntry(indexConfig, df) + indexLogEntry.setTagValue(IndexLogEntryTags.INDEX_PLAN_ANALYSIS_ENABLED, true) + val candidateIndexes = Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry)) + val plan = df.filter("B = 1").queryExecution.optimizedPlan + assert(FilterConditionFilter(plan, candidateIndexes) === Map.empty) + val reason = indexLogEntry.getTagValue(plan, IndexLogEntryTags.FILTER_REASONS) + assert(reason === Some(List(IneligibleFilterCondition("((`B` IS NOT NULL) AND (`B` = 1L))")))) + } + + test("apply returns only the applicable indexes when there are multiple candidate indexes.") { + val df = createSourceData(spark.range(10).selectExpr("id as A", "id * 2 as B")) + val indexConfig1 = DataSkippingIndexConfig("myind", MinMaxSketch("A")) + val indexConfig2 = DataSkippingIndexConfig("myind", MinMaxSketch("B")) + val indexLogEntry1 = createIndexLogEntry(indexConfig1, df) + val indexLogEntry2 = createIndexLogEntry(indexConfig2, df) + val candidateIndexes = + Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry1, indexLogEntry2)) + val plan = df.filter("A = 1").queryExecution.optimizedPlan + assert( + FilterConditionFilter(plan, candidateIndexes) === Map( + df.queryExecution.optimizedPlan -> Seq(indexLogEntry1))) + } +} diff --git 
a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterPlanNodeFilterTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterPlanNodeFilterTest.scala new file mode 100644 index 000000000..2017f4e55 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/rules/FilterPlanNodeFilterTest.scala @@ -0,0 +1,51 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.rules + +import com.microsoft.hyperspace.index.dataskipping._ +import com.microsoft.hyperspace.index.dataskipping.sketches._ + +class FilterPlanNodeFilterTest extends DataSkippingSuite { + test("apply returns an empty map if there are no candidate indexes.") { + val df = spark.range(10).toDF("A") + assert(FilterPlanNodeFilter(df.queryExecution.optimizedPlan, Map.empty) === Map.empty) + } + + test("apply returns an empty map if the plan is not a filter.") { + val df = createSourceData(spark.range(10).toDF("A")) + val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A")) + val indexLogEntry = createIndexLogEntry(indexConfig, df) + val candidateIndexes = Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry)) + val plan = df.groupBy("A").count().queryExecution.optimizedPlan + assert(FilterPlanNodeFilter(plan, candidateIndexes) === Map.empty) + } + + test("apply returns applicable indexes only.") { + val df1 = createSourceData(spark.range(10).toDF("A"), "T1") + val df2 = createSourceData(spark.range(10).toDF("A"), "T2") + val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A")) + val indexLogEntry1 = createIndexLogEntry(indexConfig, df1) + val indexLogEntry2 = createIndexLogEntry(indexConfig, df2) + val candidateIndexes = Map( + df1.queryExecution.optimizedPlan -> Seq(indexLogEntry1), + df2.queryExecution.optimizedPlan -> Seq(indexLogEntry2)) + val plan = df1.filter("A = 1").queryExecution.optimizedPlan + assert( + FilterPlanNodeFilter(plan, candidateIndexes) === Map( + df1.queryExecution.optimizedPlan -> Seq(indexLogEntry1))) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketch/MinMaxSketchTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketch/MinMaxSketchTest.scala deleted file mode 100644 index f3db70545..000000000 --- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketch/MinMaxSketchTest.scala +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (2021) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.index.dataskipping.sketch - -import org.apache.spark.sql.{Column, QueryTest} -import org.mockito.Mockito.mock - -import com.microsoft.hyperspace.index.HyperspaceSuite - -class MinMaxSketchTest extends QueryTest with HyperspaceSuite { - import spark.implicits._ - - test("indexedColumns returns the indexed column.") { - val sketch = MinMaxSketch("A") - assert(sketch.indexedColumns === Seq("A")) - } - - test("referencedColumns returns the indexed column.") { - val sketch = MinMaxSketch("A") - assert(sketch.referencedColumns === Seq("A")) - } - - test("aggregateFunctions returns min and max aggregation functions.") { - val sketch = MinMaxSketch("A") - val aggrs = sketch.aggregateFunctions.map(new Column(_)) - val data = Seq(1, -1, 10, 2, 4).toDF("A") - checkAnswer(data.select(aggrs: _*), Seq((-1, 10)).toDF) - } - - test("toString returns a reasonable string.") { - val sketch = MinMaxSketch("A") - assert(sketch.toString === "MinMax(A)") - } - - test("Two sketches are equal if their columns are equal.") { - assert(MinMaxSketch("A") === MinMaxSketch("A")) - assert(MinMaxSketch("A") !== MinMaxSketch("a")) - assert(MinMaxSketch("b") !== MinMaxSketch("B")) - assert(MinMaxSketch("B") === MinMaxSketch("B")) - } - - test("MinMaxSketch is different from other sketches.") { - val s1 = MinMaxSketch("A") - val s2 = mock(classOf[Sketch]) - assert(s1 !== s2) - } - - test("hashCode is reasonably implemented.") { - assert(MinMaxSketch("A").hashCode === MinMaxSketch("A").hashCode) - assert(MinMaxSketch("A").hashCode !== MinMaxSketch("a").hashCode) - } -} diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketches/MinMaxSketchTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketches/MinMaxSketchTest.scala new file mode 100644 index 000000000..7962a9990 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketches/MinMaxSketchTest.scala @@ -0,0 +1,338 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package com.microsoft.hyperspace.index.dataskipping.sketches
+
+import org.apache.spark.sql.{Column, QueryTest}
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types._
+import org.mockito.Mockito.mock
+
+import com.microsoft.hyperspace.index.HyperspaceSuite
+import com.microsoft.hyperspace.index.dataskipping.expressions._
+
+class MinMaxSketchTest extends QueryTest with HyperspaceSuite {
+  import spark.implicits._
+
+  val valueExtractor = AttrValueExtractor(Map.empty)
+
+  test("indexedColumns returns the indexed column.") {
+    val sketch = MinMaxSketch("A")
+    assert(sketch.indexedColumns === Seq("A"))
+  }
+
+  test("referencedColumns returns the indexed column.") {
+    val sketch = MinMaxSketch("A")
+    assert(sketch.referencedColumns === Seq("A"))
+  }
+
+  test("aggregateFunctions returns min and max aggregation functions.") {
+    val sketch = MinMaxSketch("A")
+    val aggrs = sketch.aggregateFunctions.map(new Column(_))
+    val data = Seq(1, -1, 10, 2, 4).toDF("A")
+    checkAnswer(data.select(aggrs: _*), Seq((-1, 10)).toDF)
+  }
+
+  test("toString returns a reasonable string.") {
+    val sketch = MinMaxSketch("A")
+    assert(sketch.toString === "MinMax(A)")
+  }
+
+  test("Two sketches are equal if their columns are equal.") {
+    assert(MinMaxSketch("A") === MinMaxSketch("A"))
+    assert(MinMaxSketch("A") !== MinMaxSketch("a"))
+    assert(MinMaxSketch("b") !== MinMaxSketch("B"))
+    assert(MinMaxSketch("B") === MinMaxSketch("B"))
+  }
+
+  test("MinMaxSketch is different from other sketches.") {
+    val s1 = MinMaxSketch("A")
+    val s2 = mock(classOf[Sketch])
+    assert(s1 !== s2)
+  }
+
+  test("hashCode is reasonably implemented.") {
+    assert(MinMaxSketch("A").hashCode === MinMaxSketch("A").hashCode)
+    assert(MinMaxSketch("A").hashCode !== MinMaxSketch("a").hashCode)
+  }
+
+  test("convertPredicate converts EqualTo(<attr>, <lit>).") {
+    val sketch = MinMaxSketch("A")
+    val predicate = EqualTo(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42))
+    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
+    val nameMap = Map(ExprId(0) -> "A")
+    val result = sketch.convertPredicate(
+      predicate,
+      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
+      sketchValues,
+      nameMap,
+      valueExtractor)
+    val expected = Some(
+      And(
+        LessThanOrEqual(sketchValues(0), Literal(42)),
+        GreaterThanOrEqual(sketchValues(1), Literal(42))))
+    assert(result === expected)
+  }
+
+  test("convertPredicate converts EqualTo(<lit>, <attr>).") {
+    val sketch = MinMaxSketch("A")
+    val predicate = EqualTo(Literal(42), AttributeReference("A", IntegerType)(ExprId(0)))
+    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
+    val nameMap = Map(ExprId(0) -> "A")
+    val result = sketch.convertPredicate(
+      predicate,
+      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
+      sketchValues,
+      nameMap,
+      valueExtractor)
+    val expected = Some(
+      And(
+        LessThanOrEqual(sketchValues(0), Literal(42)),
+        GreaterThanOrEqual(sketchValues(1), Literal(42))))
+    assert(result === expected)
+  }
+
+  test("convertPredicate converts EqualTo(<struct field>, <lit>).") {
+    val sketch = MinMaxSketch("A.C")
+    val structAccess = GetStructField(
+      AttributeReference("A", StructType(Seq(StructField("C", IntegerType))))(ExprId(0)),
+      0)
+    val predicate = EqualTo(structAccess, Literal(42))
+    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
+    val nameMap = Map(ExprId(0) -> "A")
+    val result = sketch.convertPredicate(
+      predicate,
+      Seq(structAccess.transformUp {
+        case attr: AttributeReference => attr.withExprId(ExpressionUtils.nullExprId)
+      }),
+      sketchValues,
+      nameMap,
+      valueExtractor)
+    val expected = Some(
+      And(
+        LessThanOrEqual(sketchValues(0), Literal(42)),
+        GreaterThanOrEqual(sketchValues(1), Literal(42))))
+    assert(result === expected)
+  }
+
+  test("convertPredicate converts EqualTo(<nested struct field>, <lit>).") {
+    val sketch = MinMaxSketch("A.B.C")
+    val structAccess = GetStructField(
+      GetStructField(
+        AttributeReference(
+          "A",
+          StructType(Seq(StructField("B", StructType(Seq(StructField("C", IntegerType)))))))(
+          ExprId(0)),
+        0),
+      0)
+    val predicate = EqualTo(structAccess, Literal(42))
+    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
+    val nameMap = Map(ExprId(0) -> "A")
+    val result = sketch.convertPredicate(
+      predicate,
+      Seq(structAccess.transformUp {
+        case attr: AttributeReference => attr.withExprId(ExpressionUtils.nullExprId)
+      }),
+      sketchValues,
+      nameMap,
+      valueExtractor)
+    val expected = Some(
+      And(
+        LessThanOrEqual(sketchValues(0), Literal(42)),
+        GreaterThanOrEqual(sketchValues(1), Literal(42))))
+    assert(result === expected)
+  }
+
+  test("convertPredicate converts EqualTo(<attr>, <lit>) - string type.") {
+    val sketch = MinMaxSketch("A")
+    val predicate =
+      EqualTo(AttributeReference("A", StringType)(ExprId(0)), Literal.create("hello", StringType))
+    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
+    val nameMap = Map(ExprId(0) -> "A")
+    val result = sketch.convertPredicate(
+      predicate,
+      Seq(AttributeReference("A", StringType)(ExpressionUtils.nullExprId)),
+      sketchValues,
+      nameMap,
+      valueExtractor)
+    val expected = Some(
+      And(
+        LessThanOrEqual(sketchValues(0), Literal.create("hello", StringType)),
+        GreaterThanOrEqual(sketchValues(1), Literal.create("hello", StringType))))
+    assert(result === expected)
+  }
+
+  test("convertPredicate converts EqualTo(<attr>, <lit>) - double type.") {
+    val sketch = MinMaxSketch("A")
+    val predicate =
+      EqualTo(AttributeReference("A", DoubleType)(ExprId(0)), Literal(3.14, DoubleType))
+    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
+    val nameMap = Map(ExprId(0) -> "A")
+    val result = sketch.convertPredicate(
+      predicate,
+      Seq(AttributeReference("A", DoubleType)(ExpressionUtils.nullExprId)),
+      sketchValues,
+      nameMap,
+      valueExtractor)
+    val expected = Some(
+      And(
+        LessThanOrEqual(sketchValues(0), Literal(3.14, DoubleType)),
+        GreaterThanOrEqual(sketchValues(1), Literal(3.14, DoubleType))))
+    assert(result === expected)
+  }
+
+  test("convertPredicate converts LessThan.") {
+    val sketch = MinMaxSketch("A")
+    val predicate = LessThan(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42))
+    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
+    val nameMap = Map(ExprId(0) -> "A")
+    val result = sketch.convertPredicate(
+      predicate,
+      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
+      sketchValues,
+      nameMap,
+      valueExtractor)
+    val expected = Some(LessThan(sketchValues(0), Literal(42)))
+    assert(result === expected)
+  }
+
+  test("convertPredicate converts LessThan - string type.") {
+    val sketch = MinMaxSketch("A")
+    val predicate = LessThan(
+      AttributeReference("A", StringType)(ExprId(0)),
+      Literal.create("hello", StringType))
+    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
+    val nameMap = Map(ExprId(0) -> "A")
+    val result = sketch.convertPredicate(
+      predicate,
+
Seq(AttributeReference("A", StringType)(ExpressionUtils.nullExprId)), + sketchValues, + nameMap, + valueExtractor) + val expected = Some(LessThan(sketchValues(0), Literal.create("hello", StringType))) + assert(result === expected) + } + + test("convertPredicate converts LessThanOrEqual.") { + val sketch = MinMaxSketch("A") + val predicate = LessThanOrEqual(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42)) + val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max")) + val nameMap = Map(ExprId(0) -> "A") + val result = sketch.convertPredicate( + predicate, + Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)), + sketchValues, + nameMap, + valueExtractor) + val expected = Some(LessThanOrEqual(sketchValues(0), Literal(42))) + assert(result === expected) + } + + test("convertPredicate converts GreaterThan.") { + val sketch = MinMaxSketch("A") + val predicate = GreaterThan(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42)) + val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max")) + val nameMap = Map(ExprId(0) -> "A") + val result = sketch.convertPredicate( + predicate, + Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)), + sketchValues, + nameMap, + valueExtractor) + val expected = Some(GreaterThan(sketchValues(1), Literal(42))) + assert(result === expected) + } + + test("convertPredicate converts GreaterThanOrEqual.") { + val sketch = MinMaxSketch("A") + val predicate = + GreaterThanOrEqual(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42)) + val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max")) + val nameMap = Map(ExprId(0) -> "A") + val result = sketch.convertPredicate( + predicate, + Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)), + sketchValues, + nameMap, + valueExtractor) + val expected = Some(GreaterThanOrEqual(sketchValues(1), Literal(42))) + assert(result === expected) + } + + test("convertPredicate converts In.") { + val sketch = MinMaxSketch("A") + val predicate = + In(AttributeReference("A", IntegerType)(ExprId(0)), Seq(Literal(42), Literal(23))) + val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max")) + val nameMap = Map(ExprId(0) -> "A") + val result = sketch.convertPredicate( + predicate, + Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)), + sketchValues, + nameMap, + valueExtractor) + val expected = Some( + Or( + And( + LessThanOrEqual(sketchValues(0), Literal(42)), + GreaterThanOrEqual(sketchValues(1), Literal(42))), + And( + LessThanOrEqual(sketchValues(0), Literal(23)), + GreaterThanOrEqual(sketchValues(1), Literal(23))))) + assert(result === expected) + } + + test("convertPredicate converts In - string type.") { + val sketch = MinMaxSketch("A") + val predicate = + In( + AttributeReference("A", StringType)(ExprId(0)), + Seq(Literal.create("hello", StringType), Literal.create("world", StringType))) + val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max")) + val nameMap = Map(ExprId(0) -> "A") + val result = sketch.convertPredicate( + predicate, + Seq(AttributeReference("A", StringType)(ExpressionUtils.nullExprId)), + sketchValues, + nameMap, + valueExtractor) + val expected = Some( + Or( + And( + LessThanOrEqual(sketchValues(0), Literal.create("hello", StringType)), + GreaterThanOrEqual(sketchValues(1), Literal.create("hello", StringType))), + And( + LessThanOrEqual(sketchValues(0), Literal.create("world", StringType)), + 
GreaterThanOrEqual(sketchValues(1), Literal.create("world", StringType))))) + assert(result === expected) + } + + test("convertPredicate does not convert Not(EqualTo(, )).") { + val sketch = MinMaxSketch("A") + val predicate = Not(EqualTo(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42))) + val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max")) + val nameMap = Map(ExprId(0) -> "A") + val result = sketch.convertPredicate( + predicate, + Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)), + sketchValues, + nameMap, + valueExtractor) + val expected = None + assert(result === expected) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketches/PartitionSketchTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketches/PartitionSketchTest.scala new file mode 100644 index 000000000..ecc3e9b7d --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/sketches/PartitionSketchTest.scala @@ -0,0 +1,166 @@ +/* + * Copyright (2021) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.index.dataskipping.sketches + +import org.apache.spark.sql.{Column, QueryTest} +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.types._ + +import com.microsoft.hyperspace.index.HyperspaceSuite +import com.microsoft.hyperspace.index.dataskipping.expressions._ + +class PartitionSketchTest extends QueryTest with HyperspaceSuite { + import spark.implicits._ + + test("indexedColumns returns the indexed column.") { + val sketch = PartitionSketch(Seq(("A", None))) + assert(sketch.indexedColumns === Seq("A")) + } + + test("referencedColumns returns the indexed column.") { + val sketch = PartitionSketch(Seq(("a", None), ("b", None))) + assert(sketch.referencedColumns === Seq("a", "b")) + } + + test("aggregateFunctions returns first aggregation function.") { + val sketch = PartitionSketch(Seq(("A", None))) + val aggrs = sketch.aggregateFunctions.map(new Column(_)) + val data = Seq(1, 1, 1, 1, 1).toDF("A") + checkAnswer(data.select(aggrs: _*), Seq(1).toDF) + } + + test("toString returns a human-readable string representation.") { + val sketch = PartitionSketch(Seq(("A", None), ("B", None))) + assert(sketch.toString === "Partition(A, B)") + } + + test("Two sketches are equal if their columns are equal.") { + assert(PartitionSketch(Seq(("A", None))) === PartitionSketch(Seq(("A", None)))) + assert( + PartitionSketch(Seq(("A", Some(LongType)))) !== + PartitionSketch(Seq(("A", Some(IntegerType))))) + } + + test("hashCode is reasonably implemented.") { + assert( + PartitionSketch(Seq(("A", None))).hashCode === PartitionSketch(Seq(("A", None))).hashCode) + assert( + PartitionSketch(Seq(("A", Some(LongType)))).hashCode !== + PartitionSketch(Seq(("A", Some(IntegerType)))).hashCode) 
+  }
+
+  val a = AttributeReference("A", IntegerType)(ExprId(0))
+  val b = AttributeReference("B", IntegerType)(ExprId(1))
+  val sketchA = UnresolvedAttribute("Partition_A")
+  val sketchB = UnresolvedAttribute("Partition_B")
+
+  {
+    val sketch = PartitionSketch(Seq(("A", Some(IntegerType))))
+    val resolvedExprs = Seq(ExpressionUtils.normalize(a))
+    val sketchValues = Seq(sketchA)
+    val valueExtractor = AttrValueExtractor(Map(a -> sketchA))
+
+    test("convertPredicate converts EqualTo(<partitionCol>, <lit>).") {
+      assert(
+        sketch.convertPredicate(
+          EqualTo(a, Literal(42)),
+          resolvedExprs,
+          sketchValues,
+          Map.empty,
+          valueExtractor) === Some(EqualTo(sketchA, Literal(42))))
+    }
+
+    test("convertPredicate does not convert non-deterministic expression.") {
+      assert(
+        sketch.convertPredicate(
+          EqualTo(a, Rand(42)),
+          resolvedExprs,
+          sketchValues,
+          Map.empty,
+          valueExtractor) === None)
+    }
+
+    test("convertPredicate does not convert literal.") {
+      assert(
+        sketch.convertPredicate(
+          Literal(42),
+          resolvedExprs,
+          sketchValues,
+          Map.empty,
+          valueExtractor) === None)
+    }
+
+    test("convertPredicate does not convert conjunction.") {
+      assert(
+        sketch.convertPredicate(
+          And(LessThan(a, Literal(42)), GreaterThan(a, Literal(23))),
+          resolvedExprs,
+          sketchValues,
+          Map.empty,
+          valueExtractor) === None)
+    }
+
+    test("convertPredicate does not convert disjunction.") {
+      assert(
+        sketch.convertPredicate(
+          Or(LessThan(a, Literal(42)), GreaterThan(a, Literal(23))),
+          resolvedExprs,
+          sketchValues,
+          Map.empty,
+          valueExtractor) === None)
+    }
+
+    test("convertPredicate does not convert predicate having subquery.") {
+      assert(
+        sketch.convertPredicate(
+          InSubquery(Seq(a), ListQuery(LocalRelation(a))),
+          resolvedExprs,
+          sketchValues,
+          Map.empty,
+          valueExtractor) === None)
+    }
+
+    test("convertPredicate does not convert predicate having unknown attr.") {
+      assert(
+        sketch.convertPredicate(
+          EqualTo(a, b),
+          resolvedExprs,
+          sketchValues,
+          Map.empty,
+          valueExtractor) === None)
+    }
+  }
+
+  {
+    val sketch = PartitionSketch(Seq(("A", Some(IntegerType)), ("B", Some(IntegerType))))
+    val resolvedExprs = Seq(a, b).map(ExpressionUtils.normalize)
+    val sketchValues = Seq(sketchA, sketchB)
+    val valueExtractor = AttrValueExtractor(Map(a -> sketchA, b -> sketchB))
+
+    test("convertPredicate converts EqualTo(<partitionCol>, <partitionCol>).") {
+      assert(
+        sketch.convertPredicate(
+          EqualTo(a, b),
+          resolvedExprs,
+          sketchValues,
+          Map.empty,
+          valueExtractor) === Some(EqualTo(sketchA, sketchB)))
+    }
+  }
+}
diff --git a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/ExtractorsTest.scala b/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/ExtractorsTest.scala
deleted file mode 100644
index 81eea8338..000000000
--- a/src/test/scala/com/microsoft/hyperspace/index/dataskipping/util/ExtractorsTest.scala
+++ /dev/null
@@ -1,946 +0,0 @@
-/*
- * Copyright (2021) The Hyperspace Project Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */ - -package com.microsoft.hyperspace.index.dataskipping.util - -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.{BooleanType, IntegerType} -import org.mockito.Mockito.{mock, when} - -import com.microsoft.hyperspace.index.HyperspaceSuite - -class ExtractorsTest extends HyperspaceSuite { - val a = AttributeReference("A", IntegerType)() - val b = AttributeReference("B", BooleanType)() - val zero = Literal(0, IntegerType) - val one = Literal(1, IntegerType) - val two = Literal(2, IntegerType) - val trueLit = Literal(true, BooleanType) - val falseLit = Literal(false, BooleanType) - val nullInt = Literal(null, IntegerType) - val nullBool = Literal(null, BooleanType) - - val aMatcher = { - val matcher = mock(classOf[ExprMatcher]) - when(matcher.apply(a)).thenReturn(true) - matcher - } - val bMatcher = { - val matcher = mock(classOf[ExprMatcher]) - when(matcher.apply(b)).thenReturn(true) - matcher - } - val nonMatcher = mock(classOf[ExprMatcher]) - - // EqualToExtractor - { - val AEqualTo = EqualToExtractor(aMatcher) - val NoneEqualTo = EqualToExtractor(nonMatcher) - - test("EqualToExtractor matches EqualTo(, ) if matcher().") { - val value = EqualTo(a, zero) match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("EqualToExtractor does not match EqualTo(, null).") { - val value = EqualTo(a, nullInt) match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("EqualToExtractor does not match EqualTo(, ).") { - val value = EqualTo(a, a) match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("EqualToExtractor does not match EqualTo(, ) if !matcher().") { - val value = EqualTo(a, zero) match { - case NoneEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("EqualToExtractor matches EqualTo(, ) if matcher().") { - val value = EqualTo(zero, a) match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("EqualToExtractor does not match EqualTo(null, ).") { - val value = EqualTo(nullInt, a) match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("EqualToExtractor does not match EqualTo(, ) if !matcher().") { - val value = EqualTo(zero, a) match { - case NoneEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("EqualToExtractor matches EqualNullSafe(, ) if matcher().") { - val value = EqualNullSafe(a, zero) match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("EqualToExtractor does not match EqualNullSafe(, null).") { - val value = EqualNullSafe(a, nullInt) match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("EqualToExtractor does not match EqualNullSafe(, ) if !matcher().") { - val value = EqualNullSafe(a, zero) match { - case NoneEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("EqualToExtractor matches EqualNullSafe(, ) if matcher().") { - val value = EqualNullSafe(zero, a) match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("EqualToExtractor does not match EqualNullSafe(null, ).") { - val value = EqualNullSafe(nullInt, a) match { - case AEqualTo(value) => Some(value) - 
case _ => None - } - assert(value === None) - } - - test("EqualToExtractor does not match EqualNullSafe(, ) if !matcher().") { - val value = EqualNullSafe(zero, a) match { - case NoneEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("EqualToExtractor does not match expressions other than EqualTo/EqualNullSafe.") { - val value = zero match { - case AEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - } - - // LessThanExtractor - { - val ALessThan = LessThanExtractor(aMatcher) - val NoneLessThan = LessThanExtractor(nonMatcher) - - test("LessThanExtractor matches LessThan(, ) if matcher().") { - val value = LessThan(a, zero) match { - case ALessThan(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("LessThanExtractor does not match LessThan(, null).") { - val value = LessThan(a, nullInt) match { - case ALessThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanExtractor does not match LessThan(, ).") { - val value = LessThan(a, a) match { - case ALessThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanExtractor does not match LessThan(, ) if !matcher().") { - val value = LessThan(a, zero) match { - case NoneLessThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanExtractor does not match LessThan(, ).") { - val value = LessThan(zero, a) match { - case ALessThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanExtractor matches GreaterThan(, ) if matcher().") { - val value = GreaterThan(zero, a) match { - case ALessThan(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("LessThanExtractor does not match GreaterThan(null, ).") { - val value = GreaterThan(nullInt, a) match { - case ALessThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanExtractor does not match GreaterThan(, ) if !matcher().") { - val value = GreaterThan(zero, a) match { - case NoneLessThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanExtractor does not match GreaterThan(, ).") { - val value = GreaterThan(a, zero) match { - case ALessThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanExtractor does not match expressions other than LessThan.") { - val value = zero match { - case ALessThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - } - - // LessThanOrEqualToExtractor - { - val ALessThanOrEqualTo = LessThanOrEqualToExtractor(aMatcher) - val NoneLessThanOrEqualTo = LessThanOrEqualToExtractor(nonMatcher) - - test( - "LessThanOrEqualToExtractor matches LessThanOrEqual(, ) if matcher().") { - val value = LessThanOrEqual(a, zero) match { - case ALessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("LessThanOrEqualToExtractor does not match LessThanOrEqual(, null).") { - val value = LessThanOrEqual(a, nullInt) match { - case ALessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanOrEqualToExtractor does not match LessThanOrEqual(, ).") { - val value = LessThanOrEqual(a, a) match { - case ALessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test( - "LessThanOrEqualToExtractor does not match LessThanOrEqual(, ) " + - "if 
!matcher().") { - val value = LessThanOrEqual(a, zero) match { - case NoneLessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanOrEqualToExtractor does not match LessThanOrEqual(, ).") { - val value = LessThanOrEqual(zero, a) match { - case ALessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test( - "LessThanOrEqualToExtractor matches GreaterThanOrEqual(, ) if matcher().") { - val value = GreaterThanOrEqual(zero, a) match { - case ALessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("LessThanOrEqualToExtractor does not match GreaterThanOrEqual(null, ).") { - val value = GreaterThanOrEqual(nullInt, a) match { - case ALessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test( - "LessThanOrEqualToExtractor does not match GreaterThanOrEqual(, ) " + - "if !matcher().") { - val value = GreaterThanOrEqual(zero, a) match { - case NoneLessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanOrEqualToExtractor does not match GreaterThanOrEqual(, ).") { - val value = GreaterThanOrEqual(a, zero) match { - case ALessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("LessThanOrEqualToExtractor does not match expressions other than LessThanOrEqual.") { - val value = zero match { - case ALessThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - } - - // GreaterThanExtractor - { - val AGreaterThan = GreaterThanExtractor(aMatcher) - val NoneGreaterThan = GreaterThanExtractor(nonMatcher) - - test("GreaterThanExtractor matches GreaterThan(, ) if matcher().") { - val value = GreaterThan(a, zero) match { - case AGreaterThan(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("GreaterThanExtractor does not match GreaterThan(, null).") { - val value = GreaterThan(a, nullInt) match { - case AGreaterThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanExtractor does not match GreaterThan(, ).") { - val value = GreaterThan(a, a) match { - case AGreaterThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanExtractor does not match GreaterThan(, ) if !matcher().") { - val value = GreaterThan(a, zero) match { - case NoneGreaterThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanExtractor does not match GreaterThan(, ).") { - val value = GreaterThan(zero, a) match { - case AGreaterThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanExtractor matches LessThan(, ) if matcher().") { - val value = LessThan(zero, a) match { - case AGreaterThan(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("GreaterThanExtractor does not match LessThan(null, ).") { - val value = LessThan(nullInt, a) match { - case AGreaterThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanExtractor does not match LessThan(, ) if !matcher().") { - val value = LessThan(zero, a) match { - case NoneGreaterThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanExtractor does not match LessThan(, ).") { - val value = LessThan(a, zero) match { - case AGreaterThan(value) => Some(value) - case _ => None - } - 
assert(value === None) - } - - test("GreaterThanExtractor does not match expressions other than GreaterThan.") { - val value = zero match { - case AGreaterThan(value) => Some(value) - case _ => None - } - assert(value === None) - } - } - - // GreaterThanOrEqualToExtractor - { - val AGreaterThanOrEqualTo = GreaterThanOrEqualToExtractor(aMatcher) - val NoneGreaterThanOrEqualTo = GreaterThanOrEqualToExtractor(nonMatcher) - - test( - "GreaterThanOrEqualToExtractor matches GreaterThanOrEqual(, ) " + - "if matcher().") { - val value = GreaterThanOrEqual(a, zero) match { - case AGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("GreaterThanOrEqualToExtractor does not match GreaterThanOrEqual(, null).") { - val value = GreaterThanOrEqual(a, nullInt) match { - case AGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanOrEqualToExtractor does not match GreaterThanOrEqual(, ).") { - val value = GreaterThanOrEqual(a, a) match { - case AGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test( - "GreaterThanOrEqualToExtractor does not match GreaterThanOrEqual(, ) " + - "if !matcher().") { - val value = GreaterThanOrEqual(a, zero) match { - case NoneGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanOrEqualToExtractor does not match GreaterThanOrEqual(, ).") { - val value = GreaterThanOrEqual(zero, a) match { - case AGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test( - "GreaterThanOrEqualToExtractor matches LessThanOrEqual(, ) if matcher().") { - val value = LessThanOrEqual(zero, a) match { - case AGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === Some(zero)) - } - - test("GreaterThanOrEqualToExtractor does not match LessThanOrEqual(null, ).") { - val value = LessThanOrEqual(nullInt, a) match { - case AGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test( - "GreaterThanOrEqualToExtractor does not match LessThanOrEqual(, ) " + - "if !matcher().") { - val value = LessThanOrEqual(zero, a) match { - case NoneGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test("GreaterThanOrEqualToExtractor does not match LessThanOrEqual(, ).") { - val value = LessThanOrEqual(a, zero) match { - case AGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - - test( - "GreaterThanOrEqualToExtractor does not match expressions other than GreaterThanOrEqual.") { - val value = zero match { - case AGreaterThanOrEqualTo(value) => Some(value) - case _ => None - } - assert(value === None) - } - } - - // IsNullExtractor - { - val AIsNull = IsNullExtractor(aMatcher) - val NoneIsNull = IsNullExtractor(nonMatcher) - - test("IsNullExtractor matches IsNull() if matcher().") { - val value = IsNull(a) match { - case AIsNull() => true - case _ => false - } - assert(value) - } - - test("IsNullExtractor does not match IsNull() if !matcher().") { - val value = IsNull(a) match { - case NoneIsNull() => true - case _ => false - } - assert(!value) - } - - test("IsNullExtractor matches EqualNullSafe(, null).") { - val value = EqualNullSafe(a, nullInt) match { - case AIsNull() => true - case _ => false - } - assert(value) - } - - test("IsNullExtractor does not match EqualNullSafe(, ).") { - val value = 
EqualNullSafe(a, zero) match { - case AIsNull() => true - case _ => false - } - assert(!value) - } - - test("IsNullExtractor matches EqualNullSafe(null, ).") { - val value = EqualNullSafe(nullInt, a) match { - case AIsNull() => true - case _ => false - } - assert(value) - } - - test("IsNullExtractor does not match EqualNullSafe(, ).") { - val value = EqualNullSafe(zero, a) match { - case AIsNull() => true - case _ => false - } - assert(!value) - } - - test("IsNullExtractor does not match expressions other than IsNull/EqualNullSafe.") { - val value = zero match { - case AIsNull() => true - case _ => false - } - assert(!value) - } - } - - // IsNotNullExtractor - { - val AIsNotNull = IsNotNullExtractor(aMatcher) - val NoneIsNotNull = IsNotNullExtractor(nonMatcher) - - test("IsNotNullExtractor matches IsNotNull() if matcher().") { - val value = IsNotNull(a) match { - case AIsNotNull() => true - case _ => false - } - assert(value) - } - - test("IsNotNullExtractor does not match IsNotNull() if !matcher().") { - val value = IsNotNull(a) match { - case NoneIsNotNull() => true - case _ => false - } - assert(!value) - } - - test("IsNotNullExtractor matches Not(IsNull()) if matcher().") { - val value = Not(IsNull(a)) match { - case AIsNotNull() => true - case _ => false - } - assert(value) - } - - test("IsNotNullExtractor matches Not(IsNull()) if !matcher().") { - val value = Not(IsNull(a)) match { - case NoneIsNotNull() => true - case _ => false - } - assert(!value) - } - - test("IsNotNullExtractor matches Not(EqualNullSafe(, null)).") { - val value = Not(EqualNullSafe(a, nullInt)) match { - case AIsNotNull() => true - case _ => false - } - assert(value) - } - - test("IsNotNullExtractor does not match Not(EqualNullSafe(, )).") { - val value = Not(EqualNullSafe(a, zero)) match { - case AIsNotNull() => true - case _ => false - } - assert(!value) - } - - test("IsNotNullExtractor matches Not(EqualNullSafe(null, )).") { - val value = Not(EqualNullSafe(nullInt, a)) match { - case AIsNotNull() => true - case _ => false - } - assert(value) - } - - test("IsNotNullExtractor does not match Not(EqualNullSafe(, )).") { - val value = Not(EqualNullSafe(zero, a)) match { - case AIsNotNull() => true - case _ => false - } - assert(!value) - } - - test( - "IsNotNullExtractor does not match expressions other than IsNotNull/Not(EqualNullSafe).") { - val value = zero match { - case AIsNotNull() => true - case _ => false - } - assert(!value) - } - } - - // IsTrueExtractor - { - val AIsTrue = IsTrueExtractor(aMatcher) - val BIsTrue = IsTrueExtractor(bMatcher) - val NoneIsTrue = IsTrueExtractor(nonMatcher) - - test("IsTrueExtractor matches if matcher().") { - val value = b match { - case BIsTrue() => true - case _ => false - } - assert(value) - } - - test("IsTrueExtractor does not match if !matcher().") { - val value = b match { - case NoneIsTrue() => true - case _ => false - } - assert(!value) - } - - test("IsTrueExtractor does not match if type is not boolean.") { - val value = a match { - case AIsTrue() => true - case _ => false - } - assert(!value) - } - - test("IsTrueExtractor matches EqualTo(, true) if matcher().") { - val value = EqualTo(b, trueLit) match { - case BIsTrue() => true - case _ => false - } - assert(value) - } - - test("IsTrueExtractor does not match EqualTo(, false).") { - val value = EqualTo(b, falseLit) match { - case BIsTrue() => true - case _ => false - } - assert(!value) - } - - test("IsTrueExtractor matches EqualTo(true, ) if matcher().") { - val value = EqualTo(trueLit, b) match { - case 
BIsTrue() => true - case _ => false - } - assert(value) - } - - test("IsTrueExtractor does not match EqualTo(false, ).") { - val value = EqualTo(falseLit, b) match { - case BIsTrue() => true - case _ => false - } - assert(!value) - } - - test("IsTrueExtractor matches EqualNullSafe(, true) if matcher().") { - val value = EqualNullSafe(b, trueLit) match { - case BIsTrue() => true - case _ => false - } - assert(value) - } - - test("IsTrueExtractor does not match EqualNullSafe(, false).") { - val value = EqualNullSafe(b, falseLit) match { - case BIsTrue() => true - case _ => false - } - assert(!value) - } - - test("IsTrueExtractor matches EqualNullSafe(true, ) if matcher().") { - val value = EqualNullSafe(trueLit, b) match { - case BIsTrue() => true - case _ => false - } - assert(value) - } - - test("IsTrueExtractor does not match EqualNullSafe(false, ).") { - val value = EqualNullSafe(falseLit, b) match { - case BIsTrue() => true - case _ => false - } - assert(!value) - } - - test("IsTrueExtractor does not match .") { - val value = zero match { - case BIsTrue() => true - case _ => false - } - assert(!value) - } - } - - // IsFalseExtractor - { - val BIsFalse = IsFalseExtractor(bMatcher) - val NoneIsFalse = IsFalseExtractor(nonMatcher) - - test("IsFalseExtractor matches Not() if matcher().") { - val value = Not(b) match { - case BIsFalse() => true - case _ => false - } - assert(value) - } - - test("IsFalseExtractor does not match Not() if !matcher().") { - val value = Not(b) match { - case NoneIsFalse() => true - case _ => false - } - assert(!value) - } - - test("IsFalseExtractor matches EqualTo(, false) if matcher().") { - val value = EqualTo(b, falseLit) match { - case BIsFalse() => true - case _ => false - } - assert(value) - } - - test("IsFalseExtractor does not match EqualTo(, true).") { - val value = EqualTo(b, trueLit) match { - case BIsFalse() => true - case _ => false - } - assert(!value) - } - - test("IsFalseExtractor matches EqualTo(false, ) if matcher().") { - val value = EqualTo(falseLit, b) match { - case BIsFalse() => true - case _ => false - } - assert(value) - } - - test("IsFalseExtractor does not match EqualTo(true, ).") { - val value = EqualTo(trueLit, b) match { - case BIsFalse() => true - case _ => false - } - assert(!value) - } - - test("IsFalseExtractor matches EqualNullSafe(, false) if matcher().") { - val value = EqualNullSafe(b, falseLit) match { - case BIsFalse() => true - case _ => false - } - assert(value) - } - - test("IsFalseExtractor does not match EqualNullSafe(, true).") { - val value = EqualNullSafe(b, trueLit) match { - case BIsFalse() => true - case _ => false - } - assert(!value) - } - - test("IsFalseExtractor matches EqualNullSafe(false, ) if matcher().") { - val value = EqualNullSafe(falseLit, b) match { - case BIsFalse() => true - case _ => false - } - assert(value) - } - - test("IsFalseExtractor does not match EqualNullSafe(true, ).") { - val value = EqualNullSafe(trueLit, b) match { - case BIsFalse() => true - case _ => false - } - assert(!value) - } - - test("IsFalseExtractor does not match .") { - val value = zero match { - case BIsFalse() => true - case _ => false - } - assert(!value) - } - } - - // InExtractor - { - val AIn = InExtractor(aMatcher) - val NoneIn = InExtractor(nonMatcher) - - test("InExtractor matches In(, *) if matcher().") { - val value = In(a, Seq(zero, one, two)) match { - case AIn(values) => Some(values) - case _ => None - } - assert(value === Some(Seq(zero, one, two))) - } - - test("InExtractor matches In(, *) if !matcher().") { 
- val value = In(a, Seq(zero, one, two)) match { - case NoneIn(values) => Some(values) - case _ => None - } - assert(value === None) - } - - test("InExtractor does not match In(, *) if * is empty.") { - val value = In(a, Nil) match { - case AIn(values) => Some(values) - case _ => None - } - assert(value === None) - } - - test("InExtractor matches In(, *) if some s are null.") { - val value = In(a, Seq(zero, nullInt, nullInt, two)) match { - case AIn(values) => Some(values) - case _ => None - } - assert(value === Some(Seq(zero, two))) - } - - test("InExtractor does not match In(, *).") { - val value = In(a, Seq(zero, two, a)) match { - case AIn(values) => Some(values) - case _ => None - } - assert(value === None) - } - - test("InExtractor does not match other than In.") { - val value = a match { - case AIn(values) => Some(values) - case _ => None - } - assert(value === None) - } - } - - // InSetExtractor - { - val AInSet = InSetExtractor(aMatcher) - val NoneInSet = InSetExtractor(nonMatcher) - - test("InSetExtractor matches InSet(, ) if matcher().") { - val value = InSet(a, Set(0, 1, 2)) match { - case AInSet(values) => Some(values) - case _ => None - } - assert(value === Some(Set(0, 1, 2))) - } - - test("InSetExtractor matches InSet(, *) if !matcher().") { - val value = InSet(a, Set(0, 1, 2)) match { - case NoneInSet(values) => Some(values) - case _ => None - } - assert(value === None) - } - - test("InSetExtractor does not match other than InSet.") { - val value = a match { - case AInSet(values) => Some(values) - case _ => None - } - assert(value === None) - } - } -}
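For context, the convertPredicate tests above all assert the same basic rewrite: a source-side filter such as A = v is turned into a predicate over the sketch's min and max columns. The following self-contained Scala sketch reproduces that predicate shape for an equality filter. It is illustrative only: the object name MinMaxPredicateExample, the helper minMaxSkippingPredicate, and the hard-coded MinMax_A__0/MinMax_A__1 column names are assumptions that merely mirror the expectations asserted in FilterConditionFilterTest, not an API introduced by this change.

import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions._

object MinMaxPredicateExample {
  // Builds the index-data predicate shape asserted in the tests above for a filter
  // "col = value": a data file can contain a matching row only if min <= value <= max.
  // Column naming (min = <sketch>__0, max = <sketch>__1) follows the test expectations above.
  def minMaxSkippingPredicate(column: String, value: Literal): Expression = {
    val min = UnresolvedAttribute(s"MinMax_${column}__0")
    val max = UnresolvedAttribute(s"MinMax_${column}__1")
    And(IsNotNull(min), And(LessThanOrEqual(min, value), GreaterThanOrEqual(max, value)))
  }

  def main(args: Array[String]): Unit = {
    // Prints the expression tree that FilterConditionFilterTest expects for df.filter("A = 1").
    println(minMaxSkippingPredicate("A", Literal(1L)))
  }
}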