support col match and change to DatasetMatch (#529)

* update col match and other improvements * review comments * review comments
awslabs · Feb 21, 2024 · 63e80dc · 63e80dc
1 parent 4cf8ca6
commit 63e80dc
Show file tree

Hide file tree

Showing 10 changed files with 253 additions and 185 deletions.
diff --git a/src/main/scala/com/amazon/deequ/analyzers/DataSynchronizationState.scala b/src/main/scala/com/amazon/deequ/analyzers/DataSynchronizationState.scala
diff --git a/...alyzers/DataSynchronizationAnalyzer.scala → ...eequ/analyzers/DatasetMatchAnalyzer.scala b/...alyzers/DataSynchronizationAnalyzer.scala → ...eequ/analyzers/DatasetMatchAnalyzer.scala
@@ -18,8 +18,8 @@ package com.amazon.deequ.analyzers
 
 import com.amazon.deequ.analyzers.Analyzers.metricFromFailure
 import com.amazon.deequ.comparison.DataSynchronization
-import com.amazon.deequ.comparison.DataSynchronizationFailed
-import com.amazon.deequ.comparison.DataSynchronizationSucceeded
+import com.amazon.deequ.comparison.DatasetMatchFailed
+import com.amazon.deequ.comparison.DatasetMatchSucceeded
 import com.amazon.deequ.metrics.DoubleMetric
 import com.amazon.deequ.metrics.Entity
 import org.apache.spark.sql.DataFrame
@@ -29,59 +29,67 @@ import scala.util.Try
 
 
 /**
- * An Analyzer for Deequ that performs a data synchronization check between two DataFrames.
- * It evaluates the degree of synchronization based on specified column mappings and an assertion function.
+ * An Analyzer for Deequ that performs a dataset match check between two DataFrames.
+ * It evaluates the degree of match based on specified column mappings and an assertion function.
  *
- * The analyzer computes a ratio of synchronized data points to the total data points, represented as a DoubleMetric.
- * Refer to [[com.amazon.deequ.comparison.DataSynchronization.columnMatch]] for DataSynchronization implementation
+ * The analyzer computes a ratio of matched data points to the total data points, represented as a DoubleMetric.
+ * Refer to [[com.amazon.deequ.comparison.DataSynchronization.columnMatch]] for dataset match implementation
  *
  * @param dfToCompare The DataFrame to compare with the primary DataFrame that is setup
  *                    during [[com.amazon.deequ.VerificationSuite.onData]] setup.
  * @param columnMappings A map where each key-value pair represents a column in the primary DataFrame
  *                       and its corresponding column in dfToCompare.
+ * @param matchColumnMappings A map defining the column correlations between the primary DataFrame and dfToCompare.
+ *                            These are the columns which are checked for equality after joining.
+ *                            It is optional and defaults to None.
  * @param assertion A function that takes a Double (the match ratio) and returns a Boolean.
  *                  It defines the condition for successful synchronization.
  *
  * Usage:
- * This analyzer is used in Deequ's VerificationSuite based if `isDataSynchronized` check is defined or could be used
+ * This analyzer is used by Deequ's VerificationSuite when a `doesDatasetMatch` check is defined, or it can be used
  * manually as well.
  *
  * Example:
- * val analyzer = DataSynchronizationAnalyzer(dfToCompare, Map("col1" -> "col2"), _ > 0.8)
+ * val analyzer = DatasetMatchAnalyzer(dfToCompare, Map("col1" -> "col2"), _ > 0.8)
  * val verificationResult = VerificationSuite().onData(df).addAnalyzer(analyzer).run()
  *
  * // or could do something like below
- * val verificationResult = VerificationSuite().onData(df).isDataSynchronized(dfToCompare, Map("col1" -> "col2"),
+ * val verificationResult = VerificationSuite().onData(df).doesDatasetMatch(dfToCompare, Map("col1" -> "col2"),
  *                                                                              _ > 0.8).run()
  *
  *
- * The computeStateFrom method calculates the synchronization state by comparing the specified columns of the two
+ * The computeStateFrom method calculates the dataset match state by comparing the specified columns of the two
  * DataFrames.
- * The computeMetricFrom method then converts this state into a DoubleMetric representing the synchronization ratio.
+ * The computeMetricFrom method then converts this state into a DoubleMetric representing the match ratio.
  *
  */
-case class DataSynchronizationAnalyzer(dfToCompare: DataFrame,
-                                       columnMappings: Map[String, String],
-                                       assertion: Double => Boolean)
-  extends Analyzer[DataSynchronizationState, DoubleMetric] {
+case class DatasetMatchAnalyzer(dfToCompare: DataFrame,
+                                columnMappings: Map[String, String],
+                                assertion: Double => Boolean,
+                                matchColumnMappings: Option[Map[String, String]] = None)
+  extends Analyzer[DatasetMatchState, DoubleMetric] {
 
-  override def computeStateFrom(data: DataFrame): Option[DataSynchronizationState] = {
+  override def computeStateFrom(data: DataFrame): Option[DatasetMatchState] = {
 
-    val result = DataSynchronization.columnMatch(data, dfToCompare, columnMappings, assertion)
+    val result = if (matchColumnMappings.isDefined) {
+      DataSynchronization.columnMatch(data, dfToCompare, columnMappings, matchColumnMappings.get, assertion)
+    } else {
+      DataSynchronization.columnMatch(data, dfToCompare, columnMappings, assertion)
+    }
 
     result match {
-      case succeeded: DataSynchronizationSucceeded =>
-        Some(DataSynchronizationState(succeeded.passedCount, succeeded.totalCount))
-      case failed: DataSynchronizationFailed =>
-        Some(DataSynchronizationState(failed.passedCount.getOrElse(0), failed.totalCount.getOrElse(0)))
+      case succeeded: DatasetMatchSucceeded =>
+        Some(DatasetMatchState(succeeded.passedCount, succeeded.totalCount))
+      case failed: DatasetMatchFailed =>
+        Some(DatasetMatchState(failed.passedCount.getOrElse(0), failed.totalCount.getOrElse(0)))
       case _ => None
     }
   }
 
-  override def computeMetricFrom(state: Option[DataSynchronizationState]): DoubleMetric = {
+  override def computeMetricFrom(state: Option[DatasetMatchState]): DoubleMetric = {
 
     val metric = state match {
-      case Some(s) => Try(s.synchronizedDataCount.toDouble / s.totalDataCount.toDouble)
+      case Some(s) => Try(s.matchedDataCount.toDouble / s.totalDataCount.toDouble)
       case _ => Failure(new IllegalStateException("No state available for DataSynchronizationAnalyzer"))
     }
 

diff --git a/src/main/scala/com/amazon/deequ/analyzers/DatasetMatchState.scala b/src/main/scala/com/amazon/deequ/analyzers/DatasetMatchState.scala
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.analyzers
+
+/**
+ * Represents the state of a dataset match between two DataFrames in Deequ.
+ * This state keeps track of the matched record count and the total record count.
+ * It measures how well the data in the two DataFrames matches.
+ *
+ * @param matchedDataCount The count of records that are considered a match between the two DataFrames.
+ * @param totalDataCount The total count of records checked.
+ *
+ * The `sum` method aggregates this state with another by adding the corresponding counts of both states.
+ * This is useful in distributed computations where states from different partitions need to be aggregated.
+ *
+ * The `metricValue` method computes the match ratio: the ratio of `matchedDataCount` to `totalDataCount`.
+ * If `totalDataCount` is zero, which means no data points were examined, the method returns `Double.NaN` to
+ * indicate the undefined state.
+ *
+ */
+case class DatasetMatchState(matchedDataCount: Long, totalDataCount: Long)
+  extends DoubleValuedState[DatasetMatchState] {
+  override def sum(other: DatasetMatchState): DatasetMatchState = {
+    DatasetMatchState(matchedDataCount + other.matchedDataCount, totalDataCount + other.totalDataCount)
+  }
+
+  override def metricValue(): Double = {
+    if (totalDataCount == 0L) Double.NaN else matchedDataCount.toDouble / totalDataCount.toDouble
+  }
+}
+
+object DatasetMatchState
diff --git a/src/main/scala/com/amazon/deequ/checks/Check.scala b/src/main/scala/com/amazon/deequ/checks/Check.scala
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License"). You may not
  * use this file except in compliance with the License. A copy of the License
@@ -19,8 +19,8 @@ package com.amazon.deequ.checks
 import com.amazon.deequ.analyzers.runners.AnalyzerContext
 import com.amazon.deequ.analyzers.Analyzer
 import com.amazon.deequ.analyzers.AnalyzerOptions
-import com.amazon.deequ.analyzers.DataSynchronizationAnalyzer
-import com.amazon.deequ.analyzers.DataSynchronizationState
+import com.amazon.deequ.analyzers.DatasetMatchAnalyzer
+import com.amazon.deequ.analyzers.DatasetMatchState
 import com.amazon.deequ.analyzers.Histogram
 import com.amazon.deequ.analyzers.KLLParameters
 import com.amazon.deequ.analyzers.Patterns
@@ -351,13 +351,13 @@ case class Check(
   }
 
   /**
-   * Performs a data synchronization check between the base DataFrame supplied to
+   * Performs a dataset match check between the base DataFrame supplied to
    * [[com.amazon.deequ.VerificationSuite.onData]] and other DataFrame supplied to this check using Deequ's
    * [[com.amazon.deequ.comparison.DataSynchronization.columnMatch]] framework.
-   * This method compares specified columns of both DataFrames and assesses synchronization based on a custom assertion.
+   * This method compares specified columns of both DataFrames and assesses match based on a custom assertion.
    *
-   * Utilizes [[com.amazon.deequ.analyzers.DataSynchronizationAnalyzer]] for comparing the data
-   * and Constraint [[com.amazon.deequ.constraints.DataSynchronizationConstraint]].
+   * Utilizes [[com.amazon.deequ.analyzers.DatasetMatchAnalyzer]] for comparing the data
+   * and Constraint [[com.amazon.deequ.constraints.DatasetMatchConstraint]].
    *
    * Usage:
    * To use this method, create a VerificationSuite and invoke this method as part of adding checks:
@@ -368,37 +368,41 @@ case class Check(
    *   val assertionFunction: Double => Boolean = _ > 0.7
    *
    *   val check = new Check(CheckLevel.Error, "Data Synchronization Check")
-   *     .isDataSynchronized(otherDataFrame, columnMappings, assertionFunction)
+   *     .doesDatasetMatch(otherDataFrame, columnMappings, assertionFunction)
    *
    *   val verificationResult = VerificationSuite()
    *     .onData(baseDataFrame)
    *     .addCheck(check)
    *     .run()
    * }}}
    *
-   * This will add a data synchronization check to the VerificationSuite, comparing the specified columns of
+   * This will add a dataset match check to the VerificationSuite, comparing the specified columns of
    * baseDataFrame and otherDataFrame based on the provided assertion function.
    *
-   *
-   * @param otherDf         The DataFrame to be compared with the current one. Analyzed in conjunction with the
-   *                        current DataFrame to assess data synchronization.
-   * @param columnMappings  A map defining the column correlations between the current DataFrame and otherDf.
-   *                        Keys represent column names in the current DataFrame,
-   *                        and values are corresponding column names in otherDf.
-   * @param assertion       A function that takes a Double (result of the comparison) and returns a Boolean.
-   *                        Defines the condition under which the data in both DataFrames is considered synchronized.
-   *                        For example (_ > 0.7) denoting metric value > 0.7 or 70% of records.
-   * @param hint            Optional. Additional context or information about the synchronization check.
-   *                        Helpful for understanding the intent or specifics of the check. Default is None.
-   * @return                A [[com.amazon.deequ.checks.Check]] object representing the outcome
-   *                        of the synchronization check. This object can be used in Deequ's verification suite to
-   *                        assert data quality constraints.
+   * @param otherDataset The DataFrame to be compared with the current one. Analyzed in conjunction with the
+   *                     current DataFrame to assess how well the two datasets match.
+   * @param keyColumnMappings  A map defining the column correlations between the current DataFrame and
+   *                           otherDataset. Keys represent column names in the current DataFrame, and values are
+   *                           the corresponding column names in otherDataset.
+   * @param assertion A function that takes a Double (result of the comparison) and returns a Boolean. Defines the
+   *                  condition under which the data in both DataFrames is considered a match. For example
+   *                  (_ > 0.7) denoting metric value > 0.7 or 70% of records.
+   * @param matchColumnMappings A map defining the column correlations between the current DataFrame and
+   *                            otherDataset. These are the columns which are checked for equality after joining.
+   *                            Optional; defaults to None, in which case it is derived from `keyColumnMappings`.
+   * @param hint Optional. Additional context or information about the dataset match check.
+   *             Helpful for understanding the intent or specifics of the check. Default is None.
+   * @return A [[com.amazon.deequ.checks.Check]] object representing the outcome of the dataset match check.
+   *         This object can be used in Deequ's verification suite to assert data quality constraints.
    *
    */
-  def isDataSynchronized(otherDf: DataFrame, columnMappings: Map[String, String], assertion: Double => Boolean,
-                         hint: Option[String] = None): Check = {
-    val dataSyncAnalyzer = DataSynchronizationAnalyzer(otherDf, columnMappings, assertion)
-    val constraint = AnalysisBasedConstraint[DataSynchronizationState, Double, Double](dataSyncAnalyzer, assertion,
+  def doesDatasetMatch(otherDataset: DataFrame,
+                       keyColumnMappings: Map[String, String],
+                       assertion: Double => Boolean,
+                       matchColumnMappings: Option[Map[String, String]] = None,
+                       hint: Option[String] = None): Check = {
+    val dataMatchAnalyzer = DatasetMatchAnalyzer(otherDataset, keyColumnMappings, assertion, matchColumnMappings)
+    val constraint = AnalysisBasedConstraint[DatasetMatchState, Double, Double](dataMatchAnalyzer, assertion,
       hint = hint)
     addConstraint(constraint)
   }

diff --git a/src/main/scala/com/amazon/deequ/comparison/ComparisonResult.scala b/src/main/scala/com/amazon/deequ/comparison/ComparisonResult.scala
@@ -21,6 +21,6 @@ sealed trait ComparisonResult
 case class ComparisonFailed(errorMessage: String, ratio: Double = 0) extends ComparisonResult
 case class ComparisonSucceeded(ratio: Double = 0) extends ComparisonResult
 
-case class DataSynchronizationFailed(errorMessage: String, passedCount: Option[Long] = None,
-                                     totalCount: Option[Long] = None) extends ComparisonResult
-case class DataSynchronizationSucceeded(passedCount: Long, totalCount: Long) extends ComparisonResult
+case class DatasetMatchFailed(errorMessage: String, passedCount: Option[Long] = None,
+                              totalCount: Option[Long] = None) extends ComparisonResult
+case class DatasetMatchSucceeded(passedCount: Long, totalCount: Long) extends ComparisonResult