Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addition of HasMax/HasMin/HasStandardDeviation/HasMean constraint suggestions #489

Merged
merged 16 commits into from
Jun 27, 2023
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ target/
.vscode/
.bloop/
.DS_Store

.scalafmt.conf
*.log
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
# build: runs `mvn clean install`.
build:
mvn clean install
# compile: runs `mvn clean compile`.
compile:
mvn clean compile
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ private[examples] object ConstraintSuggestionExample extends App {
// to suggest constraints
val suggestionResult = ConstraintSuggestionRunner()
.onData(data)
.addConstraintRules(Rules.DEFAULT)
.addConstraintRules(Rules.EXTENDED)
.run()

// We can now investigate the constraints that deequ suggested. We get a textual description
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ object Rules {
Seq(CompleteIfCompleteRule(), RetainCompletenessRule(), RetainTypeRule(),
CategoricalRangeRule(), FractionalCategoricalRangeRule(),
NonNegativeNumbersRule())

/** Rules that suggest constraints on numeric statistics: maximum, minimum, mean and
 * standard deviation.
 */
val NUMERICAL: Seq[ConstraintRule[ColumnProfile]] =
Seq(HasMax(), HasMin(), HasMean(), HasStandardDeviation())

/** The DEFAULT rules followed by the NUMERICAL rules. */
val EXTENDED: Seq[ConstraintRule[ColumnProfile]] = DEFAULT ++ NUMERICAL
}

private[suggestions] case class ConstraintSuggestionMetricsRepositoryOptions(
Expand Down
56 changes: 56 additions & 0 deletions src/main/scala/com/amazon/deequ/suggestions/rules/HasMax.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
* Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.maxConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.checks

/** Suggests a maximum-value constraint for numeric columns.
 *
 * The rule fires for a [[NumericColumnProfile]] whose maximum is defined; the suggested
 * constraint asserts that the column's maximum equals the profiled maximum.
 */
case class HasMax() extends ConstraintRule[ColumnProfile] {

  /** True only for a numeric profile that carries a maximum value. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean =
    profile match {
      case numeric: NumericColumnProfile => numeric.maximum.nonEmpty
      case _ => false
    }

  /** Builds the suggestion from the profiled maximum.
   *
   * Intended to be called after `shouldBeApplied` returned true: a non-numeric profile raises
   * a `MatchError` and an undefined maximum raises a `NoSuchElementException`.
   */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val columnName = profile.column
    val observedMax: Double = profile match {
      case numeric: NumericColumnProfile => numeric.maximum.get
    }

    val humanReadable = s"'$columnName' <= $observedMax"
    val suggested = maxConstraint(columnName, _ == observedMax)
    val generatedCode = s""".hasMax("$columnName", _ == $observedMax)"""

    ConstraintSuggestion(
      suggested,
      columnName,
      s"Maximum: $observedMax",
      humanReadable,
      this,
      generatedCode
    )
  }

  override val ruleDescription: String =
    "If we see a numeric column, we suggest a corresponding Maximum value constraint"
}
56 changes: 56 additions & 0 deletions src/main/scala/com/amazon/deequ/suggestions/rules/HasMean.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
* Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.meanConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.checks

/** Suggests a mean-value constraint for numeric columns.
 *
 * The rule fires for a [[NumericColumnProfile]] whose mean is defined; the suggested
 * constraint asserts that the column's mean equals the profiled mean.
 */
case class HasMean() extends ConstraintRule[ColumnProfile] {

  /** True only for a numeric profile that carries a mean value. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
    profile match {
      case np: NumericColumnProfile => np.mean.isDefined
      case _ => false
    }
  }

  /** Builds the suggestion from the profiled mean.
   *
   * Intended to be called after `shouldBeApplied` returned true: a non-numeric profile raises
   * a `MatchError` and an undefined mean raises a `NoSuchElementException`.
   */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    // Use the profiled mean so that the current-value label, the constraint and the generated
    // code all refer to the statistic this rule is named after.
    val mean: Double = profile match { case np: NumericColumnProfile => np.mean.get }

    // NOTE(review): the text says "<=" while the constraint below asserts equality with the
    // mean; the wording is kept unchanged in case consumers match on it - confirm and reword.
    val description = s"'${profile.column}' <= $mean"
    val constraint = meanConstraint(profile.column, _ == mean)

    ConstraintSuggestion(
      constraint,
      profile.column,
      s"Mean: $mean",
      description,
      this,
      s""".hasMean("${profile.column}", _ == $mean)"""
    )
  }

  override val ruleDescription: String = "If we see a numeric column, " +
    "we suggest a corresponding Mean value constraint"
}
55 changes: 55 additions & 0 deletions src/main/scala/com/amazon/deequ/suggestions/rules/HasMin.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
* Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.minConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** Suggests a minimum-value constraint for numeric columns.
 *
 * The rule fires for a [[NumericColumnProfile]] whose minimum is defined; the suggested
 * constraint asserts that the column's minimum equals the profiled minimum.
 */
case class HasMin() extends ConstraintRule[ColumnProfile] {

  /** True only for a numeric profile that carries a minimum value. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean =
    profile match {
      case numeric: NumericColumnProfile => numeric.minimum.nonEmpty
      case _ => false
    }

  /** Builds the suggestion from the profiled minimum.
   *
   * Intended to be called after `shouldBeApplied` returned true: a non-numeric profile raises
   * a `MatchError` and an undefined minimum raises a `NoSuchElementException`.
   */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val columnName = profile.column
    val observedMin: Double = profile match {
      case numeric: NumericColumnProfile => numeric.minimum.get
    }

    val humanReadable = s"'$columnName' >= $observedMin"
    val suggested = minConstraint(columnName, _ == observedMin)
    val generatedCode = s""".hasMin("$columnName", _ == $observedMin)"""

    ConstraintSuggestion(
      suggested,
      columnName,
      s"Minimum: $observedMin",
      humanReadable,
      this,
      generatedCode
    )
  }

  override val ruleDescription: String =
    "If we see a numeric column, we suggest a corresponding Minimum value constraint"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
* Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.standardDeviationConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.checks

/** Suggests a standard-deviation constraint for numeric columns.
 *
 * The rule fires for a [[NumericColumnProfile]] whose standard deviation is defined; the
 * suggested constraint asserts that the column's standard deviation equals the profiled one.
 */
case class HasStandardDeviation() extends ConstraintRule[ColumnProfile] {

  /** True only for a numeric profile that carries a standard deviation. */
  override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
    profile match {
      // Guard on the same field that `candidate` dereferences, so a profile that passes this
      // check can never fail on `stdDev.get`.
      case np: NumericColumnProfile => np.stdDev.isDefined
      case _ => false
    }
  }

  /** Builds the suggestion from the profiled standard deviation.
   *
   * Intended to be called after `shouldBeApplied` returned true: a non-numeric profile raises
   * a `MatchError` and an undefined standard deviation raises a `NoSuchElementException`.
   */
  override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
    val stdDev: Double = profile match { case np: NumericColumnProfile => np.stdDev.get }

    // NOTE(review): the text says "<=" while the constraint below asserts equality with the
    // standard deviation; the wording is kept unchanged in case consumers match on it.
    val description = s"'${profile.column}' <= $stdDev"
    val constraint = standardDeviationConstraint(profile.column, _ == stdDev)

    ConstraintSuggestion(
      constraint,
      profile.column,
      s"stdDev: $stdDev",
      description,
      this,
      s""".hasStandardDeviation("${profile.column}", _ == $stdDev)"""
    )
  }

  override val ruleDescription: String = "If we see a numeric column, " +
    "we suggest a corresponding standard deviation value constraint"
}
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class ConstraintSuggestionsIntegrationTest extends WordSpec with SparkContextSpe

val constraintSuggestionResult = ConstraintSuggestionRunner()
.onData(data)
.addConstraintRules(Rules.DEFAULT)
.addConstraintRules(Rules.EXTENDED)
.addConstraintRule(UniqueIfApproximatelyUniqueRule())
.run()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,46 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext
val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None)
val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None)

val completeInteger =
getFakeNumericColumnProfileWithMinMaxMeanAndStdDev(
columnName = "integer1",
completeness = 1.0,
dataType = DataTypeInstances.Integral,
mean = 12,
maximum = 123,
minimum = -123,
stdDev = 1
)

val incompleteFractional =
getFakeNumericColumnProfileWithMinMaxMeanAndStdDev(
columnName = "fractional1",
completeness = 0.9,
dataType = DataTypeInstances.Fractional,
mean = 12.5,
maximum = 123.7,
minimum = -123.89,
stdDev = 1.023
)

assert(CompleteIfCompleteRule().shouldBeApplied(complete, 1000))
assert(!CompleteIfCompleteRule().shouldBeApplied(incomplete, 1000))

// The numeric rules must not fire for the non-numeric `complete` profile.
assert(!HasMax().shouldBeApplied(complete, 1000))
assert(!HasMean().shouldBeApplied(complete, 1000))
assert(!HasMin().shouldBeApplied(complete, 1000))
assert(!HasStandardDeviation().shouldBeApplied(complete, 1000))

// Every numeric rule is checked once against each numeric fixture (integral and fractional).
assert(HasMax().shouldBeApplied(completeInteger, 1000))
assert(HasMax().shouldBeApplied(incompleteFractional, 1000))
assert(HasMean().shouldBeApplied(completeInteger, 1000))
assert(HasMean().shouldBeApplied(incompleteFractional, 1000))
assert(HasMin().shouldBeApplied(completeInteger, 1000))
assert(HasMin().shouldBeApplied(incompleteFractional, 1000))
assert(HasStandardDeviation().shouldBeApplied(completeInteger, 1000))
assert(HasStandardDeviation().shouldBeApplied(incompleteFractional, 1000))
}

"return evaluable constraint candidates" in
Expand Down
30 changes: 30 additions & 0 deletions src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

package com.amazon.deequ.utils

import com.amazon.deequ.analyzers.DataTypeInstances
import com.amazon.deequ.profiles.NumericColumnProfile
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Row
Expand Down Expand Up @@ -414,4 +416,32 @@ trait FixtureSupport {
("bar", 20)
).toDF("name", "age")
}

/** Builds a [[NumericColumnProfile]] fixture with caller-supplied summary statistics.
 *
 * Only the name, completeness, data type, minimum, maximum, mean and standard deviation are
 * configurable. The remaining fields are fixed: 1000 approximate distinct values, a
 * non-inferred data type, empty type counts, a sum of 1000.879, and no histogram, KLL sketch
 * or approximate percentiles.
 *
 * @param columnName   name of the profiled column
 * @param completeness completeness value stored in the profile
 * @param dataType     data type stored in the profile
 * @param minimum      value stored as `Some(minimum)`
 * @param maximum      value stored as `Some(maximum)`
 * @param mean         value stored as `Some(mean)`
 * @param stdDev       value stored as `Some(stdDev)`
 * @return the assembled profile
 */
def getFakeNumericColumnProfileWithMinMaxMeanAndStdDev(
    columnName: String,
    completeness: Double,
    dataType: DataTypeInstances.Value,
    minimum: Double,
    maximum: Double,
    mean: Double,
    stdDev: Double
): NumericColumnProfile = {

  NumericColumnProfile(
    column = columnName,
    completeness = completeness,
    approximateNumDistinctValues = 1000,
    dataType = dataType,
    isDataTypeInferred = false,
    typeCounts = Map[String, Long](),
    histogram = None,
    kll = None,
    mean = Some(mean),
    maximum = Some(maximum),
    minimum = Some(minimum),
    sum = Some(1000.879),
    // Honour the caller-supplied value so fixtures can differ in their standard deviation.
    stdDev = Some(stdDev),
    approxPercentiles = None
  )
}
}