Skip to content

Commit

Permalink
Fix chi-square test conditions (#482)
Browse files Browse the repository at this point in the history
* add population stability index function

* add population stability index function

* correct style issues

* refactor psi formula

* update imports

* add population stability index function

Bug fix of chi-square test condition

* correct style issues

---------

Co-authored-by: Bever <[email protected]>
  • Loading branch information
2 people authored and rdsharma26 committed Apr 16, 2024
1 parent 06df314 commit 7800e91
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/main/scala/com/amazon/deequ/analyzers/Distance.scala
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ object Distance {
sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran)

// If fewer than 2 categories remain, we cannot conduct the test
if (regroupedSample.keySet.size < chisquareMinDimension) {
if (regroupedExpected.keySet.size < chisquareMinDimension) {
Double.NaN
} else {
// run chi-square test and return statistics or p-value
Expand Down
28 changes: 28 additions & 0 deletions src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class KLLDistanceTest extends WordSpec with SparkContextSpec
assert(distance == 0.06015037593984962)
}


"Categorial distance should compute correct linf_robust" in {
val sample1 = scala.collection.mutable.Map(
"a" -> 10L, "b" -> 20L, "c" -> 25L, "d" -> 10L, "e" -> 5L)
Expand Down Expand Up @@ -183,6 +184,33 @@ class KLLDistanceTest extends WordSpec with SparkContextSpec
assert(distance.isNaN)
}

"Categorial chi-square distance when number of expected categories below minimum" in {
  // The expected distribution has a single category ("b") while the sample has two.
  // The chi-square implementation returns Double.NaN when too few expected categories
  // remain to conduct the test, so the result must be NaN.
  val sample = scala.collection.mutable.Map(
    "a" -> 10L, "b" -> 20L)
  val expected = scala.collection.mutable.Map(
    "b" -> 20L)
  val distance = Distance.categoricalDistance(sample, expected, method = ChisquareMethod())
  // Use isNaN (as the other NaN tests in this suite do) rather than equals(Double.NaN),
  // which only succeeds because of boxed java.lang.Double equality semantics.
  assert(distance.isNaN)
}

"Categorial chi-square distance when categories do not match" in {
  // Sample and expected share no category keys ("a", "b" vs. "c", "d");
  // the chi-square distance is expected to be NaN in this situation.
  val sample = scala.collection.mutable.Map(
    "a" -> 15L, "b" -> 20L)
  val expected = scala.collection.mutable.Map(
    "c" -> 20L, "d" -> 20L)
  val distance = Distance.categoricalDistance(sample, expected, method = ChisquareMethod())
  // Use isNaN (as the other NaN tests in this suite do) rather than equals(Double.NaN),
  // which only succeeds because of boxed java.lang.Double equality semantics.
  assert(distance.isNaN)
}

"Categorial chi-square distance when number of sample categories is below minimum" in {
  // Only the sample is reduced to a single category here; the expected distribution
  // still has two. The test pins the numeric result returned for this input,
  // i.e. the computation is carried out rather than yielding NaN.
  val observedCounts = scala.collection.mutable.Map("a" -> 30L)
  val expectedCounts = scala.collection.mutable.Map("a" -> 20L, "b" -> 20L)

  val chiSquareResult =
    Distance.categoricalDistance(observedCounts, expectedCounts, method = ChisquareMethod())

  assert(chiSquareResult == 4.3204630539861455E-8)
}

"Population Stability Index (PSI) test with deciles " in {

val expected: List[BucketValue] = List(BucketValue(1.0, 1.05, 428), BucketValue(1.05, 1.1, 425),
Expand Down

0 comments on commit 7800e91

Please sign in to comment.