Skip to content

Commit

Permalink
Add Spark list hashing Java tests (#11379)
Browse files Browse the repository at this point in the history
This PR adds Java tests for the Spark list hashing feature added in #11292.

Depends on #11292.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Jason Lowe (https://github.com/jlowe)

URL: #11379
  • Loading branch information
bdice authored Aug 1, 2022
1 parent 8709e29 commit 35a7c81
Showing 1 changed file with 72 additions and 0 deletions.
72 changes: 72 additions & 0 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,78 @@ void testSpark32BitMurmur3HashStruct() {
}
}

@Test
void testSpark32BitMurmur3HashNestedStruct() {
try (ColumnVector strings = ColumnVector.fromStrings(
"a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
"A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
"in the MD5 hash function. This string needed to be longer.",
null, null);
ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
ColumnVector doubles = ColumnVector.fromBoxedDoubles(
0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
ColumnVector floats = ColumnVector.fromBoxedFloats(
0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
ColumnView structs1 = ColumnView.makeStructView(strings, integers);
ColumnView structs2 = ColumnView.makeStructView(structs1, doubles);
ColumnView structs3 = ColumnView.makeStructView(bools);
ColumnView structs = ColumnView.makeStructView(structs2, floats, structs3);
ColumnVector expected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools});
ColumnVector result = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structs})) {
assertColumnsAreEqual(expected, result);
}
}

@Test
void testSpark32BitMurmur3HashListsAndNestedLists() {
try (ColumnVector stringListCV = ColumnVector.fromLists(
new ListType(true, new BasicType(true, DType.STRING)),
Arrays.asList(null, "a"),
Arrays.asList("B\n", ""),
Arrays.asList("dE\"\u0100\t\u0101", " \ud720\ud721"),
Collections.singletonList("A very long (greater than 128 bytes/char string) to test a multi" +
" hash-step data point in the Murmur3 hash function. This string needed to be longer."),
Collections.singletonList(""),
null);
ColumnVector strings1 = ColumnVector.fromStrings(
"a", "B\n", "dE\"\u0100\t\u0101",
"A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
"in the Murmur3 hash function. This string needed to be longer.", null, null);
ColumnVector strings2 = ColumnVector.fromStrings(
null, "", " \ud720\ud721", null, "", null);
ColumnView stringStruct = ColumnView.makeStructView(strings1, strings2);
ColumnVector stringExpected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{stringStruct});
ColumnVector stringResult = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{stringListCV});
ColumnVector intListCV = ColumnVector.fromLists(
new ListType(true, new BasicType(true, DType.INT32)),
null,
Arrays.asList(0, -2, 3),
Collections.singletonList(Integer.MAX_VALUE),
Arrays.asList(5, -6, null),
Collections.singletonList(Integer.MIN_VALUE),
null);
ColumnVector integers1 = ColumnVector.fromBoxedInts(null, 0, null, 5, Integer.MIN_VALUE, null);
ColumnVector integers2 = ColumnVector.fromBoxedInts(null, -2, Integer.MAX_VALUE, null, null, null);
ColumnVector integers3 = ColumnVector.fromBoxedInts(null, 3, null, -6, null, null);
ColumnVector intExpected =
ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{integers1, integers2, integers3});
ColumnVector intResult = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{intListCV});
ColumnVector doubles = ColumnVector.fromBoxedDoubles(
0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
ColumnVector floats = ColumnVector.fromBoxedFloats(
0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
ColumnView structCV = ColumnView.makeStructView(intListCV, stringListCV, doubles, floats);
ColumnVector nestedExpected =
ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{intListCV, strings1, strings2, doubles, floats});
ColumnVector nestedResult =
ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structCV})) {
assertColumnsAreEqual(stringExpected, stringResult);
assertColumnsAreEqual(intExpected, intResult);
assertColumnsAreEqual(nestedExpected, nestedResult);
}
}

@Test
void testAndNullReconfigureNulls() {
try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null);
Expand Down

0 comments on commit 35a7c81

Please sign in to comment.