Skip to content

Commit

Permalink
Support Percentile in PPL
Browse files Browse the repository at this point in the history
* Support Percentile in PPL

Signed-off-by: Lantao Jin <[email protected]>

* Remove ANSI SQL percentile syntax

Signed-off-by: Lantao Jin <[email protected]>

* add more unit tests and increase test coverage

Signed-off-by: Lantao Jin <[email protected]>

* increase test coverage

Signed-off-by: Lantao Jin <[email protected]>

* address comments and add docs

Signed-off-by: Lantao Jin <[email protected]>

* add examples in doc

Signed-off-by: Lantao Jin <[email protected]>

* fix doctest failure and add more integ tests

Signed-off-by: Lantao Jin <[email protected]>

* remove useless code and antlr4 files

Signed-off-by: Lantao Jin <[email protected]>

---------

Signed-off-by: Lantao Jin <[email protected]>
  • Loading branch information
LantaoJin authored Jun 6, 2024
1 parent 3f1e3bd commit d767868
Show file tree
Hide file tree
Showing 27 changed files with 1,375 additions and 71 deletions.
1 change: 1 addition & 0 deletions core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ dependencies {
api "com.fasterxml.jackson.core:jackson-databind:${versions.jackson_databind}"
api "com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}"
api group: 'com.google.code.gson', name: 'gson', version: '2.8.9'
api group: 'com.tdunning', name: 't-digest', version: '3.3'
api project(':common')

testImplementation('org.junit.jupiter:junit-jupiter:5.9.3')
Expand Down
12 changes: 12 additions & 0 deletions core/src/main/java/org/opensearch/sql/expression/DSL.java
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,18 @@ public static Aggregator max(Expression... expressions) {
return aggregate(BuiltinFunctionName.MAX, expressions);
}

/**
* OpenSearch uses T-Digest to approximate percentile, so PERCENTILE and PERCENTILE_APPROX are the
* same function.
*/
public static Aggregator percentile(Expression... expressions) {
return percentileApprox(expressions);
}

public static Aggregator percentileApprox(Expression... expressions) {
return aggregate(BuiltinFunctionName.PERCENTILE_APPROX, expressions);
}

private static Aggregator aggregate(BuiltinFunctionName functionName, Expression... expressions) {
return compile(FunctionProperties.None, functionName, expressions);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ public static void register(BuiltinFunctionRepository repository) {
repository.register(stddevSamp());
repository.register(stddevPop());
repository.register(take());
repository.register(percentileApprox());
}

private static DefaultFunctionResolver avg() {
Expand Down Expand Up @@ -235,4 +236,46 @@ private static DefaultFunctionResolver take() {
.build());
return functionResolver;
}

private static DefaultFunctionResolver percentileApprox() {
FunctionName functionName = BuiltinFunctionName.PERCENTILE_APPROX.getName();
DefaultFunctionResolver functionResolver =
new DefaultFunctionResolver(
functionName,
new ImmutableMap.Builder<FunctionSignature, FunctionBuilder>()
.put(
new FunctionSignature(functionName, ImmutableList.of(INTEGER, DOUBLE)),
(functionProperties, arguments) ->
PercentileApproximateAggregator.percentileApprox(arguments, INTEGER))
.put(
new FunctionSignature(functionName, ImmutableList.of(INTEGER, DOUBLE, DOUBLE)),
(functionProperties, arguments) ->
PercentileApproximateAggregator.percentileApprox(arguments, INTEGER))
.put(
new FunctionSignature(functionName, ImmutableList.of(LONG, DOUBLE)),
(functionProperties, arguments) ->
PercentileApproximateAggregator.percentileApprox(arguments, LONG))
.put(
new FunctionSignature(functionName, ImmutableList.of(LONG, DOUBLE, DOUBLE)),
(functionProperties, arguments) ->
PercentileApproximateAggregator.percentileApprox(arguments, LONG))
.put(
new FunctionSignature(functionName, ImmutableList.of(FLOAT, DOUBLE)),
(functionProperties, arguments) ->
PercentileApproximateAggregator.percentileApprox(arguments, FLOAT))
.put(
new FunctionSignature(functionName, ImmutableList.of(FLOAT, DOUBLE, DOUBLE)),
(functionProperties, arguments) ->
PercentileApproximateAggregator.percentileApprox(arguments, FLOAT))
.put(
new FunctionSignature(functionName, ImmutableList.of(DOUBLE, DOUBLE)),
(functionProperties, arguments) ->
PercentileApproximateAggregator.percentileApprox(arguments, DOUBLE))
.put(
new FunctionSignature(functionName, ImmutableList.of(DOUBLE, DOUBLE, DOUBLE)),
(functionProperties, arguments) ->
PercentileApproximateAggregator.percentileApprox(arguments, DOUBLE))
.build());
return functionResolver;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.expression.aggregation;

import static org.opensearch.sql.data.model.ExprValueUtils.doubleValue;
import static org.opensearch.sql.utils.ExpressionUtils.format;

import com.tdunning.math.stats.AVLTreeDigest;
import java.util.List;
import org.opensearch.sql.common.utils.StringUtils;
import org.opensearch.sql.data.model.ExprNullValue;
import org.opensearch.sql.data.model.ExprValue;
import org.opensearch.sql.data.type.ExprCoreType;
import org.opensearch.sql.expression.Expression;
import org.opensearch.sql.expression.function.BuiltinFunctionName;

/** Aggregator to calculate approximate percentile. */
public class PercentileApproximateAggregator
extends Aggregator<PercentileApproximateAggregator.PercentileApproximateState> {

public static Aggregator percentileApprox(List<Expression> arguments, ExprCoreType returnType) {
return new PercentileApproximateAggregator(arguments, returnType);
}

public PercentileApproximateAggregator(List<Expression> arguments, ExprCoreType returnType) {
super(BuiltinFunctionName.PERCENTILE_APPROX.getName(), arguments, returnType);
if (!ExprCoreType.numberTypes().contains(returnType)) {
throw new IllegalArgumentException(
String.format("percentile aggregation over %s type is not supported", returnType));
}
}

@Override
public PercentileApproximateState create() {
if (getArguments().size() == 2) {
return new PercentileApproximateState(getArguments().get(1).valueOf().doubleValue());
} else {
return new PercentileApproximateState(
getArguments().get(1).valueOf().doubleValue(),
getArguments().get(2).valueOf().doubleValue());
}
}

@Override
protected PercentileApproximateState iterate(ExprValue value, PercentileApproximateState state) {
state.evaluate(value);
return state;
}

@Override
public String toString() {
return StringUtils.format("%s(%s)", "percentile", format(getArguments()));
}

/**
* PercentileApproximateState is used to store the AVLTreeDigest state for percentile estimation.
*/
protected static class PercentileApproximateState extends AVLTreeDigest
implements AggregationState {
// The compression level for the AVLTreeDigest, keep the same default value as OpenSearch core.
public static final double DEFAULT_COMPRESSION = 100.0;
private final double percent;

PercentileApproximateState(double percent) {
super(DEFAULT_COMPRESSION);
if (percent < 0.0 || percent > 100.0) {
throw new IllegalArgumentException("out of bounds percent value, must be in [0, 100]");
}
this.percent = percent / 100.0;
}

/**
* Constructor for specifying both percent and compression level.
*
* @param percent the percent to compute, must be in [0, 100]
* @param compression the compression factor of the t-digest sketches used
*/
PercentileApproximateState(double percent, double compression) {
super(compression);
if (percent < 0.0 || percent > 100.0) {
throw new IllegalArgumentException("out of bounds percent value, must be in [0, 100]");
}
this.percent = percent / 100.0;
}

public void evaluate(ExprValue value) {
this.add(value.doubleValue());
}

@Override
public ExprValue result() {
return this.size() == 0 ? ExprNullValue.of() : doubleValue(this.quantile(percent));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,8 @@ public enum BuiltinFunctionName {
STDDEV_POP(FunctionName.of("stddev_pop")),
// take top documents from aggregation bucket.
TAKE(FunctionName.of("take")),
// t-digest percentile which is used in OpenSearch core by default.
PERCENTILE_APPROX(FunctionName.of("percentile_approx")),
// Not always an aggregation query
NESTED(FunctionName.of("nested")),

Expand Down Expand Up @@ -279,6 +281,8 @@ public enum BuiltinFunctionName {
.put("stddev_pop", BuiltinFunctionName.STDDEV_POP)
.put("stddev_samp", BuiltinFunctionName.STDDEV_SAMP)
.put("take", BuiltinFunctionName.TAKE)
.put("percentile", BuiltinFunctionName.PERCENTILE_APPROX)
.put("percentile_approx", BuiltinFunctionName.PERCENTILE_APPROX)
.build();

public static Optional<BuiltinFunctionName> of(String str) {
Expand Down
Loading

0 comments on commit d767868

Please sign in to comment.