Skip to content

Commit

Permalink
Span in PPL statsByClause could be specified after fields (#2720)
Browse files Browse the repository at this point in the history
Signed-off-by: Lantao Jin <[email protected]>
  • Loading branch information
LantaoJin authored Jul 9, 2024
1 parent f241f34 commit c063d5e
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 5 deletions.
16 changes: 15 additions & 1 deletion docs/user/ppl/cmd/stats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ stats <aggregation>... [by-clause]
* Description: The by clause could be the fields and expressions like scalar functions and aggregation functions. Besides, the span clause can be used to split specific field into buckets in the same interval, the stats then does the aggregation by these span buckets.
* Default: If no <by-clause> is specified, the stats command returns only one row, which is the aggregation over the entire result set.

* span-expression: optional.
* span-expression: optional, at most one.

* Syntax: span(field_expr, interval_expr)
* Description: The unit of the interval expression is the natural unit by default. If the field is a date and time type field, and the interval is in date/time units, you will need to specify the unit in the interval expression. For example, to split the field ``age`` into buckets by 10 years, it looks like ``span(age, 10)``. And here is another example of time span, the span to split a ``timestamp`` field into hourly intervals, it looks like ``span(timestamp, 1h)``.
Expand Down Expand Up @@ -424,6 +424,20 @@ PPL query::
| 1 | 35 | M |
+-------+------------+----------+

Span will always be the first grouping key whatever order you specify.

PPL query::

os> source=accounts | stats count() as cnt by gender, span(age, 5) as age_span
fetched rows / total rows = 3/3
+-------+------------+----------+
| cnt | age_span | gender |
|-------+------------+----------|
| 1 | 25 | F |
| 2 | 30 | M |
| 1 | 35 | M |
+-------+------------+----------+

Example 10: Calculate the count and get email list by a gender and span
=======================================================================

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import static org.opensearch.sql.util.MatcherUtils.rows;
import static org.opensearch.sql.util.MatcherUtils.schema;
import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
import static org.opensearch.sql.util.MatcherUtils.verifyDataRowsInOrder;
import static org.opensearch.sql.util.MatcherUtils.verifySchema;
import static org.opensearch.sql.util.MatcherUtils.verifySchemaInOrder;

import java.io.IOException;
import org.json.JSONObject;
Expand Down Expand Up @@ -190,6 +192,54 @@ public void testStatsAliasedSpan() throws IOException {
verifyDataRows(response, rows(1, 20), rows(6, 30));
}

@Test
public void testStatsBySpanAndMultipleFields() throws IOException {
JSONObject response =
executeQuery(
String.format(
"source=%s | stats count() by span(age,10), gender, state", TEST_INDEX_BANK));
verifySchemaInOrder(
response,
schema("count()", null, "integer"),
schema("span(age,10)", null, "integer"),
schema("gender", null, "string"),
schema("state", null, "string"));
verifyDataRowsInOrder(
response,
rows(1, 20, "f", "VA"),
rows(1, 30, "f", "IN"),
rows(1, 30, "f", "PA"),
rows(1, 30, "m", "IL"),
rows(1, 30, "m", "MD"),
rows(1, 30, "m", "TN"),
rows(1, 30, "m", "WA"));
}

@Test
public void testStatsByMultipleFieldsAndSpan() throws IOException {
// Use verifySchemaInOrder() and verifyDataRowsInOrder() to check that the span column is always
// the first column in result whatever the order of span in query is first or last one
JSONObject response =
executeQuery(
String.format(
"source=%s | stats count() by gender, state, span(age,10)", TEST_INDEX_BANK));
verifySchemaInOrder(
response,
schema("count()", null, "integer"),
schema("span(age,10)", null, "integer"),
schema("gender", null, "string"),
schema("state", null, "string"));
verifyDataRowsInOrder(
response,
rows(1, 20, "f", "VA"),
rows(1, 30, "f", "IN"),
rows(1, 30, "f", "PA"),
rows(1, 30, "m", "IL"),
rows(1, 30, "m", "MD"),
rows(1, 30, "m", "TN"),
rows(1, 30, "m", "WA"));
}

@Test
public void testStatsPercentile() throws IOException {
JSONObject response =
Expand Down
10 changes: 10 additions & 0 deletions integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,16 @@ public static void verifySchema(JSONObject response, Matcher<JSONObject>... matc
}
}

@SafeVarargs
public static void verifySchemaInOrder(JSONObject response, Matcher<JSONObject>... matchers) {
try {
verifyInOrder(response.getJSONArray("schema"), matchers);
} catch (Exception e) {
LOG.error(String.format("verify schema failed, response: %s", response.toString()), e);
throw e;
}
}

@SafeVarargs
public static void verifyDataRows(JSONObject response, Matcher<JSONArray>... matchers) {
verify(response.getJSONArray("datarows"), matchers);
Expand Down
1 change: 1 addition & 0 deletions ppl/src/main/antlr/OpenSearchPPLParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ statsByClause
: BY fieldList
| BY bySpanClause
| BY bySpanClause COMMA fieldList
| BY fieldList COMMA bySpanClause
;

bySpanClause
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -319,11 +319,26 @@ public void testStatsCommandWithSpan() {
exprList(alias("f1", field("f1")), alias("f2", field("f2"))),
alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)),
defaultStatsArgs()));
}

@Test(expected = org.opensearch.sql.common.antlr.SyntaxCheckException.class)
public void throwExceptionIfSpanInGroupByList() {
plan("source=t | stats avg(price) by f1, f2, span(timestamp, 1h)");
assertEqual(
"source=t | stats avg(price) by b, span(timestamp, 1h)",
agg(
relation("t"),
exprList(alias("avg(price)", aggregate("avg", field("price")))),
emptyList(),
exprList(alias("b", field("b"))),
alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)),
defaultStatsArgs()));

assertEqual(
"source=t | stats avg(price) by f1, f2, span(timestamp, 1h)",
agg(
relation("t"),
exprList(alias("avg(price)", aggregate("avg", field("price")))),
emptyList(),
exprList(alias("f1", field("f1")), alias("f2", field("f2"))),
alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)),
defaultStatsArgs()));
}

@Test(expected = org.opensearch.sql.common.antlr.SyntaxCheckException.class)
Expand Down

0 comments on commit c063d5e

Please sign in to comment.