update according to PR comments

- DISTINCT_COUNT_APPROX should be added to keywordsCanBeId

Signed-off-by: YANGDB <[email protected]>
opensearch-project · Nov 11, 2024 · 0ae73e4 · 0ae73e4
1 parent b7f0855
commit 0ae73e4
Show file tree

Hide file tree

Showing 7 changed files with 22 additions and 26 deletions.
diff --git a/docs/ppl-lang/ppl-rare-command.md b/docs/ppl-lang/ppl-rare-command.md
@@ -12,7 +12,7 @@ Using ``rare`` command to find the least common tuple of values of all fields in
 * N: number of results to return. **Default**: 10
 * field-list: mandatory. comma-delimited list of field names.
 * by-clause: optional. one or more fields to group the results by.
-* top_approx: approximate the count by using estimated [cardinality by HyperLogLog++ algorithm](https://spark.apache.org/docs/3.5.2/sql-ref-functions-builtin.html).
+* rare_approx: approximate the count of the rare (N) fields by using the estimated [cardinality by HyperLogLog++ algorithm](https://spark.apache.org/docs/3.5.2/sql-ref-functions-builtin.html).
 
 
 ### Example 1: Find the least common values in a field
@@ -22,6 +22,7 @@ The example finds least common gender of all the accounts.
 PPL query:
 
     os> source=accounts | rare gender;
+    os> source=accounts | rare_approx 10 gender;
     os> source=accounts | rare_approx gender;
     fetched rows / total rows = 2/2
     +----------+

diff --git a/docs/ppl-lang/ppl-top-command.md b/docs/ppl-lang/ppl-top-command.md
@@ -11,7 +11,7 @@ Using ``top`` command to find the most common tuple of values of all fields in t
 * N: number of results to return. **Default**: 10
 * field-list: mandatory. comma-delimited list of field names.
 * by-clause: optional. one or more fields to group the results by.
-* top_approx: approximate the count by using estimated [cardinality by HyperLogLog++ algorithm](https://spark.apache.org/docs/3.5.2/sql-ref-functions-builtin.html).
+* top_approx: approximate the count of the top (N) fields by using the estimated [cardinality by HyperLogLog++ algorithm](https://spark.apache.org/docs/3.5.2/sql-ref-functions-builtin.html).
 
 ### Example 1: Find the most common values in a field
 
@@ -20,7 +20,7 @@ The example finds most common gender of all the accounts.
 PPL query:
 
     os> source=accounts | top gender;
-    os> source=accounts_approx | top gender;
+    os> source=accounts | top_approx gender;
     fetched rows / total rows = 2/2
     +----------+
     | gender   |
@@ -35,7 +35,7 @@ The example finds most common gender of all the accounts.
 
 PPL query:
 
-    os> source=accounts_approx | top 1 gender;
+    os> source=accounts | top_approx 1 gender;
     fetched rows / total rows = 1/1
     +----------+
     | gender   |

diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4
@@ -25,7 +25,7 @@ EVAL:                               'EVAL';
 HEAD:                               'HEAD';
 TOP_APPROX:                         'TOP_APPROX';
 TOP:                                'TOP';
-RARE_APPROX:                         'RARE_APPROX';
+RARE_APPROX:                        'RARE_APPROX';
 RARE:                               'RARE';
 PARSE:                              'PARSE';
 METHOD:                             'METHOD';

diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4
@@ -1124,6 +1124,7 @@ keywordsCanBeId
    // AGGREGATIONS
    | statsFunctionName
    | DISTINCT_COUNT
+   | DISTINCT_COUNT_APPROX
    | PERCENTILE
    | PERCENTILE_APPROX
    | ESTDC

diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystPlanContext.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystPlanContext.java
@@ -188,13 +188,6 @@ public LogicalPlan reduce(BiFunction<LogicalPlan, LogicalPlan, LogicalPlan> tran
             return result;
         }).orElse(getPlan()));
     }
-
-    /**
-     * update context using the given action and node 
-     */
-    public CatalystPlanContext update(UnaryOperator<CatalystPlanContext> action) {
-        return action.apply(this);
-    } 
 
     /**
      * apply for each plan with the given function

diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java
@@ -370,21 +370,19 @@ public LogicalPlan visitAlias(Alias node, CatalystPlanContext context) {
 
     @Override
     public LogicalPlan visitProject(Project node, CatalystPlanContext context) {
-        context.update((ctx) -> {
-            if (node.isExcluded()) {
-                List<UnresolvedExpression> intersect = ctx.getProjectedFields().stream()
-                        .filter(node.getProjectList()::contains)
-                        .collect(Collectors.toList());
-                if (!intersect.isEmpty()) {
-                    // Fields in parent projection, but they have be excluded in child. For example,
-                    // source=t | fields - A, B | fields A, B, C will throw "[Field A, Field B] can't be resolved"
-                    throw new SyntaxCheckException(intersect + " can't be resolved");
-                }
-            } else {
-                ctx.withProjectedFields(node.getProjectList());
+        //update plan's context prior to visiting node children
+        if (node.isExcluded()) {
+            List<UnresolvedExpression> intersect = context.getProjectedFields().stream()
+                    .filter(node.getProjectList()::contains)
+                    .collect(Collectors.toList());
+            if (!intersect.isEmpty()) {
+                // Fields in parent projection, but they have been excluded in child. For example,
+                // source=t | fields - A, B | fields A, B, C will throw "[Field A, Field B] can't be resolved"
+                throw new SyntaxCheckException(intersect + " can't be resolved");
             }
-            return ctx;
-        });
+        } else {
+            context.withProjectedFields(node.getProjectList());
+        }
         LogicalPlan child = visitFirstChild(node, context);
         visitExpressionList(node.getProjectList(), context);
 

diff --git a/...rk-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTransformer.java b/...rk-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTransformer.java
@@ -26,8 +26,10 @@
 import java.util.Map;
 import java.util.function.Function;
 
+import static org.opensearch.flint.spark.ppl.OpenSearchPPLLexer.DISTINCT_COUNT_APPROX;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.ADD;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.ADDDATE;
+import static org.opensearch.sql.expression.function.BuiltinFunctionName.APPROX_COUNT_DISTINCT;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.ARRAY_LENGTH;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.DATEDIFF;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.DATE_ADD;
@@ -109,6 +111,7 @@ public interface BuiltinFunctionTransformer {
             .put(TO_JSON_STRING, "to_json")
             .put(JSON_KEYS, "json_object_keys")
             .put(JSON_EXTRACT, "get_json_object")
+            .put(APPROX_COUNT_DISTINCT, "approx_count_distinct")
             .build();
 
     /**