feat(optimizer): support m <= rank AND rank <= n (#6384)

* feat(optimizer): support m <= rank AND rank <= n
* handle more cases
* add e2e tests
* clippy
* test BETWEEN
risingwavelabs · Nov 19, 2022 · 709e1be · 709e1be
1 parent f663277
commit 709e1be
Show file tree

Hide file tree

Showing 14 changed files with 234 additions and 122 deletions.
diff --git a/e2e_test/batch/top_n/group_top_n.slt b/e2e_test/batch/top_n/group_top_n.slt
@@ -28,6 +28,19 @@ where rank <= 3;
 3  2
 3  3
 
+query II rowsort
+select x, y from (
+    select *, ROW_NUMBER() OVER (PARTITION BY x ORDER BY y) as rank from t
+)
+where rank <= 3 AND rank > 1;
+----
+1  2
+1  3
+2  2
+2  3
+3  2
+3  3
+
 query II rowsort
 select x, y from (
     select *, RANK() OVER (ORDER BY y) as rank from t

diff --git a/e2e_test/streaming/group_top_n.slt b/e2e_test/streaming/group_top_n.slt
@@ -11,6 +11,13 @@ select x, y from (
 )
 where rank <= 3;
 
+statement ok
+create materialized view mv_with_lb as 
+select x, y from (
+    select *, ROW_NUMBER() OVER (PARTITION BY x ORDER BY y) as rank from t
+)
+where rank <= 3 AND rank > 1;
+
 statement ok
 create materialized view mv_rank_no_group as 
 select x, y from (
@@ -46,6 +53,16 @@ select * from mv;
 3  2
 3  3
 
+query II rowsort
+select * from mv_with_lb;
+----
+1  2
+1  3
+2  2
+2  3
+3  2
+3  3
+
 query II rowsort
 select * from mv_rank_no_group;
 ----
@@ -78,6 +95,9 @@ DROP MATERIALIZED VIEW mv_rank_no_group;
 statement ok
 DROP MATERIALIZED VIEW mv_rank;
 
+statement ok
+DROP MATERIALIZED VIEW mv_with_lb;
+
 statement ok
 drop materialized view mv;
 

diff --git a/src/frontend/planner_test/tests/testdata/agg.yaml b/src/frontend/planner_test/tests/testdata/agg.yaml
@@ -60,13 +60,11 @@
 - sql: |
     create table t(v1 int, v2 int);
     select v1 from t group by v2;
-  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause
-    or be used in an aggregate function'
+  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause or be used in an aggregate function'
 - sql: |
     create table t(v1 int, v2 int);
     select sum(v1), v1 from t group by v2, v2;
-  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause
-    or be used in an aggregate function'
+  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause or be used in an aggregate function'
 - sql: |
     create table t(v1 int, v2 int, v3 int);
     select v3, min(v1) * avg(v1+v2) as agg from t group by v3;
@@ -117,8 +115,7 @@
     └─LogicalAgg { group_key: [(t.v1 + t.v2)], aggs: [] }
       └─LogicalProject { exprs: [(t.v1 + t.v2)] }
         └─LogicalScan { table: t, columns: [t.v1, t.v2, t._row_id] }
-- name: "test logical_agg with complex group expression \nshould complain about nested
-    agg call \n"
+- name: "test logical_agg with complex group expression \nshould complain about nested agg call \n"
   sql: |
     create table t(v1 int, v2 int);
     select avg(sum(v1 + v2)) from t group by v1 + v2;
@@ -137,8 +134,7 @@
 - sql: |
     create table t(v1 int, v2 int);
     select v1 from t group by v1 + v2;
-  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause
-    or be used in an aggregate function'
+  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause or be used in an aggregate function'
 - sql: |
     create table t(v1 int, v2 int);
     select count(v1 + v2) as cnt, sum(v1 + v2) as sum from t;
@@ -261,8 +257,7 @@
   sql: |
     create table t (v1 real, v2 int);
     select 1 from t group by v1 having v2 > 5;
-  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause
-    or be used in an aggregate function'
+  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause or be used in an aggregate function'
 - name: distinct without agg
   sql: |
     create table t (v1 int, v2 int);
@@ -555,8 +550,7 @@
   sql: |
     create table t(a int, b int);
     select avg(a) FILTER (WHERE abs(a)) AS avga from t;
-  binder_error: 'Invalid input syntax: the type of filter clause should be boolean,
-    but found Int32'
+  binder_error: 'Invalid input syntax: the type of filter clause should be boolean, but found Int32'
 - name: filter clause + subquery
   sql: |
     create table t(a int, b int);
@@ -575,8 +569,7 @@
   sql: |
     create table t(a int, b int);
     select abs(a) FILTER (WHERE a > 0) AS avga from t;
-  binder_error: 'Invalid input syntax: DISTINCT, ORDER BY or FILTER is only allowed
-    in aggregation functions, but `abs` is not an aggregation function'
+  binder_error: 'Invalid input syntax: DISTINCT, ORDER BY or FILTER is only allowed in aggregation functions, but `abs` is not an aggregation function'
 - name: prune column before filter
   sql: |
     create table t(v1 int, v2 int);
@@ -660,8 +653,7 @@
 - sql: |
     create table t(x int, y varchar);
     select string_agg(y, ',' order by y), count(distinct x) from t;
-  planner_error: 'Invalid input syntax: Order by aggregates are disallowed to occur
-    with distinct aggregates'
+  planner_error: 'Invalid input syntax: Order by aggregates are disallowed to occur with distinct aggregates'
 - sql: |
     create table t(v1 int, v2 int);
     with z(a, b) as (select count(distinct v1), count(v2) from t) select a from z;
@@ -854,8 +846,7 @@
   sql: |
     create table t (v1 int, v2 int);
     select min(v1), unnest(array[2, v2]) from t;
-  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause
-    or be used in an aggregate function'
+  planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause or be used in an aggregate function'
 - name: post-agg project set - grouped
   sql: |
     create table t (v1 int, v2 int);

diff --git a/src/frontend/planner_test/tests/testdata/array.yaml b/src/frontend/planner_test/tests/testdata/array.yaml
@@ -72,8 +72,7 @@
     └─BatchValues { rows: [[]] }
 - sql: |
     select array_cat(array[233], array[array[array[66]]]);
-  binder_error: 'Bind error: unable to find least restrictive type between integer[]
-    and integer[][][]'
+  binder_error: 'Bind error: unable to find least restrictive type between integer[] and integer[][][]'
 - sql: |
     select array_cat(array[233], 123);
   binder_error: 'Bind error: Cannot concatenate integer[] and integer'
@@ -116,8 +115,7 @@
 - name: string from/to varchar[] in implicit context
   sql: |
     values (array['a', 'b']), ('{c,' || 'd}');
-  binder_error: 'Bind error: types List { datatype: Varchar } and Varchar cannot be
-    matched'
+  binder_error: 'Bind error: types List { datatype: Varchar } and Varchar cannot be matched'
 - name: string to varchar[] in assign context
   sql: |
     create table t (v1 varchar[]);

diff --git a/src/frontend/planner_test/tests/testdata/insert.yaml b/src/frontend/planner_test/tests/testdata/insert.yaml
@@ -32,32 +32,32 @@
   sql: |
     create table t (v1 int, v2 int);
     insert into t (v1, v3) values (1, 2);
-  binder_error: "Bind error: Column v3 not found in table t"
+  binder_error: 'Bind error: Column v3 not found in table t'
 - name: Invalid column name 2
   sql: |
     create table t (v1 int, v2 int);
     insert into t (v3, v1) values (1, 2);
-  binder_error: "Bind error: Column v3 not found in table t"
+  binder_error: 'Bind error: Column v3 not found in table t'
 - name: Duplicate column
   sql: |
     create table t (v1 int, v2 int);
     insert into t (v1, v1) values (1, 2);
-  binder_error: "Bind error: Column specified more than once"
+  binder_error: 'Bind error: Column specified more than once'
 - name: To many target columns
   sql: |
     create table t (v1 int, v2 int);
     insert into t (v1, v2, v2) values (5, 6);
-  binder_error: "Bind error: INSERT has more target columns than values"
+  binder_error: 'Bind error: INSERT has more target columns than values'
 - name: Not enough target columns
   sql: |
     create table t (v1 int, v2 int);
     insert into t (v1) values (5, 6);
-  binder_error: "Bind error: INSERT has less target columns than values"
+  binder_error: 'Bind error: INSERT has less target columns than values'
 - name: insert values mismatch columns length
   sql: |
     create table t (v1 real, v2 int, v3 varchar);
     insert into t values (1, 2), (3, 4);
-  binder_error: "Bind error: INSERT has more target columns than expressions"
+  binder_error: 'Bind error: INSERT has more target columns than expressions'
 - name: insert literal null
   sql: |
     create table t(v1 int);
@@ -74,18 +74,16 @@
     BatchExchange { order: [], dist: Single }
     └─BatchInsert { table: t }
       └─BatchValues { rows: [['2020-01-01 01:02:03':Varchar::Timestamp::Time], ['03:04:05':Varchar::Time]] }
-- name:
-    a `VALUES` without insert context may be invalid on its own (compare with
-    above)
+- name: a `VALUES` without insert context may be invalid on its own (compare with above)
   sql: |
     create table t (v1 time);
     values (timestamp '2020-01-01 01:02:03'), (time '03:04:05');
-  binder_error: "Bind error: types Timestamp and Time cannot be matched"
+  binder_error: 'Bind error: types Timestamp and Time cannot be matched'
 - name: a `VALUES` with `limit` loses insert context (compare with 2 cases above)
   sql: |
     create table t (v1 time);
     insert into t values (timestamp '2020-01-01 01:02:03'), (time '03:04:05') limit 1;
-  binder_error: "Bind error: types Timestamp and Time cannot be matched"
+  binder_error: 'Bind error: types Timestamp and Time cannot be matched'
 - name: null in first row without insert context
   sql: |
     values (null), (1);
@@ -99,7 +97,7 @@
 - name: rows of different number of columns
   sql: |
     values (1), (2, 3);
-  binder_error: "Bind error: VALUES lists must all be the same length"
+  binder_error: 'Bind error: VALUES lists must all be the same length'
 - name: insert into select without cast
   sql: |
     create table t (v1 time);
@@ -123,14 +121,12 @@
   sql: |
     create table t (v1 timestamp, v2 real);
     insert into t select time '01:02:03', 4.5 from t;
-  binder_error:
-    'Bind error: cannot cast type "time without time zone" to "timestamp
-    without time zone" in Assign context'
+  binder_error: 'Bind error: cannot cast type "time without time zone" to "timestamp without time zone" in Assign context'
 - name: insert into select mismatch columns length
   sql: |
     create table t (v1 int, v2 real);
     insert into t select 2, 3, 4.5 from t;
-  binder_error: "Bind error: INSERT has more expressions than target columns"
+  binder_error: 'Bind error: INSERT has more expressions than target columns'
 - name: insert with join
   sql: |
     create table t1 (a int, b int);

diff --git a/src/frontend/planner_test/tests/testdata/join.yaml b/src/frontend/planner_test/tests/testdata/join.yaml
@@ -170,8 +170,7 @@
     LogicalJoin { type: Inner, on: (t1.v2 = t2.v2), output: all }
     ├─LogicalScan { table: t1, output_columns: [t1.v1, t1.v2], required_columns: [v1, v2], predicate: (t1.v1 > 100:Int32) }
     └─LogicalScan { table: t2, output_columns: [t2.v1, t2.v2], required_columns: [v1, v2], predicate: (t2.v1 < 1000:Int32) }
-- name: Left & right has same SomeShard distribution. There should still be exchanges
-    below hash join
+- name: Left & right has same SomeShard distribution. There should still be exchanges below hash join
   sql: |
     create table t(x int);
     create index i on t(x);
@@ -190,8 +189,7 @@
       | └─StreamTableScan { table: i, columns: [i.x, i.t._row_id], pk: [i.t._row_id], dist: UpstreamHashShard(i.x) }
       └─StreamExchange { dist: HashShard(i.x) }
         └─StreamTableScan { table: i, columns: [i.x, i.t._row_id], pk: [i.t._row_id], dist: UpstreamHashShard(i.x) }
-- name: Left & right has same SomeShard distribution. There should still be exchanges
-    below hash join
+- name: Left & right has same SomeShard distribution. There should still be exchanges below hash join
   sql: |
     create table t(x int);
     create index i on t(x);
@@ -210,8 +208,7 @@
       | └─StreamTableScan { table: i, columns: [i.x, i.t._row_id], pk: [i.t._row_id], dist: UpstreamHashShard(i.x) }
       └─StreamExchange { dist: HashShard(t.x) }
         └─StreamTableScan { table: t, columns: [t.x, t._row_id], pk: [t._row_id], dist: UpstreamHashShard(t._row_id) }
-- name: Left & right has same HashShard distribution. There should be no exchange
-    below hash join
+- name: Left & right has same HashShard distribution. There should be no exchange below hash join
   sql: |
     create table t(x int);
     create index i on t(x);
@@ -480,15 +477,13 @@
     create table t1(v1 int, v2 int);
     create table t2(v1 int, v3 int);
     select * from t1 join t2 using (v2);
-  binder_error: 'Item not found: column "v2" specified in USING clause does not exist
-    in right table'
+  binder_error: 'Item not found: column "v2" specified in USING clause does not exist in right table'
 - name: Ensure error on non-existent USING col
   sql: |
     create table t1(v1 int, v2 int);
     create table t2(v1 int, v3 int);
     select * from t1 join t2 using (v3);
-  binder_error: 'Item not found: column "v3" specified in USING clause does not exist
-    in left table'
+  binder_error: 'Item not found: column "v3" specified in USING clause does not exist in left table'
 - name: Ensure that we can correctly bind nested joins
   sql: |
     create table t1(v1 int, v2 int);

diff --git a/src/frontend/planner_test/tests/testdata/nexmark.yaml b/src/frontend/planner_test/tests/testdata/nexmark.yaml
@@ -92,8 +92,7 @@
 - id: nexmark_q2
   before:
   - create_tables
-  sql: SELECT auction, price FROM bid WHERE auction = 1007 OR auction = 1020 OR auction
-    = 2001 OR auction = 2019 OR auction = 2087;
+  sql: SELECT auction, price FROM bid WHERE auction = 1007 OR auction = 1020 OR auction = 2001 OR auction = 2019 OR auction = 2087;
   batch_plan: |
     BatchExchange { order: [], dist: Single }
     └─BatchFilter { predicate: (((((bid.auction = 1007:Int32) OR (bid.auction = 1020:Int32)) OR (bid.auction = 2001:Int32)) OR (bid.auction = 2019:Int32)) OR (bid.auction = 2087:Int32)) }

diff --git a/src/frontend/planner_test/tests/testdata/order_by.yaml b/src/frontend/planner_test/tests/testdata/order_by.yaml
@@ -160,8 +160,7 @@
   sql: |
     create table t (x int, y int);
     select distinct x from t order by y;
-  planner_error: 'Invalid input syntax: for SELECT DISTINCT, ORDER BY expressions
-    must appear in select list'
+  planner_error: 'Invalid input syntax: for SELECT DISTINCT, ORDER BY expressions must appear in select list'
 - name: No BatchSort needed, when input is already sorted
   sql: |
     create table t(v int);