Patch summary (free text before the first "diff --git" header):

  * CJoinOrderDPv2::AddSelectNodeForRemainingEdges: an unused edge is now put
    into the select node on top of the join only when pedge->m_loj_num is 0
    (presumably: the edge does not belong to a LEFT OUTER JOIN -- confirm
    against the edge-info definition).
  * bfv_joins regression test (src/test/regress and contrib/pax_storage):
    adds a case with two LEFT JOINs whose ON-clause is the same boolean column
    of the outer table, filtered by "IS NULL" on the second inner table.  The
    SQL script and both expected outputs (bfv_joins.out and
    bfv_joins_optimizer.out) are updated in each tree.

diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins.out
index 31515680256..e82e9e158c7 100644
--- a/contrib/pax_storage/src/test/regress/expected/bfv_joins.out
+++ b/contrib/pax_storage/src/test/regress/expected/bfv_joins.out
@@ -4190,6 +4190,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true);
 ANALYZE ext_stats_tbl;
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 ERROR:  FULL JOIN is only supported with merge-joinable or hash-joinable join conditions
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+  where loj_bool_y2.c1 is null
+  order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out
index af48a5dd8d9..6426bf4f8fb 100644
--- a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out
+++ b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out
@@ -4215,6 +4215,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true);
 ANALYZE ext_stats_tbl;
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 ERROR:  FULL JOIN is only supported with merge-joinable or hash-joinable join conditions
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+  where loj_bool_y2.c1 is null
+  order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql b/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql
index edc39f58a7d..cb4acd0a9c6 100644
--- a/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql
+++ b/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql
@@ -604,6 +604,26 @@ ANALYZE ext_stats_tbl;
 
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+create table loj_bool_y1(c1 boolean);
+create table loj_bool_y2(c1 boolean);
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+  where loj_bool_y2.c1 is null
+  order by 1, 2;
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/src/backend/gporca/libgpopt/src/xforms/CJoinOrderDPv2.cpp b/src/backend/gporca/libgpopt/src/xforms/CJoinOrderDPv2.cpp
index c6c5a12500b..3f9445abaf3 100644
--- a/src/backend/gporca/libgpopt/src/xforms/CJoinOrderDPv2.cpp
+++ b/src/backend/gporca/libgpopt/src/xforms/CJoinOrderDPv2.cpp
@@ -767,7 +767,7 @@ CJoinOrderDPv2::AddSelectNodeForRemainingEdges(CExpression *join_expr)
 			// we will have to repeat this check
 			pedge->m_fUsed = false;
 		}
-		else
+		else if (0 == pedge->m_loj_num)
 		{
 			// found an unused edge, this one will need to go into
 			// a select node on top of the join
diff --git a/src/test/regress/expected/bfv_joins.out b/src/test/regress/expected/bfv_joins.out
index da6e7481318..ff7947488c8 100644
--- a/src/test/regress/expected/bfv_joins.out
+++ b/src/test/regress/expected/bfv_joins.out
@@ -4235,6 +4235,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 (1 row)
 
 reset optimizer;
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+  where loj_bool_y2.c1 is null
+  order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/src/test/regress/expected/bfv_joins_optimizer.out b/src/test/regress/expected/bfv_joins_optimizer.out
index 934b682492b..d8cb7b7a425 100644
--- a/src/test/regress/expected/bfv_joins_optimizer.out
+++ b/src/test/regress/expected/bfv_joins_optimizer.out
@@ -4252,6 +4252,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 (1 row)
 
 reset optimizer;
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+  where loj_bool_y2.c1 is null
+  order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/src/test/regress/sql/bfv_joins.sql b/src/test/regress/sql/bfv_joins.sql
index 3a0fca09fc7..1dca58051c8 100644
--- a/src/test/regress/sql/bfv_joins.sql
+++ b/src/test/regress/sql/bfv_joins.sql
@@ -649,6 +649,26 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 
 reset optimizer;
 
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+create table loj_bool_y1(c1 boolean);
+create table loj_bool_y2(c1 boolean);
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+  where loj_bool_y2.c1 is null
+  order by 1, 2;
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';