From 1a2091965f19a9ae415c8f58ae7d488c8bd1b070 Mon Sep 17 00:00:00 2001
From: Jianghua Yang <yjhjstz@gmail.com>
Date: Thu, 2 Jul 2026 13:54:53 +0800
Subject: [PATCH 1/2] ORCA: don't emit NIJ ON-pred edges into the Select above
 DPv2 join trees

When two non-inner joins in an NAry join have structurally identical ON
predicates, ORCA could silently drop or misplace predicates, producing
wrong results:

  select x.c1, y2.c1 from x left join y1 on x.c1
                            left join y2 on x.c1
   where y2.c1 is null;

returned 0 rows instead of the two null-padded FALSE rows, because a
copy of the ON pred ended up as a scan filter on x.

Root cause: CJoinOrderDPv2's m_expression_to_edge_map is keyed on
structural equality (CExpression::HashValue / CUtils::Equals). With two
structurally identical ON preds, RecursivelyMarkEdgesAsUsed can only
ever mark one of the duplicate edges as used, so
AddSelectNodeForRemainingEdges treated the other edge as a leftover
WHERE predicate and emitted it into a Select on top of the join tree.
The normalizer then legitimately pushed that Select onto the LOJ's own
outer child, filtering out rows that outer-join semantics require to be
null-padded. (The map is only populated when a WHERE predicate
references an NIJ right child, which is why the WHERE clause is needed
to trigger the bug.)

Fix at the source: skip ON-pred edges (m_loj_num > 0) when collecting
remaining edges. An NIJ's ON predicate is always applied by the join
itself when its right child is placed (IsRightChildOfNIJ), so an
"unused" ON-pred edge can only be a bookkeeping artifact of the
structural-equality map and must never be duplicated above the join.

An earlier attempt fixed this downstream, by stripping conjuncts that
structurally match the LOJ's ON pred in CNormalizer::PushThruOuterChild.
That layer cannot distinguish the leaked ON-pred copy from legitimate,
structurally identical conjuncts arriving from above, and silently
deleted user predicates:

  select * from x left join y on x.c1 where x.c1;       -- returned 3 rows, not 1
  select 1 from a t1
    left join (a t2 left join a t3 on t2.id = 1)
    on t2.id = 1;                                        -- lost the
                                                         -- Index Cond on
                                                         -- t2 and the ON
                                                         -- pred entirely

With this fix, the original repro returns the correct 2 rows with no
scan filter on x, the queries above return the same results as the
Postgres planner, and the nested-LOJ query regains Index Cond: (id = 1)
on t2.

Add the repro as a regression test in bfv_joins.
---
 .../libgpopt/src/xforms/CJoinOrderDPv2.cpp    |  2 +-
 src/test/regress/expected/bfv_joins.out       | 30 +++++++++++++++++++
 .../regress/expected/bfv_joins_optimizer.out  | 30 +++++++++++++++++++
 src/test/regress/sql/bfv_joins.sql            | 20 +++++++++++++
 4 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/backend/gporca/libgpopt/src/xforms/CJoinOrderDPv2.cpp b/src/backend/gporca/libgpopt/src/xforms/CJoinOrderDPv2.cpp
index c6c5a12500b..3f9445abaf3 100644
--- a/src/backend/gporca/libgpopt/src/xforms/CJoinOrderDPv2.cpp
+++ b/src/backend/gporca/libgpopt/src/xforms/CJoinOrderDPv2.cpp
@@ -767,7 +767,7 @@ CJoinOrderDPv2::AddSelectNodeForRemainingEdges(CExpression *join_expr)
 			// we will have to repeat this check
 			pedge->m_fUsed = false;
 		}
-		else
+		else if (0 == pedge->m_loj_num)
 		{
 			// found an unused edge, this one will need to go into
 			// a select node on top of the join
diff --git a/src/test/regress/expected/bfv_joins.out b/src/test/regress/expected/bfv_joins.out
index da6e7481318..ff7947488c8 100644
--- a/src/test/regress/expected/bfv_joins.out
+++ b/src/test/regress/expected/bfv_joins.out
@@ -4235,6 +4235,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 (1 row)
 
 reset optimizer;
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/src/test/regress/expected/bfv_joins_optimizer.out b/src/test/regress/expected/bfv_joins_optimizer.out
index 934b682492b..d8cb7b7a425 100644
--- a/src/test/regress/expected/bfv_joins_optimizer.out
+++ b/src/test/regress/expected/bfv_joins_optimizer.out
@@ -4252,6 +4252,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 (1 row)
 
 reset optimizer;
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/src/test/regress/sql/bfv_joins.sql b/src/test/regress/sql/bfv_joins.sql
index 3a0fca09fc7..1dca58051c8 100644
--- a/src/test/regress/sql/bfv_joins.sql
+++ b/src/test/regress/sql/bfv_joins.sql
@@ -649,6 +649,26 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 
 reset optimizer;
 
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+create table loj_bool_y1(c1 boolean);
+create table loj_bool_y2(c1 boolean);
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';

From ad16de30964122b2ddc5d5ddca24a97c0d09f2f7 Mon Sep 17 00:00:00 2001
From: Jianghua Yang <yjhjstz@gmail.com>
Date: Thu, 2 Jul 2026 13:58:41 +0800
Subject: [PATCH 2/2] regress: mirror LOJ ON-pred regression test into
 pax_storage regress suite

Mirror the bfv_joins regression case for the duplicate-ON-pred DPv2 bug
(two LEFT JOINs sharing the same boolean ON column, plus a WHERE
predicate on the inner side of one of them) into the pax_storage copy
of the suite.

Unlike the earlier version of this mirror, join_optimizer.out is left
untouched: with the root-cause fix in CJoinOrderDPv2 the previously
refreshed plan (Seq Scan on t2, outer Join Filter reduced to true) no
longer exists; the original expected plan with Index Cond: (id = 1) on
t2 is produced again.
---
 .../src/test/regress/expected/bfv_joins.out   | 30 +++++++++++++++++++
 .../regress/expected/bfv_joins_optimizer.out  | 30 +++++++++++++++++++
 .../src/test/regress/sql/bfv_joins.sql        | 20 +++++++++++++
 3 files changed, 80 insertions(+)

diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins.out
index 31515680256..e82e9e158c7 100644
--- a/contrib/pax_storage/src/test/regress/expected/bfv_joins.out
+++ b/contrib/pax_storage/src/test/regress/expected/bfv_joins.out
@@ -4190,6 +4190,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true);
 ANALYZE ext_stats_tbl;
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 ERROR:  FULL JOIN is only supported with merge-joinable or hash-joinable join conditions
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out
index af48a5dd8d9..6426bf4f8fb 100644
--- a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out
+++ b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out
@@ -4215,6 +4215,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true);
 ANALYZE ext_stats_tbl;
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 ERROR:  FULL JOIN is only supported with merge-joinable or hash-joinable join conditions
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql b/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql
index edc39f58a7d..cb4acd0a9c6 100644
--- a/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql
+++ b/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql
@@ -604,6 +604,26 @@ ANALYZE ext_stats_tbl;
 
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+create table loj_bool_y1(c1 boolean);
+create table loj_bool_y2(c1 boolean);
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';