winoros · AilinKid · Mar 17, 2022 · Mar 18, 2022 · Mar 19, 2022 · Mar 23, 2022
diff --git a/planner/core/logical_plan_builder.go b/planner/core/logical_plan_builder.go
diff --git a/planner/core/logical_plans.go b/planner/core/logical_plans.go
@@ -15,7 +15,9 @@
 package core
 
 import (
+	"fmt"
 	"math"
+	"strings"
 
 	"github.com/pingcap/tidb/expression"
 	"github.com/pingcap/tidb/expression/aggregation"
@@ -198,6 +200,11 @@ func (p *LogicalJoin) extractFDForSemiJoin(filtersFromApply []expression.Express
 	// 1: since semi join will keep the part or all rows of the outer table, it's outer FD can be saved.
 	// 2: the un-projected column will be left for the upper layer projection or already be pruned from bottom up.
 	outerFD, _ := p.children[0].ExtractFD(), p.children[1].ExtractFD()
+	outerAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset()
+	if outerAcrossBlock {
+		outerFD.HasAggBuilt = false
+		outerFD.GroupByCols.Clear()
+	}
 	fds := outerFD
 
 	eqCondSlice := expression.ScalarFuncs2Exprs(p.EqualConditions)
@@ -215,6 +222,11 @@ func (p *LogicalJoin) extractFDForSemiJoin(filtersFromApply []expression.Express
 
 func (p *LogicalJoin) extractFDForInnerJoin(filtersFromApply []expression.Expression) *fd.FDSet {
 	leftFD, rightFD := p.children[0].ExtractFD(), p.children[1].ExtractFD()
+	leftAcrossBlock, rightAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset(), p.SelectBlockOffset() != p.children[1].SelectBlockOffset()
+	if leftAcrossBlock {
+		leftFD.HasAggBuilt = false
+		leftFD.GroupByCols.Clear()
+	}
 	fds := leftFD
 	fds.MakeCartesianProduct(rightFD)
 
@@ -245,16 +257,19 @@ func (p *LogicalJoin) extractFDForInnerJoin(filtersFromApply []expression.Expres
 			fds.HashCodeToUniqueID[k] = v
 		}
 	}
-	for i, ok := rightFD.GroupByCols.Next(0); ok; i, ok = rightFD.GroupByCols.Next(i + 1) {
-		fds.GroupByCols.Insert(i)
+	if !rightAcrossBlock {
+		for i, ok := rightFD.GroupByCols.Next(0); ok; i, ok = rightFD.GroupByCols.Next(i + 1) {
+			fds.GroupByCols.Insert(i)
+		}
+		fds.HasAggBuilt = fds.HasAggBuilt || rightFD.HasAggBuilt
 	}
-	fds.HasAggBuilt = fds.HasAggBuilt || rightFD.HasAggBuilt
 	p.fdSet = fds
 	return fds
 }
 
 func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expression) *fd.FDSet {
 	outerFD, innerFD := p.children[0].ExtractFD(), p.children[1].ExtractFD()
+	outerAcrossBlock, innerAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset(), p.SelectBlockOffset() != p.children[1].SelectBlockOffset()
 	innerCondition := p.RightConditions
 	outerCondition := p.LeftConditions
 	outerCols, innerCols := fd.NewFastIntSet(), fd.NewFastIntSet()
@@ -266,6 +281,7 @@ func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expres
 	}
 	if p.JoinType == RightOuterJoin {
 		innerFD, outerFD = outerFD, innerFD
+		outerAcrossBlock, innerAcrossBlock = innerAcrossBlock, outerAcrossBlock
 		innerCondition = p.LeftConditions
 		outerCondition = p.RightConditions
 		innerCols, outerCols = outerCols, innerCols
@@ -346,9 +362,16 @@ func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expres
 			}
 		}
 	}
-
+	if outerAcrossBlock {
+		outerFD.HasAggBuilt = false
+		outerFD.GroupByCols.Clear()
+	}
 	fds := outerFD
-	fds.MakeOuterJoin(innerFD, filterFD, outerCols, innerCols, &opt)
+
+	if strings.HasPrefix(p.ctx.GetSessionVars().StmtCtx.OriginalSQL, "select c1.a, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b") {
+		fmt.Println(1)
+	}
+	fds.MakeOuterJoin(innerFD, filterFD, outerCols, innerCols, &opt, innerAcrossBlock)
 	p.fdSet = fds
 	return fds
 }
@@ -578,6 +601,13 @@ func (p *LogicalProjection) ExtractFD() *fd.FDSet {
 	fds.MakeNotNull(notnullColsUniqueIDs)
 	// select max(a) from t group by b, we should project both `a` & `b` to maintain the FD down here, even if select-fields only contain `a`.
 	fds.ProjectCols(outputColsUniqueIDs.Union(fds.GroupByCols))
+	if fds.HasAggBuilt && fds.GroupByCols.Only1Zero() && p.baseLogicalPlan.FDChecked {
+		// maxOneRow is delayed from agg's ExtractFD logic since some details listed in it.
+		fds.MaxOneRow(outputColsUniqueIDs)
+		// for select * from view (include agg), outer projection don't have to check select list with the inner group-by flag.
+		fds.HasAggBuilt = false
+		fds.GroupByCols.Clear()
+	}
 	// just trace it down in every operator for test checking.
 	p.fdSet = fds
 	return fds
@@ -966,7 +996,7 @@ func (p *LogicalSelection) ExtractFD() *fd.FDSet {
 	// join's schema will miss t2.a while join.full schema has. since selection
 	// itself doesn't contain schema, extracting schema should tell them apart.
 	var columns []*expression.Column
-	if join, ok := p.children[0].(*LogicalJoin); ok {
+	if join, ok := p.children[0].(*LogicalJoin); ok && join.fullSchema != nil {
 		columns = join.fullSchema.Columns
 	} else {
 		columns = p.Schema().Columns
@@ -984,19 +1014,6 @@ func (p *LogicalSelection) ExtractFD() *fd.FDSet {
 	// extract equivalence cols.
 	equivUniqueIDs := extractEquivalenceCols(p.Conditions, p.SCtx(), fds)
 
-	// after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence
-	// cols pointing to outer equivalence cols.  eg: t left join t1 on t.a = t1.b, leading a
-	// lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all
-	// left rows, once there is a null-refusing predicate on the inner side on upper layer, this
-	// can be equivalence again. (the outer rows left are all coming from equal matching)
-	//
-	// why not just makeNotNull of them, because even a non-equiv-related inner col can also
-	// refuse supplied null values.
-	if fds.Rule333Equiv.InnerCols.Len() != 0 && notnullColsUniqueIDs.Intersects(fds.Rule333Equiv.InnerCols) {
-		// restore/re-strength FDs from rule 333
-		fds.MakeRestoreRule333()
-	}
-
 	// apply operator's characteristic's FD setting.
 	fds.MakeNotNull(notnullColsUniqueIDs)
 	fds.AddConstants(constUniqueIDs)
@@ -1059,11 +1076,26 @@ func (la *LogicalApply) ExtractFD() *fd.FDSet {
 			}
 		}
 	}
+	// select (select t1.a from t1 where t1.rid = t2.id), count(t2.b) from t2 group by (t2.id)
+	// for correlated scalar sub-query, the whole sub-query will be projected as a new column for example here.
+	// while for every same t2.id, this sub-query's scalar output must be the same, actually it's a kind of strict FD here.
+	applyStrictDetermine := fd.NewFastIntSet()
+	applyStrictDependency := fd.NewFastIntSet()
+	if innerPlan.Schema().Len() == 1 && len(deduplicateCorrelatedCols) > 0 {
+		// single column in apply join inner side will be output directly.
+		for _, cc := range deduplicateCorrelatedCols {
+			applyStrictDetermine.Insert(int(cc.UniqueID))
+		}
+		applyStrictDependency.Insert(int(innerPlan.Schema().Columns[0].UniqueID))
+	}
+
 	switch la.JoinType {
 	case InnerJoin:
 		return la.extractFDForInnerJoin(eqCond)
 	case LeftOuterJoin, RightOuterJoin:
-		return la.extractFDForOuterJoin(eqCond)
+		fds := la.extractFDForOuterJoin(eqCond)
+		fds.AddStrictFunctionalDependency(applyStrictDetermine, applyStrictDependency)
+		return fds
 	case SemiJoin:
 		return la.extractFDForSemiJoin(eqCond)
 	default:

diff --git a/planner/core/plan.go b/planner/core/plan.go
@@ -377,7 +377,8 @@ type baseLogicalPlan struct {
 	// including eliminating unnecessary DISTINCT operators, simplifying ORDER BY columns,
 	// removing Max1Row operators, and mapping semi-joins to inner-joins.
 	// for now, it's hard to maintain in individual operator, build it from bottom up when using.
-	fdSet *fd.FDSet
+	fdSet     *fd.FDSet
+	FDChecked bool
 }
 
 // ExtractFD return the children[0]'s fdSet if there are no adding/removing fd in this logic plan.
@@ -386,8 +387,13 @@ func (p *baseLogicalPlan) ExtractFD() *fd.FDSet {
 		return p.fdSet
 	}
 	fds := &fd.FDSet{HashCodeToUniqueID: make(map[string]int)}
+	// isolation between different logical query blocks.
+	acrossBlock := false
 	for _, ch := range p.children {
-		fds.AddFrom(ch.ExtractFD())
+		if p.SelectBlockOffset() != ch.SelectBlockOffset() {
+			acrossBlock = true
+		}
+		fds.AddFrom(ch.ExtractFD(), acrossBlock)
 	}
 	return fds
 }

diff --git a/planner/funcdep/doc.go b/planner/funcdep/doc.go
@@ -10,3 +10,154 @@ package funcdep
 //   https://cs.uwaterloo.ca/research/tr/2000/11/CS-2000-11.thesis.pdf
 
 // TODO: Add the RFC design.
+
+// NOTE 1.
+// When handling a lax FD, we don't care about null values in the dependant, which means
+// that making the determinant columns not-null is enough to turn a lax FD into a strict one.
+
+// The definition of "lax" used in the paper differs from the definition used by this
+// library. For a lax dependency A~~>B, the paper allows this set of rows:
+//
+//	a  b
+//	-------
+//	1  1
+//	1  NULL
+//
+//	This alternate definition is briefly covered in section 2.5.3.2 of the paper (see definition
+//	2.19). The reason for this change is to allow a lax dependency to be upgraded to a strict
+//	dependency more readily, needing only the determinant columns to be not-null rather than
+//  both determinant and dependant columns.
+//
+// This relies on the condition that, for a given definite determinant value of a lax FD, there
+// are never two different definite dependant values. That holds, because nothing in this
+// library can derive an FD that violates it.
+//
+// Even in our implementation of outer join, the only way to produce duplicate definite
+// determinant values is the join predicate. But for now, we only maintain the equivalences
+// and some strict FDs derived from it.
+//
+//   t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1
+//  a  b  |  c     d     e
+//  ------+----------------
+//  1  1  |  1    NULL   1
+//  1  2  | NULL  NULL  NULL
+//  2  1  | NULL  NULL  NULL
+//
+// Actually it is possible: the lax FD {a} -> {c} can be derived, but it is not that useful. We only
+// maintain {c} ~> {a} after the outer join. Besides, there are two Cond-FDs that should be
+// preserved, waiting to become visible again once a null-rejecting predicate is applied to their
+// null-constraint columns. (see below)
+//
+// NOTE 2.
+// When handling an outer join, we won't produce a lax FD with duplicate definite determinant values
+// and different dependant values.
+//
+// In the implementation, we come across some lax FDs that depend on null-rejection of some other
+// cols. For example:
+//   t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1
+//  a  b  |  c     d     e
+//  ------+----------------
+//  1  1  |  1    NULL   1
+//  1  2  | NULL  NULL  NULL
+//  2  1  | NULL  NULL  NULL
+//
+// Here the constant FD {} -> {b} no longer exists after the outer join is done. Notice the null-constraint
+// {c,d,e} -| {c,d,e}: this FD should be preserved and will become visible again when some null-rejecting
+// predicate takes effect on the null-constraint cols.
+//
+// It's the same for the strict equivalence {t.a} = {t1.c}. Notice there is no lax equivalence here, because
+// the left side can't be guaranteed to be definite or null, like a=2 here. Let's collect all of these
+// on-condition FDs, each paired with a null-constraint column set, and name them Cond-FDs.
+//
+// Lax equivalences are theoretically possible, but one won't be constructed from an outer join unless
+// t already has a constant FD on column `a` before the outer join runs. So a lax equivalence has some
+// pre-conditions, as you can see, and it can't cover the case shown above. Let us handle it the way a
+// Cond-FD does.
+//
+// The FDs constructed from the join predicate should be treated as Cond-FDs, like the equivalence
+// {a} == {c} and the constant FD {b} = 1 here (if the join condition were e=1, it would be included too).
+// We can say that for every matched row these FDs are valid, while for the other rows the inner side is
+// filled with null values. So these FDs are stored as ncEdges whose nc condition is all inner table cols.
+//
+// We introduce an invisible FD with null-constraint columns, named Cond-FD, to solve the problem above.
+// For multiple nested left joins, take the following case as an example.
+//    a,b         c,d,e
+// 	-----------+-----------
+//   1    2    |    1  1  1
+// 	 2    2    |
+//  -----------+-----------
+//
+//  left join on (a=c) res:
+//   a   b    c     d     e
+//  -------------------------
+//   1   2    1     1     1
+//   2   2 +- null null  null -+
+//         |                   |
+//         +-------------------+
+//                              \
+//                               \
+//  the Cond-FD are < a=c with {c,d,e} > the latter is as null constraint cols
+//
+//    e,f
+//  -----------------------
+//   1   2
+//   2   2
+//   3   3
+//  -----------------------
+//
+//  left join on (e=a) res:
+//   e   f    a     b      c    d    e
+//  -----------------------------------
+//   1   2    1     2      1    1    1
+//   2   2    2     2  +- null null null --+---------------> Cond-FD are <a=c with {c,d,e}> still exists.
+//   3   3 +-null null |  null null null   |---+
+//         |           +-------------------+   |
+//         +-----------------------------------+-----------> New Cond-FD are <e=a with {a,b,c,d,e}> occurs.
+//
+//
+// The old Cond-FD with null-constraint column set {c,d,e} is preserved, because the newly appended cols are all null too.
+// The new Cond-FD with null-constraint column set {a,b,c,d,e} is also meaningful: even if the null-rejected column
+// is one of {c,d,e}, which may remove one of the matched rows from the result, the equivalence {a}={e} still holds.
+//
+// Suppose that the result of the first left join is like:
+//  left join on (a=c) res:
+//   a     b    c     d     e
+//  ---------------------------
+//   1     2    1     1     1
+//   null  2  null  null  null
+//
+//  THEN: left join on (e=a) res:
+//   e   f    a     b     c    d    e
+//  ---------------------------------
+//   1   2    1     2     1    1    1
+//   2   2    null null null null null
+//   3   3    null null null null null
+//
+//  Even so, both the old Cond-FD and the new Cond-FD still exist. Visually, it seems the null-constraint column set of
+//  the old Cond-FD {c,d,e} could be expanded to {a,b,c,d,e}, but we can't infer that from the join predicate
+//  (e=a). Null-rejecting column `a` can't theoretically make the old Cond-FD visible; it just happens
+//  to filter out that row with a null value in column a.
+//
+// Think about adding one more row in first left join result.
+//
+//  left join on (a=c) res:
+//   a     b    c     d     e
+//  ---------------------------
+//   1     2    1     1     1
+//   null  2  null  null  null
+//   3     3  null  null  null
+//
+//  THEN: left join on (e=a) res:
+//   e   f    a     b     c    d    e
+//  ---------------------------------
+//   1   2    1     2     1    1    1
+//   2   2    null null null null null
+//   3   3    3     3   null null null
+//
+//  Conclusion:
+//  As you can see, we indeed can't use the join predicate (e=a) to expand the old Cond-FD's nc
+//  {c,d,e} to {a,b,c,d,e}. So the rule for Cond-FDs is quite simple: just keep the old ncEdges from the right side and
+//  append the new ncEdges of the current left join.
+//
+//  If the first left join result is on the outer side of the second left join, just keep the ncEdges from the left side as well,
+//  and append the new ncEdges of the current left join.
diff --git a/planner/funcdep/extract_fd_test.go b/planner/funcdep/extract_fd_test.go
@@ -349,9 +349,6 @@ func TestFDSet_MakeOuterJoin(t *testing.T) {
 	ctx := context.TODO()
 	is := testGetIS(ass, tk.Session())
 	for i, tt := range tests {
-		if i == 0 {
-			fmt.Println(1)
-		}
 		comment := fmt.Sprintf("case:%v sql:%s", i, tt.sql)
 		stmt, err := par.ParseOneStmt(tt.sql, "", "")
 		ass.Nil(err, comment)