Skip to content
413 changes: 351 additions & 62 deletions planner/core/logical_plan_builder.go

Large diffs are not rendered by default.

72 changes: 52 additions & 20 deletions planner/core/logical_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
package core

import (
"fmt"
"math"
"strings"

"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/expression/aggregation"
Expand Down Expand Up @@ -198,6 +200,11 @@ func (p *LogicalJoin) extractFDForSemiJoin(filtersFromApply []expression.Express
// 1: since semi join will keep part or all of the rows of the outer table, its outer FD can be saved.
// 2: the un-projected column will be left for the upper layer projection or already be pruned from bottom up.
outerFD, _ := p.children[0].ExtractFD(), p.children[1].ExtractFD()
outerAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset()
if outerAcrossBlock {
outerFD.HasAggBuilt = false
outerFD.GroupByCols.Clear()
}
fds := outerFD

eqCondSlice := expression.ScalarFuncs2Exprs(p.EqualConditions)
Expand All @@ -215,6 +222,11 @@ func (p *LogicalJoin) extractFDForSemiJoin(filtersFromApply []expression.Express

func (p *LogicalJoin) extractFDForInnerJoin(filtersFromApply []expression.Expression) *fd.FDSet {
leftFD, rightFD := p.children[0].ExtractFD(), p.children[1].ExtractFD()
leftAcrossBlock, rightAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset(), p.SelectBlockOffset() != p.children[1].SelectBlockOffset()
if leftAcrossBlock {
leftFD.HasAggBuilt = false
leftFD.GroupByCols.Clear()
}
fds := leftFD
fds.MakeCartesianProduct(rightFD)

Expand Down Expand Up @@ -245,16 +257,19 @@ func (p *LogicalJoin) extractFDForInnerJoin(filtersFromApply []expression.Expres
fds.HashCodeToUniqueID[k] = v
}
}
for i, ok := rightFD.GroupByCols.Next(0); ok; i, ok = rightFD.GroupByCols.Next(i + 1) {
fds.GroupByCols.Insert(i)
if !rightAcrossBlock {
for i, ok := rightFD.GroupByCols.Next(0); ok; i, ok = rightFD.GroupByCols.Next(i + 1) {
fds.GroupByCols.Insert(i)
}
fds.HasAggBuilt = fds.HasAggBuilt || rightFD.HasAggBuilt
}
fds.HasAggBuilt = fds.HasAggBuilt || rightFD.HasAggBuilt
p.fdSet = fds
return fds
}

func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expression) *fd.FDSet {
outerFD, innerFD := p.children[0].ExtractFD(), p.children[1].ExtractFD()
outerAcrossBlock, innerAcrossBlock := p.SelectBlockOffset() != p.children[0].SelectBlockOffset(), p.SelectBlockOffset() != p.children[1].SelectBlockOffset()
innerCondition := p.RightConditions
outerCondition := p.LeftConditions
outerCols, innerCols := fd.NewFastIntSet(), fd.NewFastIntSet()
Expand All @@ -266,6 +281,7 @@ func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expres
}
if p.JoinType == RightOuterJoin {
innerFD, outerFD = outerFD, innerFD
outerAcrossBlock, innerAcrossBlock = innerAcrossBlock, outerAcrossBlock
innerCondition = p.LeftConditions
outerCondition = p.RightConditions
innerCols, outerCols = outerCols, innerCols
Expand Down Expand Up @@ -346,9 +362,16 @@ func (p *LogicalJoin) extractFDForOuterJoin(filtersFromApply []expression.Expres
}
}
}

if outerAcrossBlock {
outerFD.HasAggBuilt = false
outerFD.GroupByCols.Clear()
}
fds := outerFD
fds.MakeOuterJoin(innerFD, filterFD, outerCols, innerCols, &opt)

if strings.HasPrefix(p.ctx.GetSessionVars().StmtCtx.OriginalSQL, "select c1.a, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b") {
fmt.Println(1)
}
fds.MakeOuterJoin(innerFD, filterFD, outerCols, innerCols, &opt, innerAcrossBlock)
p.fdSet = fds
return fds
}
Expand Down Expand Up @@ -578,6 +601,13 @@ func (p *LogicalProjection) ExtractFD() *fd.FDSet {
fds.MakeNotNull(notnullColsUniqueIDs)
// select max(a) from t group by b, we should project both `a` & `b` to maintain the FD down here, even if select-fields only contain `a`.
fds.ProjectCols(outputColsUniqueIDs.Union(fds.GroupByCols))
if fds.HasAggBuilt && fds.GroupByCols.Only1Zero() && p.baseLogicalPlan.FDChecked {
// maxOneRow is delayed from agg's ExtractFD logic since some details listed in it.
fds.MaxOneRow(outputColsUniqueIDs)
// for select * from view (including agg), the outer projection doesn't have to check the select list with the inner group-by flag.
fds.HasAggBuilt = false
fds.GroupByCols.Clear()
}
// just trace it down in every operator for test checking.
p.fdSet = fds
return fds
Expand Down Expand Up @@ -966,7 +996,7 @@ func (p *LogicalSelection) ExtractFD() *fd.FDSet {
// join's schema will miss t2.a while join.full schema has. since selection
// itself doesn't contain schema, extracting schema should tell them apart.
var columns []*expression.Column
if join, ok := p.children[0].(*LogicalJoin); ok {
if join, ok := p.children[0].(*LogicalJoin); ok && join.fullSchema != nil {
columns = join.fullSchema.Columns
} else {
columns = p.Schema().Columns
Expand All @@ -984,19 +1014,6 @@ func (p *LogicalSelection) ExtractFD() *fd.FDSet {
// extract equivalence cols.
equivUniqueIDs := extractEquivalenceCols(p.Conditions, p.SCtx(), fds)

// after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence
// cols pointing to outer equivalence cols. eg: t left join t1 on t.a = t1.b, leading a
// lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all
// left rows, once there is a null-refusing predicate on the inner side on upper layer, this
// can be equivalence again. (the outer rows left are all coming from equal matching)
//
// why not just makeNotNull of them, because even a non-equiv-related inner col can also
// refuse supplied null values.
if fds.Rule333Equiv.InnerCols.Len() != 0 && notnullColsUniqueIDs.Intersects(fds.Rule333Equiv.InnerCols) {
// restore/re-strength FDs from rule 333
fds.MakeRestoreRule333()
}

// apply operator's characteristic's FD setting.
fds.MakeNotNull(notnullColsUniqueIDs)
fds.AddConstants(constUniqueIDs)
Expand Down Expand Up @@ -1059,11 +1076,26 @@ func (la *LogicalApply) ExtractFD() *fd.FDSet {
}
}
}
// select (select t1.a from t1 where t1.rid = t2.id), count(t2.b) from t2 group by (t2.id)
// for correlated scalar sub-query, the whole sub-query will be projected as a new column for example here.
// while for every same t2.id, this sub-query's scalar output must be the same, actually it's a kind of strict FD here.
applyStrictDetermine := fd.NewFastIntSet()
applyStrictDependency := fd.NewFastIntSet()
if innerPlan.Schema().Len() == 1 && len(deduplicateCorrelatedCols) > 0 {
// single column in apply join inner side will be output directly.
for _, cc := range deduplicateCorrelatedCols {
applyStrictDetermine.Insert(int(cc.UniqueID))
}
applyStrictDependency.Insert(int(innerPlan.Schema().Columns[0].UniqueID))
}

switch la.JoinType {
case InnerJoin:
return la.extractFDForInnerJoin(eqCond)
case LeftOuterJoin, RightOuterJoin:
return la.extractFDForOuterJoin(eqCond)
fds := la.extractFDForOuterJoin(eqCond)
fds.AddStrictFunctionalDependency(applyStrictDetermine, applyStrictDependency)
return fds
case SemiJoin:
return la.extractFDForSemiJoin(eqCond)
default:
Expand Down
10 changes: 8 additions & 2 deletions planner/core/plan.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,8 @@ type baseLogicalPlan struct {
// including eliminating unnecessary DISTINCT operators, simplifying ORDER BY columns,
// removing Max1Row operators, and mapping semi-joins to inner-joins.
// for now, it's hard to maintain in individual operator, build it from bottom up when using.
fdSet *fd.FDSet
fdSet *fd.FDSet
FDChecked bool
}

// ExtractFD return the children[0]'s fdSet if there are no adding/removing fd in this logic plan.
Expand All @@ -386,8 +387,13 @@ func (p *baseLogicalPlan) ExtractFD() *fd.FDSet {
return p.fdSet
}
fds := &fd.FDSet{HashCodeToUniqueID: make(map[string]int)}
// isolation between different logical query blocks.
acrossBlock := false
Comment thread
winoros marked this conversation as resolved.
for _, ch := range p.children {
fds.AddFrom(ch.ExtractFD())
if p.SelectBlockOffset() != ch.SelectBlockOffset() {
acrossBlock = true
}
fds.AddFrom(ch.ExtractFD(), acrossBlock)
}
return fds
}
Expand Down
151 changes: 151 additions & 0 deletions planner/funcdep/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,154 @@ package funcdep
// https://cs.uwaterloo.ca/research/tr/2000/11/CS-2000-11.thesis.pdf

// TODO: Add the RFC design.

// NOTE 1.
// when handling a Lax FD, we don't care about null values in the dependency, which means
// that making the determinant columns not-null is enough to turn a Lax FD into a strict one.

// The definition of "lax" used in the paper differs from the definition used by this
// library. For a lax dependency A~~>B, the paper allows this set of rows:
//
// a b
// -------
// 1 1
// 1 NULL
//
// This alternate definition is briefly covered in section 2.5.3.2 of the paper (see definition
// 2.19). The reason for this change is to allow a lax dependency to be upgraded to a strict
// dependency more readily, needing only the determinant columns to be not-null rather than
// both determinant and dependant columns.
//
// This is on the condition that, for a given definite value of the determinant of a Lax FD, there
// won't be two different definite dependant values. That's true, because there is no way to derive
// this kind of FD.
//
// Even in our implementation of outer join, the only way to produce duplicate definite
// determinant is the join predicate. But for now, we only maintain the equivalence and
// some strict FD of it.
//
// t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1
// a b | c d e
// ------+----------------
// 1 1 | 1 NULL 1
// 1 2 | NULL NULL NULL
// 2 1 | NULL NULL NULL
//
// Actually it's possible: the lax FD {a} -> {c} can be derived, but it is not that useful. We only
// maintain {c} ~> {a} for existence after the outer join. Besides, there are two Cond-FDs that should
// be preserved, waiting to become visible again once a null-reject predicate is applied to the
// null-constraint columns. (see below)
//
// NOTE 2.
// When handling an outer join, it won't produce a lax FD with duplicate definite determinant values and
// different dependency values.
//
// In the implementation, we come across some lax FDs that depend on the null-reject of some other cols. For
// example.
// t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1
// a b | c d e
// ------+----------------
// 1 1 | 1 NULL 1
// 1 2 | NULL NULL NULL
// 2 1 | NULL NULL NULL
//
// Here the constant FD {} -> {b} no longer exists after the outer join is done. Notice the null-constraint
// {c,d,e} -| {c,d,e}: this FD should be preserved and will become visible again when some null-reject
// predicate takes effect on the null-constraint cols.
//
// It's the same for the strict equivalence {t.a} = {t1.c}. Notice there is no lax equivalence here, because
// the left side can't be guaranteed to be definite or null, like a=2 here. Let's collect all of these
// on-condition FDs, each associated with a null-constraint column set, and name them Cond-FD.
//
// Lax equivalences are theoretically possible, but one won't be constructed from an outer join unless
// t already has a constant FD on column `a` before the outer join runs. So the lax equivalence
// has some pre-conditions as you see, and it can't cover the case shown above. Let us handle it the way a
// Cond-FD does.
//
// The FDs constructed from the join predicate should be considered Cond-FDs. Here these are the equivalence
// {a} == {c} and the constant FD {b} = 1 (if the join condition were e=1, it would be included too). We can say that for
// every matched row these FDs are valid, while for the other rows the inner side is supplied with null
// rows. So these FDs are stored as ncEdges whose nc condition is all the inner table cols.
//
// We introduced the invisible FD with null-constraint columns, named Cond-FD, to solve the problem above.
// For multiple embedded left joins, we take the following case as an example.
// a,b c,d,e
// -----------+-----------
// 1 2 | 1 1 1
// 2 2 |
// -----------+-----------
//
// left join on (a=c) res:
// a b c d e
// -------------------------
// 1 2 1 1 1
// 2 2 +- null null null -+
// | |
// +-------------------+
// \
// \
// the Cond-FD are < a=c with {c,d,e} > the latter is as null constraint cols
//
// e,f
// -----------------------
// 1 2
// 2 2
// 3 3
// -----------------------
//
// left join on (e=a) res:
// e f a b c d e
// -----------------------------------
// 1 2 1 2 1 1 1
// 2 2 2 2 +- null null null --+---------------> Cond-FD are <a=c with {c,d,e}> still exists.
// 3 3 +-null null | null null null |---+
// | +-------------------+ |
// +-----------------------------------+-----------> New Cond-FD are <e=a with {a,b,c,d,e}> occurs.
//
//
// The old Cond-FD with null-constraint column set {c,d,e} is preserved because the newly appended cols are all null too.
// The new Cond-FD with null-constraint column set {a,b,c,d,e} is also meaningful: even if the null-reject column
// is one of {c,d,e}, which may filter one of the matched rows out of the result, the equivalence {a}={e} still exists.
//
// Suppose that the result of the first left join is like:
// left join on (a=c) res:
// a b c d e
// ---------------------------
// 1 2 1 1 1
// null 2 null null null
//
// THEN: left join on (e=a) res:
// e f a b c d e
// ---------------------------------
// 1 2 1 2 1 1 1
// 2 2 null null null null null
// 3 3 null null null null null

@AilinKid AilinKid Apr 20, 2022

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

L134 should be
3 3 null null null null null

//
// Even so, both the old Cond-FD and the new Cond-FD still exist. It seems that the null-constraint column set of the
// old Cond-FD {c,d,e} could be expanded to {a,b,c,d,e} visually, but we can't derive that from the join predicate
// (e=a). The null-reject of column `a` can't bring visibility to the old Cond-FD theoretically; it just happens
// to refuse that row with a null value in column a.
//
// Think about adding one more row in first left join result.
//
// left join on (a=c) res:
// a b c d e
// ---------------------------
// 1 2 1 1 1
// null 2 null null null
// 3 3 null null null
//
// THEN: left join on (e=a) res:
// e f a b c d e
// ---------------------------------
// 1 2 1 2 1 1 1
// 2 2 null null null null null
// 3 3 3 3 null null null
//
// Conclusion:
// As you see, we can't use the join predicate (e=a) to expand the old Cond-FD's nc
// {c,d,e} to {a,b,c,d,e}. So the rule for Cond-FD is quite simple: just keep the old ncEdges from the right side and append
// the new ncEdges of the current left join.
//
// If the first left join result is on the outer side of the second left join, just keep the ncEdges from the left as well,
// appending the new ncEdges of the current left join.
3 changes: 0 additions & 3 deletions planner/funcdep/extract_fd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,6 @@ func TestFDSet_MakeOuterJoin(t *testing.T) {
ctx := context.TODO()
is := testGetIS(ass, tk.Session())
for i, tt := range tests {
if i == 0 {
fmt.Println(1)
}
comment := fmt.Sprintf("case:%v sql:%s", i, tt.sql)
stmt, err := par.ParseOneStmt(tt.sql, "", "")
ass.Nil(err, comment)
Expand Down
Loading