-
Notifications
You must be signed in to change notification settings - Fork 12
[feat] Support mapping of multi-cycle operations with three strategies (exclusive, distributed, inclusive) #49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
ebb118f
fa08582
f536d42
45e8b25
3969cbc
e7b5959
76cf3c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,6 +64,10 @@ CGRANode::CGRANode(int t_id, int t_x, int t_y) { | |
m_mapped = false; | ||
m_DVFSLatencyMultiple = 1; | ||
m_synced = false; | ||
|
||
// Indicates whether this CGRA node can execute multiple operations | ||
// simultaneously. (e.g., single-cycle overlaps with multi-cycle) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comment means exactly "inclusive", right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes |
||
m_canMultipleOps = true; | ||
} | ||
|
||
// FIXME: should handle the case that the data is maintained in the registers | ||
|
@@ -262,6 +266,7 @@ bool CGRANode::canSupport(DFGNode* t_opt) { | |
return true; | ||
} | ||
|
||
// What are t_cycle and t_II? | ||
HobbitQia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) { | ||
if (m_disabled) | ||
return false; | ||
|
@@ -293,6 +298,9 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) { | |
if (not t_opt->isMultiCycleExec(getDVFSLatencyMultiple())) { | ||
// Single-cycle opt: | ||
for (int cycle=t_cycle%t_II; cycle<m_cycleBoundary; cycle+=t_II) { | ||
if (!canMultipleOps() && !m_dfgNodesWithOccupyStatus[cycle]->empty()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here is the key implementation of "inclusive", right? Then, please add a comment. And please mention why we don't need to specify "exclusive" and "distributed". |
||
return false; | ||
} | ||
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) { | ||
if (p.second != IN_PIPE_OCCUPY) { | ||
return false; | ||
|
@@ -302,6 +310,19 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) { | |
} else { | ||
// Multi-cycle opt. | ||
for (int cycle=t_cycle%t_II; cycle<m_cycleBoundary; cycle+=t_II) { | ||
// Can not support simultaneous execution of multiple operations. | ||
if (!canMultipleOps()) { | ||
int exec_latency = t_opt->getExecLatency(getDVFSLatencyMultiple()); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why we need There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, I am not familiar with the DVFS-related code... I just saw that other places get the latency in a similar way. |
||
for (int duration=0; duration < exec_latency; duration++) { | ||
if (cycle + duration >= m_cycleBoundary) { | ||
break; | ||
} | ||
if (!m_dfgNodesWithOccupyStatus[cycle+duration]->empty()) { | ||
return false; | ||
} | ||
} | ||
} | ||
else { | ||
// Check start cycle. | ||
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) { | ||
// Cannot occupy/overlap by/with other operation if DVFS is enabled. | ||
|
@@ -348,6 +369,7 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) { | |
} | ||
} | ||
} | ||
} | ||
HobbitQia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
return true; | ||
|
@@ -427,9 +449,16 @@ void CGRANode::setDFGNode(DFGNode* t_opt, int t_cycle, int t_II, | |
if (not t_opt->isMultiCycleExec(getDVFSLatencyMultiple())) { | ||
m_dfgNodesWithOccupyStatus[cycle]->push_back(make_pair(t_opt, SINGLE_OCCUPY)); | ||
} else { | ||
// if (t_opt->getID() == 34 && getID() == 4 ){ | ||
HobbitQia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// cout << "now here " << t_opt->getExecLatency(getDVFSLatencyMultiple()) << "\n"; | ||
// } | ||
m_dfgNodesWithOccupyStatus[cycle]->push_back(make_pair(t_opt, START_PIPE_OCCUPY)); | ||
for (int i=1; i<t_opt->getExecLatency(getDVFSLatencyMultiple())-1; ++i) { | ||
// cout << "opt latency" << t_opt->getExecLatency(getDVFSLatencyMultiple()) << "\n"; | ||
// cout << "now cycle:" << cycle+i << " " << m_cycleBoundary << "\n"; | ||
if (cycle+i < m_cycleBoundary) { | ||
// if (cycle + i < 32) | ||
// cout << "now cycle: " << cycle+i << " " << t_opt->getID() << "\n"; | ||
m_dfgNodesWithOccupyStatus[cycle+i]->push_back(make_pair(t_opt, IN_PIPE_OCCUPY)); | ||
} | ||
} | ||
|
@@ -640,6 +669,11 @@ void CGRANode::enableDiv() { | |
m_canDiv = true; | ||
} | ||
|
||
// Marks this CGRA node as unable to execute multiple operations at the same
// time: clears m_canMultipleOps and prints a notice to stdout.
void CGRANode::disableMultipleOps() {
  m_canMultipleOps = false;
  printf("disabling multiple ops\n");
}
|
||
bool CGRANode::supportComplex(string type) { | ||
if (type == "") return m_supportComplex; | ||
for (string t: m_supportComplexType) { | ||
|
@@ -713,6 +747,10 @@ bool CGRANode::canDiv() { | |
return m_canDiv; | ||
} | ||
|
||
// Returns m_canMultipleOps, i.e., whether this CGRA node can execute multiple | ||
// operations simultaneously (e.g., a single-cycle op overlapping a multi-cycle op). | ||
bool CGRANode::canMultipleOps() { | ||
return m_canMultipleOps; | ||
} | ||
|
||
int CGRANode::getX() { | ||
return m_x; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,7 @@ DFG::DFG(Function& t_F, list<Loop*>* t_loops, bool t_targetFunction, | |
bool t_precisionAware, bool t_heterogeneity, | ||
map<string, int>* t_execLatency, list<string>* t_pipelinedOpt, | ||
bool t_supportDVFS, bool t_DVFSAwareMapping, | ||
int t_vectorFactorForIdiv) { | ||
int t_vectorFactorForIdiv, bool enableDistributed) { | ||
m_num = 0; | ||
m_targetFunction = t_targetFunction; | ||
m_targetLoops = t_loops; | ||
|
@@ -37,6 +37,54 @@ DFG::DFG(Function& t_F, list<Loop*>* t_loops, bool t_targetFunction, | |
} | ||
initExecLatency(t_execLatency); | ||
initPipelinedOpt(t_pipelinedOpt); | ||
if (enableDistributed) { | ||
splitNodes(); | ||
} | ||
calculateCycles(); | ||
} | ||
|
||
void DFG::splitNodes() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the new split node name? Can you demonstrate a simple example as the comment here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's for splitting multi-cycle nodes in the DFG into multiple single-cycle nodes when distributed strategy is adopted. |
||
list<DFGNode*>* add_nodes = new list<DFGNode*>(); | ||
int dfgNodeID = nodes.size(); | ||
for (DFGNode* dfgNode: nodes) { | ||
int ExecLatency = dfgNode->getExecLatency(1); | ||
if (ExecLatency == 1) continue; | ||
dfgNode->setExecLatency(1); | ||
int dfgNodeID = nodes.size(); | ||
DFGNode* nowNode = dfgNode; | ||
DFGNode* stNode; | ||
for (int i = 1; i < ExecLatency; i++) { | ||
DFGNode* newNode = new DFGNode(dfgNodeID++, dfgNode); | ||
int dfgEdgeID = m_DFGEdges.size(); | ||
DFGEdge* newEdge = new DFGEdge(dfgEdgeID++, nowNode, newNode); | ||
newNode->setExecLatency(1); | ||
m_DFGEdges.push_back(newEdge); | ||
// nodes.push_back(newNode); | ||
add_nodes->push_back(newNode); | ||
// Update the pred and succ nodes of nods. | ||
newNode->deleteAllPredNodes(); | ||
newNode->deleteAllSuccNodes(); | ||
nowNode->addSuccNode(newNode); | ||
newNode->addPredNode(nowNode); | ||
nowNode = newNode; | ||
if (i == 1) stNode = nowNode; | ||
} | ||
// change the successors of dfgNode to nowNode; | ||
for (DFGNode* succNode: *(dfgNode->getSuccNodes())) { | ||
if (succNode == stNode) continue; | ||
replaceDFGEdge(dfgNode, succNode, nowNode, succNode); | ||
// dfgNode->deleteSuccNode(succNode); | ||
nowNode->addSuccNode(succNode); | ||
succNode->deletePredNode(dfgNode); | ||
succNode->addPredNode(nowNode); | ||
} | ||
dfgNode->deleteAllSuccNodes(); | ||
dfgNode->addSuccNode(stNode); | ||
} | ||
|
||
for (DFGNode* dfgNode: *add_nodes) { | ||
nodes.push_back(dfgNode); | ||
} | ||
} | ||
|
||
// Pre-assigns the DVFS levels to each DFG node. | ||
|
@@ -481,13 +529,13 @@ void DFG::combineAddAdd(string type) { | |
for (DFGNode* succNode: *(tailNode->getSuccNodes())) { | ||
if (succNode->isOpt(t_opt) and !succNode->hasCombined()) { | ||
// Indicate the pattern is finally found and matched | ||
if (i == (patternSize-1) and dfgNode->isSuccessorOf(succNode)){ | ||
if (i == (patternSize-1) and dfgNode->isSuccessorOf(succNode)){ | ||
toBeMatchedDFGNodes->push_back(succNode); | ||
for(DFGNode* optNode: *toBeMatchedDFGNodes){ | ||
if(optNode != dfgNode){ | ||
dfgNode ->addPatternPartner(optNode); | ||
} | ||
optNode->setCombine(); | ||
optNode->setCombine(); | ||
} | ||
break; | ||
} else if(i == (patternSize-1) and !dfgNode->isSuccessorOf(succNode)){ | ||
|
@@ -980,11 +1028,22 @@ void DFG::combineAddAdd(string type) { | |
targetOpt.insert(iter->first); | ||
} | ||
for (DFGNode* node: nodes) { | ||
if (t_execLatency->find(node->getOpcodeName()) != t_execLatency->end()) { | ||
string opcodeName = node->getOpcodeName(); | ||
node->setExecLatency((*t_execLatency)[opcodeName]); | ||
targetOpt.erase(opcodeName); | ||
} | ||
if (!node->hasCombined()) { | ||
if (t_execLatency->find(node->getOpcodeName()) != t_execLatency->end()) { | ||
string opcodeName = node->getOpcodeName(); | ||
node->setExecLatency((*t_execLatency)[opcodeName]); | ||
targetOpt.erase(opcodeName); | ||
} | ||
} | ||
else { | ||
HobbitQia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Initialize the execution latency of fused patterns. | ||
HobbitQia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// If a multiplication and an addition are fused and the pattern is called MAC, then we can set "MAC": 2 in the optLatency of param.json. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why we need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here I just take "MAC" as an example. To avoid ambiguity, I changed the comments as below: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SGTM. |
||
if (t_execLatency->find(node->getComplexType()) != t_execLatency->end()) { | ||
string opcodeName = node->getComplexType(); | ||
node->setExecLatency((*t_execLatency)[opcodeName]); | ||
targetOpt.erase(opcodeName); | ||
} | ||
} | ||
} | ||
if (!targetOpt.empty()) { | ||
cout<<"\033[0;31mPlease check the operations targeting multi-cycle execution in <param.json>:\"\033[0m"; | ||
|
@@ -1001,13 +1060,26 @@ void DFG::combineAddAdd(string type) { | |
targetOpt.insert(opt); | ||
} | ||
for (DFGNode* node: nodes) { | ||
list<string>::iterator it; | ||
it = find(t_pipelinedOpt->begin(), t_pipelinedOpt->end(), node->getOpcodeName()); | ||
if(it != t_pipelinedOpt->end()) { | ||
string opcodeName = node->getOpcodeName(); | ||
node->setPipelinable(); | ||
targetOpt.erase(opcodeName); | ||
} | ||
if (!node->hasCombined()) { | ||
list<string>::iterator it; | ||
it = find(t_pipelinedOpt->begin(), t_pipelinedOpt->end(), node->getOpcodeName()); | ||
if(it != t_pipelinedOpt->end()) { | ||
string opcodeName = node->getOpcodeName(); | ||
node->setPipelinable(); | ||
targetOpt.erase(opcodeName); | ||
} | ||
} | ||
else { | ||
// Initialize the pipelinable ability of fused patterns. | ||
HobbitQia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Similar to initExecLatency() | ||
list<string>::iterator it; | ||
it = find(t_pipelinedOpt->begin(), t_pipelinedOpt->end(), node->getComplexType()); | ||
if(it != t_pipelinedOpt->end()) { | ||
string opcodeName = node->getComplexType(); | ||
node->setPipelinable(); | ||
targetOpt.erase(opcodeName); | ||
} | ||
} | ||
} | ||
if (!targetOpt.empty()) { | ||
cout<<"\033[0;31mPlease check the pipelinable operations in <param.json>:\"\033[0m"; | ||
|
@@ -1687,6 +1759,16 @@ void DFG::combineAddAdd(string type) { | |
return false; | ||
} | ||
|
||
// Used for initializing II when the exclusive strategy is adopted. | ||
HobbitQia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
int DFG::getMaxExecLantecy() { | ||
int max_exec_latency = 0; | ||
for (DFGNode* dfgNode: nodes) { | ||
int exec_latecy = dfgNode->getExecLatency(1); | ||
HobbitQia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (exec_latecy > max_exec_latency) max_exec_latency = exec_latecy; | ||
} | ||
return max_exec_latency; | ||
} | ||
|
||
// TODO: This is necessary for inter-iteration data dependency | ||
// checking (ld/st dependency analysis on base address). | ||
void DFG::detectMemDataDependency() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
m_supportInclusive
sounds similar to m_canMultipleOps
? How are they related to each other?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
m_supportInclusive
means the whole CGRA's execution strategy while m_canMultipleOps
means whether a specific tile can perform inclusive execution or not. In the current design these two are equivalent since we don't consider heterogeneous support for inclusive execution.