Skip to content

[feat] Support mapping of multi-cycle operations with three strategies (exclusive, distributed, inclusive) #49

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,10 @@ jobs:
sh run.sh
sh verify.sh

- name: Test multi-cyle mapping
working-directory: ${{github.workspace}}/test/multicycle
run: |
sh compile.sh
sh run.sh
sh verify.sh

12 changes: 11 additions & 1 deletion src/CGRA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@ using json = nlohmann::json;
CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
bool t_heterogeneity, bool t_parameterizableCGRA,
map<string, list<int>*>* t_additionalFunc,
bool t_supportDVFS, int t_DVFSIslandDim) {
bool t_supportDVFS, int t_DVFSIslandDim, bool enableMultipleOps) {
m_rows = t_rows;
m_columns = t_columns;
m_FUCount = t_rows * t_columns;
m_supportDVFS = t_supportDVFS;
m_DVFSIslandDim = t_DVFSIslandDim;
m_supportInclusive = enableMultipleOps;
m_supportComplex = new list<string>();
m_supportCall = new list<string>();
nodes = new CGRANode**[t_rows];
Expand Down Expand Up @@ -53,6 +54,9 @@ CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
nodes[i] = new CGRANode*[t_columns];
for (int j=0; j<t_columns; ++j) {
nodes[i][j] = new CGRANode(node_id, j, i);
if (!enableMultipleOps) {
nodes[i][j]->disableMultipleOps();
}
// nodes[i][j]->disableAllFUs();
id2Node[node_id] = nodes[i][j];
node_id += 1;
Expand Down Expand Up @@ -118,6 +122,9 @@ CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
nodes[i] = new CGRANode*[t_columns];
for (int j=0; j<t_columns; ++j) {
nodes[i][j] = new CGRANode(node_id++, j, i);
if (!enableMultipleOps) {
nodes[i][j]->disableMultipleOps();
}
}
}

Expand Down Expand Up @@ -394,3 +401,6 @@ void CGRA::syncDVFSIsland(CGRANode* t_node) {
}
}

bool CGRA::getSupportInclusive() {
return m_supportInclusive;
}
4 changes: 3 additions & 1 deletion src/CGRA.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ class CGRA {
int m_rows;
int m_columns;
bool m_supportDVFS;
bool m_supportInclusive;
int m_DVFSIslandDim;
map<int, vector<CGRANode*>> m_DVFSIslands;
list<string>* m_supportComplex;
list<string>* m_supportCall;
void disableSpecificConnections();

public:
CGRA(int, int, bool, bool, bool, map<string, list<int>*>*, bool, int);
CGRA(int, int, bool, bool, bool, map<string, list<int>*>*, bool, int, bool=true);
CGRANode ***nodes;
CGRALink **links;
int getFUCount();
Expand All @@ -49,5 +50,6 @@ class CGRA {
void syncDVFSIsland(CGRANode*);
list<string>* getSupportComplex();
list<string>* getSupportCall();
bool getSupportInclusive();
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

m_supportInclusive sounds similar to m_canMultipleOps? How are they related to each other?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

m_supportInclusive describes the execution strategy of the whole CGRA, while m_canMultipleOps indicates whether a specific tile can perform inclusive execution. In the current design the two are equivalent, since we don't consider heterogeneous support for inclusive execution.

};

38 changes: 38 additions & 0 deletions src/CGRANode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ CGRANode::CGRANode(int t_id, int t_x, int t_y) {
m_mapped = false;
m_DVFSLatencyMultiple = 1;
m_synced = false;

// Indicates whether this CGRA node can execute multiple operations
// simultaneously. (e.g., single-cycle overlaps with multi-cycle)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment means exactly "inclusive", right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

m_canMultipleOps = true;
}

// FIXME: should handle the case that the data is maintained in the registers
Expand Down Expand Up @@ -262,6 +266,7 @@ bool CGRANode::canSupport(DFGNode* t_opt) {
return true;
}

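// What are t_cycle and t_II? (Presumably the candidate start cycle and the
// initiation interval: the checks below visit t_cycle % t_II and then step
// by t_II up to m_cycleBoundary. TODO: confirm and document.)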
bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) {
if (m_disabled)
return false;
Expand Down Expand Up @@ -293,6 +298,9 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) {
if (not t_opt->isMultiCycleExec(getDVFSLatencyMultiple())) {
// Single-cycle opt:
for (int cycle=t_cycle%t_II; cycle<m_cycleBoundary; cycle+=t_II) {
if (!canMultipleOps() && !m_dfgNodesWithOccupyStatus[cycle]->empty()) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is the key implementation about "inclusive", right? Then, plz add comment. And please mention why we don't need to specify "exclusive" and "distributed".

return false;
}
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) {
if (p.second != IN_PIPE_OCCUPY) {
return false;
Expand All @@ -302,6 +310,19 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) {
} else {
// Multi-cycle opt.
for (int cycle=t_cycle%t_II; cycle<m_cycleBoundary; cycle+=t_II) {
// Can not support simultaneous execution of multiple operations.
if (!canMultipleOps()) {
int exec_latency = t_opt->getExecLatency(getDVFSLatencyMultiple());
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why we need getDVFSLatencyMultiple()? I don't quite understand what we are trying to handle here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I am not familiar with the DVFS-related code... I just saw that other places obtain the latency in a similar way, e.g. t_opt->isMultiCycleExec(getDVFSLatencyMultiple()) (CGRANode.cpp:298), so I followed that pattern.

for (int duration=0; duration < exec_latency; duration++) {
if (cycle + duration >= m_cycleBoundary) {
break;
}
if (!m_dfgNodesWithOccupyStatus[cycle+duration]->empty()) {
return false;
}
}
}
else {
// Check start cycle.
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) {
// Cannot occupy/overlap by/with other operation if DVFS is enabled.
Expand Down Expand Up @@ -348,6 +369,7 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) {
}
}
}
}
}

return true;
Expand Down Expand Up @@ -427,9 +449,16 @@ void CGRANode::setDFGNode(DFGNode* t_opt, int t_cycle, int t_II,
if (not t_opt->isMultiCycleExec(getDVFSLatencyMultiple())) {
m_dfgNodesWithOccupyStatus[cycle]->push_back(make_pair(t_opt, SINGLE_OCCUPY));
} else {
// if (t_opt->getID() == 34 && getID() == 4 ){
// cout << "now here " << t_opt->getExecLatency(getDVFSLatencyMultiple()) << "\n";
// }
m_dfgNodesWithOccupyStatus[cycle]->push_back(make_pair(t_opt, START_PIPE_OCCUPY));
for (int i=1; i<t_opt->getExecLatency(getDVFSLatencyMultiple())-1; ++i) {
// cout << "opt latency" << t_opt->getExecLatency(getDVFSLatencyMultiple()) << "\n";
// cout << "now cycle:" << cycle+i << " " << m_cycleBoundary << "\n";
if (cycle+i < m_cycleBoundary) {
// if (cycle + i < 32)
// cout << "now cycle: " << cycle+i << " " << t_opt->getID() << "\n";
m_dfgNodesWithOccupyStatus[cycle+i]->push_back(make_pair(t_opt, IN_PIPE_OCCUPY));
}
}
Expand Down Expand Up @@ -640,6 +669,11 @@ void CGRANode::enableDiv() {
m_canDiv = true;
}

// Marks this CGRA node as unable to execute multiple operations
// simultaneously (i.e., no overlap of a single-cycle operation with a
// multi-cycle one). Emits a one-line notice on stdout each time it is called.
void CGRANode::disableMultipleOps() {
  m_canMultipleOps = false;
  fputs("disabling multiple ops\n", stdout);
}

bool CGRANode::supportComplex(string type) {
if (type == "") return m_supportComplex;
for (string t: m_supportComplexType) {
Expand Down Expand Up @@ -713,6 +747,10 @@ bool CGRANode::canDiv() {
return m_canDiv;
}

bool CGRANode::canMultipleOps() {
return m_canMultipleOps;
}

int CGRANode::getX() {
return m_x;
}
Expand Down
4 changes: 4 additions & 0 deletions src/CGRANode.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class CGRANode {
bool m_mapped;
bool m_synced;

bool m_canMultipleOps;

public:
CGRANode(int, int, int);
// CGRANode(int, int, int, int, int);
Expand Down Expand Up @@ -113,6 +115,7 @@ class CGRANode {
void enableLogic();
void enableBr();
void enableDiv();
void disableMultipleOps();

void attachInLink(CGRALink*);
void attachOutLink(CGRALink*);
Expand Down Expand Up @@ -153,6 +156,7 @@ class CGRANode {
bool canLogic();
bool canBr();
bool canDiv();
bool canMultipleOps();
DFGNode* getMappedDFGNode(int);
bool containMappedDFGNode(DFGNode*, int);
void allocateReg(CGRALink*, int, int, int);
Expand Down
112 changes: 97 additions & 15 deletions src/DFG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ DFG::DFG(Function& t_F, list<Loop*>* t_loops, bool t_targetFunction,
bool t_precisionAware, bool t_heterogeneity,
map<string, int>* t_execLatency, list<string>* t_pipelinedOpt,
bool t_supportDVFS, bool t_DVFSAwareMapping,
int t_vectorFactorForIdiv) {
int t_vectorFactorForIdiv, bool enableDistributed) {
m_num = 0;
m_targetFunction = t_targetFunction;
m_targetLoops = t_loops;
Expand All @@ -37,6 +37,54 @@ DFG::DFG(Function& t_F, list<Loop*>* t_loops, bool t_targetFunction,
}
initExecLatency(t_execLatency);
initPipelinedOpt(t_pipelinedOpt);
if (enableDistributed) {
splitNodes();
}
calculateCycles();
}

void DFG::splitNodes() {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the new split node name? Can you demonstrate a simple example as the comment here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's for splitting multi-cycle nodes in the DFG into multiple single-cycle nodes when distributed strategy is adopted.
Example: Division takes 8 cycles on our hardware, so each division node in the DFG should be split into 8 sub-nodes, each of which only needs to perform one cycle of division execution.

list<DFGNode*>* add_nodes = new list<DFGNode*>();
int dfgNodeID = nodes.size();
for (DFGNode* dfgNode: nodes) {
int ExecLatency = dfgNode->getExecLatency(1);
if (ExecLatency == 1) continue;
dfgNode->setExecLatency(1);
int dfgNodeID = nodes.size();
DFGNode* nowNode = dfgNode;
DFGNode* stNode;
for (int i = 1; i < ExecLatency; i++) {
DFGNode* newNode = new DFGNode(dfgNodeID++, dfgNode);
int dfgEdgeID = m_DFGEdges.size();
DFGEdge* newEdge = new DFGEdge(dfgEdgeID++, nowNode, newNode);
newNode->setExecLatency(1);
m_DFGEdges.push_back(newEdge);
// nodes.push_back(newNode);
add_nodes->push_back(newNode);
// Update the pred and succ nodes of nods.
newNode->deleteAllPredNodes();
newNode->deleteAllSuccNodes();
nowNode->addSuccNode(newNode);
newNode->addPredNode(nowNode);
nowNode = newNode;
if (i == 1) stNode = nowNode;
}
// change the successors of dfgNode to nowNode;
for (DFGNode* succNode: *(dfgNode->getSuccNodes())) {
if (succNode == stNode) continue;
replaceDFGEdge(dfgNode, succNode, nowNode, succNode);
// dfgNode->deleteSuccNode(succNode);
nowNode->addSuccNode(succNode);
succNode->deletePredNode(dfgNode);
succNode->addPredNode(nowNode);
}
dfgNode->deleteAllSuccNodes();
dfgNode->addSuccNode(stNode);
}

for (DFGNode* dfgNode: *add_nodes) {
nodes.push_back(dfgNode);
}
}

// Pre-assigns the DVFS levels to each DFG node.
Expand Down Expand Up @@ -481,13 +529,13 @@ void DFG::combineAddAdd(string type) {
for (DFGNode* succNode: *(tailNode->getSuccNodes())) {
if (succNode->isOpt(t_opt) and !succNode->hasCombined()) {
// Indicate the pattern is finally found and matched
if (i == (patternSize-1) and dfgNode->isSuccessorOf(succNode)){
if (i == (patternSize-1) and dfgNode->isSuccessorOf(succNode)){
toBeMatchedDFGNodes->push_back(succNode);
for(DFGNode* optNode: *toBeMatchedDFGNodes){
if(optNode != dfgNode){
dfgNode ->addPatternPartner(optNode);
}
optNode->setCombine();
optNode->setCombine();
}
break;
} else if(i == (patternSize-1) and !dfgNode->isSuccessorOf(succNode)){
Expand Down Expand Up @@ -980,11 +1028,22 @@ void DFG::combineAddAdd(string type) {
targetOpt.insert(iter->first);
}
for (DFGNode* node: nodes) {
if (t_execLatency->find(node->getOpcodeName()) != t_execLatency->end()) {
string opcodeName = node->getOpcodeName();
node->setExecLatency((*t_execLatency)[opcodeName]);
targetOpt.erase(opcodeName);
}
if (!node->hasCombined()) {
if (t_execLatency->find(node->getOpcodeName()) != t_execLatency->end()) {
string opcodeName = node->getOpcodeName();
node->setExecLatency((*t_execLatency)[opcodeName]);
targetOpt.erase(opcodeName);
}
}
else {
// Initialize the execution latency of fused patterns.
// If a division and an addition are fused and the pattern is called DIVADD,
// and we determine its latency to be 2 according to the timing results, then
// we can set "DIVADD": 2 in the optLatency of param.json.
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why we need MAC's optLatency as 2? MAC is normally done in single-cycle. And what if user decide different latency, e.g., 1/2/3/4/5?

Copy link
Collaborator Author

@HobbitQia HobbitQia May 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I just take "MAC" as an example. To avoid ambiguity, I changed the comments as below:
If a division and an addition are fused and the pattern is called DIVADD, and we determine its latency to be 2 according to the timing results, then we can set "DIVADD": 2 in the optLatency of param.json.
How about this?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SGTM.

if (t_execLatency->find(node->getComplexType()) != t_execLatency->end()) {
string opcodeName = node->getComplexType();
node->setExecLatency((*t_execLatency)[opcodeName]);
targetOpt.erase(opcodeName);
}
}
}
if (!targetOpt.empty()) {
cout<<"\033[0;31mPlease check the operations targeting multi-cycle execution in <param.json>:\"\033[0m";
Expand All @@ -1001,13 +1060,26 @@ void DFG::combineAddAdd(string type) {
targetOpt.insert(opt);
}
for (DFGNode* node: nodes) {
list<string>::iterator it;
it = find(t_pipelinedOpt->begin(), t_pipelinedOpt->end(), node->getOpcodeName());
if(it != t_pipelinedOpt->end()) {
string opcodeName = node->getOpcodeName();
node->setPipelinable();
targetOpt.erase(opcodeName);
}
if (!node->hasCombined()) {
list<string>::iterator it;
it = find(t_pipelinedOpt->begin(), t_pipelinedOpt->end(), node->getOpcodeName());
if(it != t_pipelinedOpt->end()) {
string opcodeName = node->getOpcodeName();
node->setPipelinable();
targetOpt.erase(opcodeName);
}
}
else {
// Initialize the pipelinable ability of fused patterns.
// Similar to initExecLatency()
list<string>::iterator it;
it = find(t_pipelinedOpt->begin(), t_pipelinedOpt->end(), node->getComplexType());
if(it != t_pipelinedOpt->end()) {
string opcodeName = node->getComplexType();
node->setPipelinable();
targetOpt.erase(opcodeName);
}
}
}
if (!targetOpt.empty()) {
cout<<"\033[0;31mPlease check the pipelinable operations in <param.json>:\"\033[0m";
Expand Down Expand Up @@ -1687,6 +1759,16 @@ void DFG::combineAddAdd(string type) {
return false;
}

// used for initializing II when exclusive strategy
int DFG::getMaxExecLantecy() {
int max_exec_latency = 0;
for (DFGNode* dfgNode: nodes) {
int exec_latecy = dfgNode->getExecLatency(1);
if (exec_latecy > max_exec_latency) max_exec_latency = exec_latecy;
}
return max_exec_latency;
}

// TODO: This is necessary for inter-iteration data dependency
// checking (ld/st dependency analysis on base address).
void DFG::detectMemDataDependency() {
Expand Down
4 changes: 3 additions & 1 deletion src/DFG.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,19 @@ class DFG {
// target nonlinear ops
void nonlinear_combine();
void ctrl_combine();
void splitNodes();

public:
DFG(Function&, list<Loop*>*, bool, bool, bool, map<string, int>*,
list<string>*, bool, bool, int t_vectorFactorForIdiv=4);
list<string>*, bool, bool, int t_vectorFactorForIdiv=4, bool enableDistributed=false);
list<list<DFGNode*>*>* m_cycleNodeLists;
//initial ordering of insts
list<DFGNode*> nodes;

list<DFGNode*>* getBFSOrderedNodes();
list<DFGNode*>* getDFSOrderedNodes();
int getNodeCount();
int getMaxExecLantecy();
void construct(Function&);
void setupCycles();
list<list<DFGEdge*>*>* calculateCycles();
Expand Down
Loading