Skip to content

[feat] Support mapping of multi-cycle operations with three strategies (exclusive, distributed, inclusive) #49

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,10 @@ jobs:
sh verify.sh


- name: Test multi-cycle mapping
working-directory: ${{github.workspace}}/test/multicycle
run: |
sh compile.sh
sh run.sh
sh verify.sh

12 changes: 11 additions & 1 deletion src/CGRA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@ using json = nlohmann::json;
CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
list<string>* t_fusionStrategy, bool t_parameterizableCGRA,
map<string, list<int>*>* t_additionalFunc,
bool t_supportDVFS, int t_DVFSIslandDim) {
bool t_supportDVFS, int t_DVFSIslandDim, bool enableMultipleOps) {
m_rows = t_rows;
m_columns = t_columns;
m_FUCount = t_rows * t_columns;
m_supportDVFS = t_supportDVFS;
m_DVFSIslandDim = t_DVFSIslandDim;
m_supportInclusive = enableMultipleOps;
m_supportComplex = new list<string>();
m_supportCall = new list<string>();
nodes = new CGRANode**[t_rows];
Expand Down Expand Up @@ -53,6 +54,9 @@ CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
nodes[i] = new CGRANode*[t_columns];
for (int j=0; j<t_columns; ++j) {
nodes[i][j] = new CGRANode(node_id, j, i);
if (!enableMultipleOps) {
nodes[i][j]->disableMultipleOps();
}
// nodes[i][j]->disableAllFUs();
id2Node[node_id] = nodes[i][j];
node_id += 1;
Expand Down Expand Up @@ -118,6 +122,9 @@ CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
nodes[i] = new CGRANode*[t_columns];
for (int j=0; j<t_columns; ++j) {
nodes[i][j] = new CGRANode(node_id++, j, i);
if (!enableMultipleOps) {
nodes[i][j]->disableMultipleOps();
}
}
}

Expand Down Expand Up @@ -394,3 +401,6 @@ void CGRA::syncDVFSIsland(CGRANode* t_node) {
}
}

bool CGRA::getSupportInclusive() {
return m_supportInclusive;
}
4 changes: 3 additions & 1 deletion src/CGRA.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ class CGRA {
int m_rows;
int m_columns;
bool m_supportDVFS;
bool m_supportInclusive;
int m_DVFSIslandDim;
map<int, vector<CGRANode*>> m_DVFSIslands;
list<string>* m_supportComplex;
list<string>* m_supportCall;
void disableSpecificConnections();

public:
CGRA(int, int, bool, list<string>*, bool, map<string, list<int>*>*, bool, int);
CGRA(int, int, bool, list<string>*, bool, map<string, list<int>*>*, bool, int, bool=true);
CGRANode ***nodes;
CGRALink **links;
int getFUCount();
Expand All @@ -49,5 +50,6 @@ class CGRA {
void syncDVFSIsland(CGRANode*);
list<string>* getSupportComplex();
list<string>* getSupportCall();
bool getSupportInclusive();
};

110 changes: 71 additions & 39 deletions src/CGRANode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ CGRANode::CGRANode(int t_id, int t_x, int t_y) {
m_mapped = false;
m_DVFSLatencyMultiple = 1;
m_synced = false;

// Indicates whether this CGRA node can execute multiple operations
// simultaneously. (e.g., single-cycle overlaps with multi-cycle)
// i.e., inclusive execution
m_canMultipleOps = true;
}

// FIXME: should handle the case that the data is maintained in the registers
Expand Down Expand Up @@ -293,6 +298,10 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) {
if (not t_opt->isMultiCycleExec(getDVFSLatencyMultiple())) {
// Single-cycle opt:
for (int cycle=t_cycle%t_II; cycle<m_cycleBoundary; cycle+=t_II) {
// If this tile does not support inclusive execution (canMultipleOps() == false) and an operation already occupies it at the current cycle, t_opt cannot be mapped onto it.
if (!canMultipleOps() && !m_dfgNodesWithOccupyStatus[cycle]->empty()) {
return false;
}
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) {
if (p.second != IN_PIPE_OCCUPY) {
return false;
Expand All @@ -302,49 +311,63 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) {
} else {
// Multi-cycle opt.
for (int cycle=t_cycle%t_II; cycle<m_cycleBoundary; cycle+=t_II) {
// Check start cycle.
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) {
// Cannot occupy/overlap by/with other operation if DVFS is enabled.
if (isDVFSEnabled() and
(p.second == SINGLE_OCCUPY or
p.second == START_PIPE_OCCUPY or
p.second == IN_PIPE_OCCUPY or
p.second == END_PIPE_OCCUPY)) {
return false;
}
// Multi-cycle opt's start cycle overlaps with single-cycle opt' cycle.
else if (p.second == SINGLE_OCCUPY) {
return false;
}
// Multi-cycle opt's start cycle overlaps with multi-cycle opt's start cycle.
else if (p.second == START_PIPE_OCCUPY) {
return false;
}
// Multi-cycle opt's start cycle overlaps with multi-cycle opt with the same type:
else if ((p.second == IN_PIPE_OCCUPY or p.second == END_PIPE_OCCUPY) and
(t_opt->shareFU(p.first)) and
(not t_opt->isPipelinable() or not p.first->isPipelinable())) {
return false;
// Can not support simultaneous execution of multiple operations.
if (!canMultipleOps()) {
int exec_latency = t_opt->getExecLatency(getDVFSLatencyMultiple());
for (int duration=0; duration < exec_latency; duration++) {
if (cycle + duration >= m_cycleBoundary) {
break;
}
if (!m_dfgNodesWithOccupyStatus[cycle+duration]->empty()) {
return false;
}
}
}
if (cycle+t_opt->getExecLatency(getDVFSLatencyMultiple())-1 >= m_cycleBoundary) {
break;
}
// Check end cycle.
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle+t_opt->getExecLatency(getDVFSLatencyMultiple())-1])) {
// Multi-cycle opt's end cycle overlaps with single-cycle opt' cycle.
if (p.second == SINGLE_OCCUPY) {
return false;
else {
// Check start cycle.
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) {
// Cannot occupy/overlap by/with other operation if DVFS is enabled.
if (isDVFSEnabled() and
(p.second == SINGLE_OCCUPY or
p.second == START_PIPE_OCCUPY or
p.second == IN_PIPE_OCCUPY or
p.second == END_PIPE_OCCUPY)) {
return false;
}
// Multi-cycle opt's start cycle overlaps with single-cycle opt' cycle.
else if (p.second == SINGLE_OCCUPY) {
return false;
}
// Multi-cycle opt's start cycle overlaps with multi-cycle opt's start cycle.
else if (p.second == START_PIPE_OCCUPY) {
return false;
}
// Multi-cycle opt's start cycle overlaps with multi-cycle opt with the same type:
else if ((p.second == IN_PIPE_OCCUPY or p.second == END_PIPE_OCCUPY) and
(t_opt->shareFU(p.first)) and
(not t_opt->isPipelinable() or not p.first->isPipelinable())) {
return false;
}
}
// Multi-cycle opt's end cycle overlaps with multi-cycle opt's end cycle.
else if (p.second == END_PIPE_OCCUPY) {
return false;
if (cycle+t_opt->getExecLatency(getDVFSLatencyMultiple())-1 >= m_cycleBoundary) {
break;
}
// Multi-cycle opt's end cycle overlaps with multi-cycle opt with the same type:
else if ((p.second == IN_PIPE_OCCUPY or p.second == START_PIPE_OCCUPY) and
(t_opt->shareFU(p.first)) and
(not t_opt->isPipelinable() or not p.first->isPipelinable())) {
return false;
// Check end cycle.
for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle+t_opt->getExecLatency(getDVFSLatencyMultiple())-1])) {
// Multi-cycle opt's end cycle overlaps with single-cycle opt' cycle.
if (p.second == SINGLE_OCCUPY) {
return false;
}
// Multi-cycle opt's end cycle overlaps with multi-cycle opt's end cycle.
else if (p.second == END_PIPE_OCCUPY) {
return false;
}
// Multi-cycle opt's end cycle overlaps with multi-cycle opt with the same type:
else if ((p.second == IN_PIPE_OCCUPY or p.second == START_PIPE_OCCUPY) and
(t_opt->shareFU(p.first)) and
(not t_opt->isPipelinable() or not p.first->isPipelinable())) {
return false;
}
}
}
}
Expand Down Expand Up @@ -640,6 +663,11 @@ void CGRANode::enableDiv() {
m_canDiv = true;
}

void CGRANode::disableMultipleOps() {
printf("disabling multiple ops\n");
m_canMultipleOps = false;
}

bool CGRANode::supportComplex(string type) {
if (type == "") return m_supportComplex;
for (string t: m_supportComplexType) {
Expand Down Expand Up @@ -713,6 +741,10 @@ bool CGRANode::canDiv() {
return m_canDiv;
}

bool CGRANode::canMultipleOps() {
return m_canMultipleOps;
}

int CGRANode::getX() {
return m_x;
}
Expand Down
4 changes: 4 additions & 0 deletions src/CGRANode.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class CGRANode {
bool m_mapped;
bool m_synced;

bool m_canMultipleOps;

public:
CGRANode(int, int, int);
// CGRANode(int, int, int, int, int);
Expand Down Expand Up @@ -113,6 +115,7 @@ class CGRANode {
void enableLogic();
void enableBr();
void enableDiv();
void disableMultipleOps();

void attachInLink(CGRALink*);
void attachOutLink(CGRALink*);
Expand Down Expand Up @@ -153,6 +156,7 @@ class CGRANode {
bool canLogic();
bool canBr();
bool canDiv();
bool canMultipleOps();
DFGNode* getMappedDFGNode(int);
bool containMappedDFGNode(DFGNode*, int);
void allocateReg(CGRALink*, int, int, int);
Expand Down
62 changes: 61 additions & 1 deletion src/DFG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ DFG::DFG(Function& t_F, list<Loop*>* t_loops, bool t_targetFunction,
map<string, int>* t_execLatency, list<string>* t_pipelinedOpt,
map<string, list<string>*>* t_fusionPattern,
bool t_supportDVFS, bool t_DVFSAwareMapping,
int t_vectorFactorForIdiv) {
int t_vectorFactorForIdiv, bool enableDistributed) {
m_num = 0;
m_targetFunction = t_targetFunction;
m_targetLoops = t_loops;
Expand Down Expand Up @@ -54,6 +54,56 @@ DFG::DFG(Function& t_F, list<Loop*>* t_loops, bool t_targetFunction,
}
initExecLatency(t_execLatency);
initPipelinedOpt(t_pipelinedOpt);
if (enableDistributed) {
splitNodes();
}
calculateCycles();
}

// Split multi-cycle nodes in the DFG into multiple single-cycle nodes when distributed strategy is adopted.
// Example: Division takes 8 cycles on our hardware, so each division node in the DFG should be split into 8 sub-nodes, each of which only needs to perform one cycle of division execution.
void DFG::splitNodes() {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the new split node name? Can you demonstrate a simple example as the comment here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's for splitting multi-cycle nodes in the DFG into multiple single-cycle nodes when distributed strategy is adopted.
Example: Division takes 8 cycles on our hardware, so each division node in the DFG should be split into 8 sub-nodes, each of which only needs to perform one cycle of division execution.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is 8 set? Provided by user via the opLatency in .json right? So it is 4 in your tests?

list<DFGNode*>* add_nodes = new list<DFGNode*>();
int dfgNodeID = nodes.size();
for (DFGNode* dfgNode: nodes) {
int ExecLatency = dfgNode->getExecLatency(dfgNode->getDVFSLatencyMultiple());
if (ExecLatency == 1) continue;
dfgNode->setExecLatency(1);
int dfgNodeID = nodes.size();
DFGNode* nowNode = dfgNode;
DFGNode* stNode;
for (int i = 1; i < ExecLatency; i++) {
DFGNode* newNode = new DFGNode(dfgNodeID++, dfgNode);
int dfgEdgeID = m_DFGEdges.size();
DFGEdge* newEdge = new DFGEdge(dfgEdgeID++, nowNode, newNode);
newNode->setExecLatency(1);
m_DFGEdges.push_back(newEdge);
// nodes.push_back(newNode);
add_nodes->push_back(newNode);
// Update the pred and succ nodes of nods.
newNode->deleteAllPredNodes();
newNode->deleteAllSuccNodes();
nowNode->addSuccNode(newNode);
newNode->addPredNode(nowNode);
nowNode = newNode;
if (i == 1) stNode = nowNode;
}
// change the successors of dfgNode to nowNode;
for (DFGNode* succNode: *(dfgNode->getSuccNodes())) {
if (succNode == stNode) continue;
replaceDFGEdge(dfgNode, succNode, nowNode, succNode);
// dfgNode->deleteSuccNode(succNode);
nowNode->addSuccNode(succNode);
succNode->deletePredNode(dfgNode);
succNode->addPredNode(nowNode);
}
dfgNode->deleteAllSuccNodes();
dfgNode->addSuccNode(stNode);
}

for (DFGNode* dfgNode: *add_nodes) {
nodes.push_back(dfgNode);
}
}

// Pre-assigns the DVFS levels to each DFG node.
Expand Down Expand Up @@ -1707,6 +1757,16 @@ bool DFG::searchDFS(DFGNode* t_target, DFGNode* t_head,
return false;
}

// Used for initializing II for exclusive strategy.
int DFG::getMaxExecLatency() {
int max_exec_latency = 0;
for (DFGNode* dfgNode: nodes) {
int exec_latecy = dfgNode->getExecLatency(dfgNode->getDVFSLatencyMultiple());
if (exec_latecy > max_exec_latency) max_exec_latency = exec_latecy;
}
return max_exec_latency;
}

// TODO: This is necessary for inter-iteration data dependency
// checking (ld/st dependency analysis on base address).
void DFG::detectMemDataDependency() {
Expand Down
4 changes: 3 additions & 1 deletion src/DFG.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,17 +101,19 @@ class DFG {
void nonlinear_combine();
// target control flows
void ctrlFlow_combine(map<string, list<string>*>*);
void splitNodes();

public:
DFG(Function&, list<Loop*>*, bool, bool, list<string>*, map<string, int>*,
list<string>*, map<string, list<string>*>*, bool, bool, int t_vectorFactorForIdiv=4);
list<string>*, map<string, list<string>*>*, bool, bool, int t_vectorFactorForIdiv = 4, bool enableDistributed = false);
list<list<DFGNode*>*>* m_cycleNodeLists;
//initial ordering of insts
list<DFGNode*> nodes;

list<DFGNode*>* getBFSOrderedNodes();
list<DFGNode*>* getDFSOrderedNodes();
int getNodeCount();
int getMaxExecLatency();
void construct(Function&);
void setupCycles();
list<list<DFGEdge*>*>* calculateCycles();
Expand Down
Loading