tancheng · HobbitQia · Feb 5, 2025 · Feb 5, 2025 · Apr 24, 2025 · Apr 25, 2025
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
@@ -92,3 +92,10 @@ jobs:
         sh verify.sh
 
 
+    - name: Test multi-cycle mapping
+      working-directory: ${{github.workspace}}/test/multicycle
+      run: |
+        sh compile.sh
+        sh run.sh
+        sh verify.sh
+
diff --git a/src/CGRA.cpp b/src/CGRA.cpp
@@ -17,12 +17,13 @@ using json = nlohmann::json;
 CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
 	   list<string>* t_fusionStrategy, bool t_parameterizableCGRA,
 	   map<string, list<int>*>* t_additionalFunc,
-	   bool t_supportDVFS, int t_DVFSIslandDim) {
+	   bool t_supportDVFS, int t_DVFSIslandDim, bool enableMultipleOps) {
   m_rows = t_rows;
   m_columns = t_columns;
   m_FUCount = t_rows * t_columns;
   m_supportDVFS = t_supportDVFS;
   m_DVFSIslandDim = t_DVFSIslandDim;
+  m_supportInclusive = enableMultipleOps;
   m_supportComplex = new list<string>();
   m_supportCall = new list<string>();
   nodes = new CGRANode**[t_rows];
@@ -53,6 +54,9 @@ CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
       nodes[i] = new CGRANode*[t_columns];
       for (int j=0; j<t_columns; ++j) {
         nodes[i][j] = new CGRANode(node_id, j, i);
+        if (!enableMultipleOps) {
+          nodes[i][j]->disableMultipleOps();
+        }
 	// nodes[i][j]->disableAllFUs();
 	id2Node[node_id] = nodes[i][j];
 	node_id += 1;
@@ -118,6 +122,9 @@ CGRA::CGRA(int t_rows, int t_columns, bool t_diagonalVectorization,
       nodes[i] = new CGRANode*[t_columns];
       for (int j=0; j<t_columns; ++j) {
         nodes[i][j] = new CGRANode(node_id++, j, i);
+        if (!enableMultipleOps) {
+          nodes[i][j]->disableMultipleOps();
+        }
       }
     }
 
@@ -394,3 +401,6 @@ void CGRA::syncDVFSIsland(CGRANode* t_node) {
   }
 }
 
+// Returns m_supportInclusive, which the constructor sets from its
+// enableMultipleOps argument.
+bool CGRA::getSupportInclusive() {
+  return m_supportInclusive;
+}
diff --git a/src/CGRA.h b/src/CGRA.h
@@ -23,14 +23,15 @@ class CGRA {
     int m_rows;
     int m_columns;
     bool m_supportDVFS;
+    bool m_supportInclusive;
     int m_DVFSIslandDim;
     map<int, vector<CGRANode*>> m_DVFSIslands;
     list<string>* m_supportComplex;
     list<string>* m_supportCall;
     void disableSpecificConnections();
 
   public:
-    CGRA(int, int, bool, list<string>*, bool, map<string, list<int>*>*, bool, int);
+    CGRA(int, int, bool, list<string>*, bool, map<string, list<int>*>*, bool, int, bool=true);
     CGRANode ***nodes;
     CGRALink **links;
     int getFUCount();
@@ -49,5 +50,6 @@ class CGRA {
     void syncDVFSIsland(CGRANode*);
     list<string>* getSupportComplex();
     list<string>* getSupportCall();
+    bool getSupportInclusive();
 };
 
diff --git a/src/CGRANode.cpp b/src/CGRANode.cpp
@@ -64,6 +64,11 @@ CGRANode::CGRANode(int t_id, int t_x, int t_y) {
   m_mapped = false;
   m_DVFSLatencyMultiple = 1;
   m_synced = false;
+
+  // Indicates whether this CGRA node can execute multiple operations
+  // simultaneously. (e.g.,  single-cycle overlaps with multi-cycle)
+  // i.e., inclusive execution
+  m_canMultipleOps = true;
 }
 
 // FIXME: should handle the case that the data is maintained in the registers
@@ -293,6 +298,10 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) {
   if (not t_opt->isMultiCycleExec(getDVFSLatencyMultiple())) {
     // Single-cycle opt:
     for (int cycle=t_cycle%t_II; cycle<m_cycleBoundary; cycle+=t_II) {
+      // If this tile doesn't support inclusive execution (canMultipleOps() == false) and another operation already occupies it at this cycle, t_opt cannot be mapped onto it.
+      if (!canMultipleOps() && !m_dfgNodesWithOccupyStatus[cycle]->empty()) {
+        return false;
+      }
       for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) {
         if (p.second != IN_PIPE_OCCUPY) {
           return false;
@@ -302,49 +311,63 @@ bool CGRANode::canOccupy(DFGNode* t_opt, int t_cycle, int t_II) {
   } else {
     // Multi-cycle opt.
     for (int cycle=t_cycle%t_II; cycle<m_cycleBoundary; cycle+=t_II) {
-      // Check start cycle.
-      for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) {
-	// Cannot occupy/overlap by/with other operation if DVFS is enabled.
-	if (isDVFSEnabled() and
-	    (p.second == SINGLE_OCCUPY or
-	     p.second == START_PIPE_OCCUPY or
-	     p.second == IN_PIPE_OCCUPY or
-	     p.second == END_PIPE_OCCUPY)) {
-	  return false;
-	}
-        // Multi-cycle opt's start cycle overlaps with single-cycle opt' cycle.
-	else if (p.second == SINGLE_OCCUPY) {
-          return false;
-        }
-        // Multi-cycle opt's start cycle overlaps with multi-cycle opt's start cycle.
-        else if (p.second == START_PIPE_OCCUPY) {
-          return false;
-        }
-        // Multi-cycle opt's start cycle overlaps with multi-cycle opt with the same type:
-        else if ((p.second == IN_PIPE_OCCUPY or p.second == END_PIPE_OCCUPY) and
-                 (t_opt->shareFU(p.first))   and
-                 (not t_opt->isPipelinable() or not p.first->isPipelinable())) {
-          return false;
+      // Can not support simultaneous execution of multiple operations.
+      if (!canMultipleOps()) {
+        int exec_latency = t_opt->getExecLatency(getDVFSLatencyMultiple());
+        for (int duration=0; duration < exec_latency; duration++) {
+          if (cycle + duration >= m_cycleBoundary) {
+            break;
+          }
+          if (!m_dfgNodesWithOccupyStatus[cycle+duration]->empty()) {
+            return false;
+          }
         }
       }
-      if (cycle+t_opt->getExecLatency(getDVFSLatencyMultiple())-1 >= m_cycleBoundary) {
-        break;
-      }
-      // Check end cycle.
-      for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle+t_opt->getExecLatency(getDVFSLatencyMultiple())-1])) {
-        // Multi-cycle opt's end cycle overlaps with single-cycle opt' cycle.
-        if (p.second == SINGLE_OCCUPY) {
-          return false;
+      else {
+        // Check start cycle.
+        for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle])) {
+          // Cannot occupy/overlap by/with other operation if DVFS is enabled.
+          if (isDVFSEnabled() and
+              (p.second == SINGLE_OCCUPY or
+              p.second == START_PIPE_OCCUPY or
+              p.second == IN_PIPE_OCCUPY or
+              p.second == END_PIPE_OCCUPY)) {
+            return false;
+          }
+          // Multi-cycle opt's start cycle overlaps with single-cycle opt' cycle.
+          else if (p.second == SINGLE_OCCUPY) {
+            return false;
+          }
+          // Multi-cycle opt's start cycle overlaps with multi-cycle opt's start cycle.
+          else if (p.second == START_PIPE_OCCUPY) {
+            return false;
+          }
+          // Multi-cycle opt's start cycle overlaps with multi-cycle opt with the same type:
+          else if ((p.second == IN_PIPE_OCCUPY or p.second == END_PIPE_OCCUPY) and
+                  (t_opt->shareFU(p.first))   and
+                  (not t_opt->isPipelinable() or not p.first->isPipelinable())) {
+            return false;
+          }
         }
-        // Multi-cycle opt's end cycle overlaps with multi-cycle opt's end cycle.
-        else if (p.second == END_PIPE_OCCUPY) {
-          return false;
+        if (cycle+t_opt->getExecLatency(getDVFSLatencyMultiple())-1 >= m_cycleBoundary) {
+          break;
         }
-        // Multi-cycle opt's end cycle overlaps with multi-cycle opt with the same type:
-        else if ((p.second == IN_PIPE_OCCUPY or p.second == START_PIPE_OCCUPY) and
-                 (t_opt->shareFU(p.first))   and
-                 (not t_opt->isPipelinable() or not p.first->isPipelinable())) {
-          return false;
+        // Check end cycle.
+        for (pair<DFGNode*, int> p: *(m_dfgNodesWithOccupyStatus[cycle+t_opt->getExecLatency(getDVFSLatencyMultiple())-1])) {
+          // Multi-cycle opt's end cycle overlaps with single-cycle opt' cycle.
+          if (p.second == SINGLE_OCCUPY) {
+            return false;
+          }
+          // Multi-cycle opt's end cycle overlaps with multi-cycle opt's end cycle.
+          else if (p.second == END_PIPE_OCCUPY) {
+            return false;
+          }
+          // Multi-cycle opt's end cycle overlaps with multi-cycle opt with the same type:
+          else if ((p.second == IN_PIPE_OCCUPY or p.second == START_PIPE_OCCUPY) and
+                  (t_opt->shareFU(p.first))   and
+                  (not t_opt->isPipelinable() or not p.first->isPipelinable())) {
+            return false;
+          }
         }
       }
     }
@@ -640,6 +663,11 @@ void CGRANode::enableDiv() {
   m_canDiv = true;
 }
 
+// Clears the flag reported by canMultipleOps() for this CGRA node and
+// logs the change to stdout.
+void CGRANode::disableMultipleOps() {
+  m_canMultipleOps = false;
+  printf("disabling multiple ops\n");
+}
+
 bool CGRANode::supportComplex(string type) {
   if (type == "") return m_supportComplex;
   for (string t: m_supportComplexType) {
@@ -713,6 +741,10 @@ bool CGRANode::canDiv() {
   return m_canDiv;
 }
 
+// Returns whether this CGRA node can execute multiple operations
+// simultaneously (inclusive execution). The flag is initialized to true by
+// the constructor and cleared by disableMultipleOps().
+bool CGRANode::canMultipleOps() {
+  return m_canMultipleOps;
+}
+
 int CGRANode::getX() {
   return m_x;
 }

diff --git a/src/CGRANode.h b/src/CGRANode.h
@@ -82,6 +82,8 @@ class CGRANode {
     bool m_mapped;
     bool m_synced;
 
+    bool m_canMultipleOps;
+
   public:
     CGRANode(int, int, int);
 //    CGRANode(int, int, int, int, int);
@@ -113,6 +115,7 @@ class CGRANode {
     void enableLogic();
     void enableBr();
     void enableDiv();
+    void disableMultipleOps();
 
     void attachInLink(CGRALink*);
     void attachOutLink(CGRALink*);
@@ -153,6 +156,7 @@ class CGRANode {
     bool canLogic();
     bool canBr();
     bool canDiv();
+    bool canMultipleOps();
     DFGNode* getMappedDFGNode(int);
     bool containMappedDFGNode(DFGNode*, int);
     void allocateReg(CGRALink*, int, int, int);

diff --git a/src/DFG.cpp b/src/DFG.cpp
@@ -17,7 +17,7 @@ DFG::DFG(Function& t_F, list<Loop*>* t_loops, bool t_targetFunction,
          map<string, int>* t_execLatency, list<string>* t_pipelinedOpt,
          map<string, list<string>*>* t_fusionPattern,
 	      bool t_supportDVFS, bool t_DVFSAwareMapping,
-	      int t_vectorFactorForIdiv) {
+	      int t_vectorFactorForIdiv, bool enableDistributed) {
   m_num = 0;
   m_targetFunction = t_targetFunction;
   m_targetLoops = t_loops;
@@ -54,6 +54,56 @@ DFG::DFG(Function& t_F, list<Loop*>* t_loops, bool t_targetFunction,
   }
   initExecLatency(t_execLatency);
   initPipelinedOpt(t_pipelinedOpt);
+  if (enableDistributed) {
+    splitNodes();
+  }
+  calculateCycles();
+}
+
+// Splits every multi-cycle node in the DFG into a chain of single-cycle nodes
+// (used when the distributed strategy is adopted).
+// Example: division takes 8 cycles on our hardware, so each division node is
+// turned into a chain of 8 nodes, each performing one cycle of the division.
+// The original node stays at the head of the chain and its former successors
+// are re-attached to the tail. New nodes are appended to `nodes` only after
+// the traversal, so the traversal itself visits the original nodes only.
+void DFG::splitNodes() {
+  // Nodes created by the split; appended to `nodes` once the traversal is done.
+  list<DFGNode*> addedNodes;
+  // One running counter for the whole pass, so every new node gets a distinct ID.
+  int dfgNodeID = nodes.size();
+  for (DFGNode* dfgNode: nodes) {
+    const int execLatency = dfgNode->getExecLatency(dfgNode->getDVFSLatencyMultiple());
+    // Only multi-cycle nodes are split; this also guards against a
+    // non-positive latency, for which no sub-node would be created.
+    if (execLatency <= 1) continue;
+    dfgNode->setExecLatency(1);
+    DFGNode* nowNode = dfgNode;  // Current tail of the chain.
+    DFGNode* stNode = nullptr;   // First sub-node; becomes dfgNode's only successor.
+    for (int i = 1; i < execLatency; i++) {
+      DFGNode* newNode = new DFGNode(dfgNodeID++, dfgNode);
+      const int dfgEdgeID = m_DFGEdges.size();
+      DFGEdge* newEdge = new DFGEdge(dfgEdgeID, nowNode, newNode);
+      newNode->setExecLatency(1);
+      m_DFGEdges.push_back(newEdge);
+      addedNodes.push_back(newNode);
+      // Reset the links of newNode (it was constructed from dfgNode) and
+      // chain it behind nowNode.
+      newNode->deleteAllPredNodes();
+      newNode->deleteAllSuccNodes();
+      nowNode->addSuccNode(newNode);
+      newNode->addPredNode(nowNode);
+      nowNode = newNode;
+      if (i == 1) stNode = nowNode;
+    }
+    // Move the original successors of dfgNode to the tail of the chain (nowNode).
+    for (DFGNode* succNode: *(dfgNode->getSuccNodes())) {
+      if (succNode == stNode) continue;
+      replaceDFGEdge(dfgNode, succNode, nowNode, succNode);
+      nowNode->addSuccNode(succNode);
+      succNode->deletePredNode(dfgNode);
+      succNode->addPredNode(nowNode);
+    }
+    dfgNode->deleteAllSuccNodes();
+    dfgNode->addSuccNode(stNode);
+  }
+
+  for (DFGNode* addedNode: addedNodes) {
+    nodes.push_back(addedNode);
+  }
+}
 
 // Pre-assigns the DVFS levels to each DFG node.
@@ -1707,6 +1757,16 @@ bool DFG::searchDFS(DFGNode* t_target, DFGNode* t_head,
   return false;
 }
 
+// Returns the largest execution latency among all DFG nodes, or 0 when the
+// DFG has no nodes. Used for initializing II for exclusive strategy.
+int DFG::getMaxExecLatency() {
+  int longest = 0;
+  for (DFGNode* node: nodes) {
+    const int latency = node->getExecLatency(node->getDVFSLatencyMultiple());
+    longest = (latency > longest) ? latency : longest;
+  }
+  return longest;
+}
+
 // TODO: This is necessary for inter-iteration data dependency
 //       checking (ld/st dependency analysis on base address).
 void DFG::detectMemDataDependency() {

diff --git a/src/DFG.h b/src/DFG.h
@@ -101,17 +101,19 @@ class DFG {
     void nonlinear_combine();
     // target control flows
     void ctrlFlow_combine(map<string, list<string>*>*);
+    void splitNodes();
 
   public:
     DFG(Function&, list<Loop*>*, bool, bool, list<string>*, map<string, int>*,
-        list<string>*, map<string, list<string>*>*, bool, bool, int t_vectorFactorForIdiv=4);
+        list<string>*, map<string, list<string>*>*, bool, bool, int t_vectorFactorForIdiv = 4, bool enableDistributed = false);
     list<list<DFGNode*>*>* m_cycleNodeLists;
     //initial ordering of insts
     list<DFGNode*> nodes;
 
     list<DFGNode*>* getBFSOrderedNodes();
     list<DFGNode*>* getDFSOrderedNodes();
     int getNodeCount();
+    int getMaxExecLatency();
     void construct(Function&);
     void setupCycles();
     list<list<DFGEdge*>*>* calculateCycles();