Skip to content

Commit e179d88

Browse files
authored
Merge pull request #2781 from verilog-to-routing/chan_z_prefix_sum
Chan z prefix sum
2 parents 99b9c99 + 596ddaf commit e179d88

File tree

3 files changed

+123
-61
lines changed

3 files changed

+123
-61
lines changed

vpr/src/place/net_cost_handler.cpp

Lines changed: 97 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
112112
, placer_opts_(placer_opts) {
113113
const int num_layers = g_vpr_ctx.device().grid.get_num_layers();
114114

115+
is_multi_layer_ = num_layers > 1;
116+
115117
// Either 3D BB or per layer BB data structure are used, not both.
116118
if (cube_bb_) {
117119
ts_bb_edge_new_.resize(num_nets, t_bb());
@@ -145,10 +147,11 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
145147
* been recomputed. */
146148
bb_update_status_.resize(num_nets, NetUpdateState::NOT_UPDATED_YET);
147149

148-
alloc_and_load_chan_w_factors_for_place_cost_(placer_opts_.place_cost_exp);
150+
alloc_and_load_chan_w_factors_for_place_cost_();
149151
}
150152

151-
void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_cost_exp) {
153+
void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() {
154+
const double place_cost_exp = static_cast<double>(placer_opts_.place_cost_exp);
152155
auto& device_ctx = g_vpr_ctx.device();
153156

154157
const int grid_height = device_ctx.grid.height();
@@ -190,7 +193,7 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_c
190193
}
191194

192195
chanx_place_cost_fac_[high][low] = (high - low + 1.) / chanx_place_cost_fac_[high][low];
193-
chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], (double)place_cost_exp);
196+
chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], place_cost_exp);
194197
}
195198
}
196199

@@ -220,71 +223,87 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_c
220223
}
221224

222225
chany_place_cost_fac_[high][low] = (high - low + 1.) / chany_place_cost_fac_[high][low];
223-
chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], (double)place_cost_exp);
226+
chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], place_cost_exp);
224227
}
225228
}
226229

227-
if (device_ctx.grid.get_num_layers() > 1) {
228-
alloc_and_load_for_fast_vertical_cost_update_(place_cost_exp);
230+
if (is_multi_layer_) {
231+
alloc_and_load_for_fast_vertical_cost_update_();
229232
}
230233
}
231234

232-
void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_(float place_cost_exp) {
235+
void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
233236
const auto& device_ctx = g_vpr_ctx.device();
234237
const auto& rr_graph = device_ctx.rr_graph;
235238

236239
const size_t grid_height = device_ctx.grid.height();
237240
const size_t grid_width = device_ctx.grid.width();
238241

239242

240-
chanz_place_cost_fac_ = vtr::NdMatrix<float, 4>({grid_width, grid_height, grid_width, grid_height}, 0.);
243+
acc_tile_num_inter_die_conn_ = vtr::NdMatrix<int, 2>({grid_width, grid_height}, 0.);
244+
245+
vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);
241246

242-
vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);
247+
/*
248+
* Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
249+
* in the device. We count all these edges, regardless of which layers they connect. Then we divide by
250+
* the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
251+
* what we do for the horizontal and vertical channels where we assume the channel width doesn't change
252+
* along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
253+
* if someday we have architectures with widely varying connectivity between different layers in a stack.
254+
*/
243255

256+
/*
257+
* To calculate the accumulative number of inter-die connections we first need to get the number of
258+
* inter-die connection per location. To be able to work for the cases that RR Graph is read instead
259+
* of being made from the architecture file, we calculate this number by iterating over the RR graph. Once
260+
* tile_num_inter_die_conn is populated, we can start populating acc_tile_num_inter_die_conn_. First,
261+
* we populate the first row and column. Then, we iterate over the rest of blocks and get the number of
262+
 * inter-die connections by adding up the number of inter-die connections at that location + the accumulation
263+
* for the block below and left to it. Then, since the accumulated number of inter-die connection to
264+
* the block on the lower left connection of the block is added twice, that part needs to be removed.
265+
*/
244266
for (const auto& src_rr_node : rr_graph.nodes()) {
245-
for (const auto& rr_edge_idx : rr_graph.configurable_edges(src_rr_node)) {
267+
for (const auto& rr_edge_idx : rr_graph.edges(src_rr_node)) {
246268
const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
247269
if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
248270
// We assume that the nodes driving the inter-layer connection or being driven by it
249-
// are not streched across multiple tiles
271+
// are not stretched across multiple tiles
250272
int src_x = rr_graph.node_xhigh(src_rr_node);
251273
int src_y = rr_graph.node_yhigh(src_rr_node);
252274
VTR_ASSERT(rr_graph.node_xlow(src_rr_node) == src_x && rr_graph.node_ylow(src_rr_node) == src_y);
253275

254276
tile_num_inter_die_conn[src_x][src_y]++;
255277
}
256278
}
279+
}
257280

258-
for (const auto& rr_edge_idx : rr_graph.non_configurable_edges(src_rr_node)) {
259-
const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
260-
if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
261-
int src_x = rr_graph.node_xhigh(src_rr_node);
262-
VTR_ASSERT(rr_graph.node_xlow(src_rr_node) == src_x && rr_graph.node_xlow(src_rr_node) == src_x);
263-
int src_y = rr_graph.node_yhigh(src_rr_node);
264-
VTR_ASSERT(rr_graph.node_ylow(src_rr_node) == src_y && rr_graph.node_ylow(src_rr_node) == src_y);
265-
tile_num_inter_die_conn[src_x][src_y]++;
266-
}
281+
int num_layers = device_ctx.grid.get_num_layers();
282+
for (size_t x = 0; x < device_ctx.grid.width(); x++) {
283+
for (size_t y = 0; y < device_ctx.grid.height(); y++) {
284+
tile_num_inter_die_conn[x][y] /= (num_layers-1);
267285
}
268286
}
269287

270-
for (int x_high = 0; x_high < (int)device_ctx.grid.width(); x_high++) {
271-
for (int y_high = 0; y_high < (int)device_ctx.grid.height(); y_high++) {
272-
for (int x_low = 0; x_low <= x_high; x_low++) {
273-
for (int y_low = 0; y_low <= y_high; y_low++) {
274-
int num_inter_die_conn = 0;
275-
for (int x = x_low; x <= x_high; x++) {
276-
for (int y = y_low; y <= y_high; y++) {
277-
num_inter_die_conn += tile_num_inter_die_conn[x][y];
278-
}
279-
}
280-
int seen_num_tiles = (x_high - x_low + 1) * (y_high - y_low + 1);
281-
chanz_place_cost_fac_[x_high][y_high][x_low][y_low] = seen_num_tiles / static_cast<float>(num_inter_die_conn);
282-
283-
chanz_place_cost_fac_[x_high][y_high][x_low][y_low] = pow(
284-
(double)chanz_place_cost_fac_[x_high][y_high][x_low][y_low],
285-
(double)place_cost_exp);
286-
}
287-
}
288+
// Step 2: Calculate prefix sum of the inter-die connectivity up to and including the channel at (x, y).
289+
acc_tile_num_inter_die_conn_[0][0] = tile_num_inter_die_conn[0][0];
290+
// Initialize the first row and column
291+
for (size_t x = 1; x < device_ctx.grid.width(); x++) {
292+
acc_tile_num_inter_die_conn_[x][0] = acc_tile_num_inter_die_conn_[x-1][0] +
293+
tile_num_inter_die_conn[x][0];
294+
}
295+
296+
for (size_t y = 1; y < device_ctx.grid.height(); y++) {
297+
acc_tile_num_inter_die_conn_[0][y] = acc_tile_num_inter_die_conn_[0][y-1] +
298+
tile_num_inter_die_conn[0][y];
299+
}
300+
301+
for (size_t x_high = 1; x_high < device_ctx.grid.width(); x_high++) {
302+
for (size_t y_high = 1; y_high < device_ctx.grid.height(); y_high++) {
303+
acc_tile_num_inter_die_conn_[x_high][y_high] = acc_tile_num_inter_die_conn_[x_high-1][y_high] +
304+
acc_tile_num_inter_die_conn_[x_high][y_high-1] +
305+
tile_num_inter_die_conn[x_high][y_high] -
306+
acc_tile_num_inter_die_conn_[x_high-1][y_high-1];
288307
}
289308
}
290309
}
@@ -818,7 +837,7 @@ void NetCostHandler::update_bb_(ClusterNetId net_id,
818837
}
819838

820839
/* Now account for the layer motion. */
821-
if (num_layers > 1) {
840+
if (is_multi_layer_) {
822841
/* We need to update it only if multiple layers are available */
823842
for (int layer_num = 0; layer_num < num_layers; layer_num++) {
824843
num_sink_pin_layer_new[layer_num] = curr_num_sink_pin_layer[layer_num];
@@ -1402,8 +1421,6 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
14021421

14031422
const t_bb& bb = use_ts ? ts_bb_coord_new_[net_id] : placer_state_.move().bb_coords[net_id];
14041423

1405-
const bool is_multi_layer = (g_vpr_ctx.device().grid.get_num_layers() > 1);
1406-
14071424
double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());
14081425

14091426
/* Could insert a check for xmin == xmax. In that case, assume *
@@ -1420,12 +1437,14 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
14201437
*/
14211438

14221439
double ncost;
1423-
ncost = (bb.xmax - bb.xmin + 1) * crossing * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
1424-
ncost += (bb.ymax - bb.ymin + 1) * crossing * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
1425-
if (is_multi_layer) {
1426-
ncost += (bb.layer_max - bb.layer_min) * crossing * chanz_place_cost_fac_[bb.xmax][bb.ymax][bb.xmin][bb.ymin];
1440+
ncost = (bb.xmax - bb.xmin + 1) * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
1441+
ncost += (bb.ymax - bb.ymin + 1) * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
1442+
if (is_multi_layer_) {
1443+
ncost += (bb.layer_max - bb.layer_min) * get_chanz_cost_factor_(bb);
14271444
}
14281445

1446+
ncost *= crossing;
1447+
14291448
return ncost;
14301449
}
14311450

@@ -1526,6 +1545,39 @@ double NetCostHandler::get_net_wirelength_from_layer_bb_(ClusterNetId net_id) {
15261545
return ncost;
15271546
}
15281547

1548+
float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
1549+
float place_cost_exp = placer_opts_.place_cost_exp;
1550+
1551+
int num_inter_dir_conn;
1552+
1553+
if (bb.xmin == 0 && bb.ymin == 0) {
1554+
num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax];
1555+
} else if (bb.xmin == 0) {
1556+
num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
1557+
acc_tile_num_inter_die_conn_[bb.xmax][bb.ymin-1];
1558+
} else if (bb.ymin == 0) {
1559+
num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
1560+
acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymax];
1561+
} else {
1562+
num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
1563+
acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymax] -
1564+
acc_tile_num_inter_die_conn_[bb.xmax][bb.ymin-1] +
1565+
acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymin-1];
1566+
}
1567+
1568+
float z_cost_factor;
1569+
if (num_inter_dir_conn == 0) {
1570+
return 1.0f;
1571+
} else {
1572+
int bb_num_tiles = (bb.xmax - bb.xmin + 1) * (bb.ymax - bb.ymin + 1);
1573+
z_cost_factor = bb_num_tiles / static_cast<float>(num_inter_dir_conn);
1574+
z_cost_factor = pow((double)z_cost_factor, (double)place_cost_exp);
1575+
}
1576+
1577+
return z_cost_factor;
1578+
1579+
}
1580+
15291581
double NetCostHandler::recompute_bb_cost_() {
15301582
double cost = 0;
15311583

vpr/src/place/net_cost_handler.h

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ class NetCostHandler {
123123
private:
124124
///@brief Specifies whether the bounding box is computed using cube method or per-layer method.
125125
bool cube_bb_ = false;
126+
///@brief Determines whether the FPGA has multiple dies (layers)
127+
bool is_multi_layer_ = false;
126128
///@brief A reference to the placer's state to be updated by this object.
127129
PlacerState& placer_state_;
128130
///@brief Contains some parameter that determine how the placement cost is computed.
@@ -196,12 +198,14 @@ class NetCostHandler {
196198
vtr::NdOffsetMatrix<float, 2> chanx_place_cost_fac_; // [-1...device_ctx.grid.width()-1]
197199
vtr::NdOffsetMatrix<float, 2> chany_place_cost_fac_; // [-1...device_ctx.grid.height()-1]
198200
/**
199-
@brief This data structure functions similarly to the matrices described above
200-
but is applied to 3D connections linking different FPGA layers. It is used in the
201-
placement cost function calculation, where the height of the bounding box is divided
202-
by the average number of inter-die connections within the bounding box.
201+
* @brief The matrix below is used to calculate a chanz_place_cost_fac based on the average channel width in
202+
* the cross-die-layer direction over a 2D (x,y) region. We don't assume the inter-die connectivity is the same at all (x,y) locations, so we
203+
* can't compute the full chanz_place_cost_fac for all possible (xlow,ylow)(xhigh,yhigh) without a 4D array, which would
204+
* be too big: O(n^2) in circuit size. Instead we compute a prefix sum that stores the number of inter-die connections per layer from
205+
* (x=0,y=0) to (x,y). Given this, we can compute the average number of inter-die connections over a (xlow,ylow) to (xhigh,yhigh)
206+
* region in O(1) (by adding and subtracting 4 entries)
203207
*/
204-
vtr::NdMatrix<float, 4> chanz_place_cost_fac_; // [0...device_ctx.grid.width()-1][0...device_ctx.grid.height()-1][0...device_ctx.grid.width()-1][0...device_ctx.grid.height()-1]
208+
vtr::NdMatrix<int, 2> acc_tile_num_inter_die_conn_; // [0..grid_width-1][0..grid_height-1]
205209

206210

207211
private:
@@ -250,23 +254,17 @@ class NetCostHandler {
250254
* have to bother calling this routine; when using the cost function described above, however, you must always
251255
* call this routine before you do any placement cost determination. The place_cost_exp factor specifies to
252256
* what power the width of the channel should be taken -- larger numbers make narrower channels more expensive.
253-
*
254-
* @param place_cost_exp It is an exponent to which you take the average inverse channel capacity;
255-
* a higher value would favour wider channels more over narrower channels during placement (usually we use 1).
256257
*/
257-
void alloc_and_load_chan_w_factors_for_place_cost_(float place_cost_exp);
258+
void alloc_and_load_chan_w_factors_for_place_cost_();
258259

259260
/**
260-
* @brief Allocates and loads the chanz_place_cost_fac array with the inverse of
261-
* the average number of inter-die connections between [subhigh] and [sublow].
261+
* @brief Allocates and loads acc_tile_num_inter_die_conn_ which contains the accumulative number of inter-die
262+
 * connections.
262263
*
263264
* @details This is only useful for multi-die FPGAs. The place_cost_exp factor specifies to
264265
 * what power the average number of inter-die connections should be taken -- larger numbers make narrower channels more expensive.
265-
*
266-
* @param place_cost_exp It is an exponent to which you take the average number of inter-die connections;
267-
* a higher value would favour areas with more inter-die connections over areas with less of those during placement (usually we use 1).
268266
*/
269-
void alloc_and_load_for_fast_vertical_cost_update_(float place_cost_exp);
267+
void alloc_and_load_for_fast_vertical_cost_update_();
270268

271269
/**
272270
* @brief Calculate the new connection delay and timing cost of all the
@@ -511,4 +509,16 @@ class NetCostHandler {
511509
*/
512510
double get_net_wirelength_from_layer_bb_(ClusterNetId net_id);
513511

512+
/**
513+
* @brief Calculate the chanz cost factor based on the inverse of the average number of inter-die connections
514+
* in the given bounding box. This cost factor increases the placement cost for blocks that require inter-layer
515+
* connections in areas with, on average, fewer inter-die connections. If inter-die connections are evenly
516+
* distributed across tiles, the cost factor will be the same for all bounding boxes, but it will still
517+
* weight z-directed vs. x- and y-directed connections appropriately.
518+
*
519+
 * @param bounding_box Bounding box of the net whose chanz cost factor is to be calculated
520+
* @return ChanZ cost factor
521+
*/
522+
float get_chanz_cost_factor_(const t_bb& bounding_box);
523+
514524
};
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time
2-
k6_frac_N10_40nm.xml stereovision3.v common 1.44 vpr 57.96 MiB -1 -1 0.42 25620 5 0.11 -1 -1 36164 -1 -1 7 10 -1 -1 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 59352 10 2 181 183 1 37 19 5 5 25 clb auto 19.7 MiB 0.05 108 58.0 MiB 0.01 0.00 1.93928 -79.1821 -1.93928 1.93928 0.02 0.000104618 8.1277e-05 0.00610425 0.00496002 24 129 10 485046 377258 28445.8 1137.83 0.13 0.0425172 0.0362734 109 8 74 103 1476 611 2.06938 2.06938 -89.2305 -2.06938 0 0 37126.9 1485.07 0.02 0.01 0.00966903 0.00924379
2+
k6_frac_N10_40nm.xml stereovision3.v common 1.44 vpr 57.96 MiB -1 -1 0.42 25620 5 0.11 -1 -1 36164 -1 -1 7 10 -1 -1 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 59352 10 2 181 183 1 37 19 5 5 25 clb auto 19.7 MiB 0.05 108 58.0 MiB 0.01 0.00 1.93928 -79.1821 -1.93928 1.93928 0.02 0.000104618 8.1277e-05 0.00610425 0.00496002 26 129 10 485046 377258 34134.96 1365.396 0.13 0.0425172 0.0362734 109 8 74 103 1476 611 2.06938 2.06938 -89.2305 -2.06938 0 0 37126.9 1485.07 0.02 0.01 0.00966903 0.00924379

0 commit comments

Comments
 (0)