Skip to content

Commit 214fed5

Browse files
tt-ahoaliuTT
andauthored
#20712: Update external cable check to support N300/T3K and add report to health check (#22830)
### Ticket #20712 ### Problem description Want to report whether missing links are internal/external for health check. ### What's changed Add support for checking external cables on generic N300 devices and update health check script to report which links are internal/external. ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI with demo tests passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] (For models and ops writers) [Single-card demo tests](https://github.com/tenstorrent/tt-metal/actions/workflows/single-card-demo-tests.yaml) CI passes (if applicable) See [recommended dev flow](https://github.com/tenstorrent/tt-metal/blob/main/models/MODEL_ADD.md#a-recommended-dev-flow-on-github-for-adding-new-models). - [x] New/Existing tests provide coverage for changes Co-authored-by: Allan Liu <aliu@tenstorrent.com>
1 parent 16549d9 commit 214fed5

File tree

4 files changed

+63
-46
lines changed

4 files changed

+63
-46
lines changed

tests/tt_metal/tt_fabric/system_health/test_system_health.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ std::pair<std::uint32_t, std::uint32_t> get_ubb_ids(chip_id_t chip_id) {
2828
const auto& tray_bus_ids = ubb_bus_ids.at(cluster.arch());
2929
auto tray_bus_id_it = std::find(tray_bus_ids.begin(), tray_bus_ids.end(), cluster.get_bus_id(chip_id) & 0xF0);
3030
if (tray_bus_id_it != tray_bus_ids.end()) {
31-
auto unique_chip_id = cluster.get_unique_chip_ids().at(chip_id);
32-
auto ubb_asic_id = ((unique_chip_id >> 56) & 0xFF);
31+
auto ubb_asic_id = cluster.get_ubb_asic_id(chip_id);
3332
return std::make_pair(tray_bus_id_it - tray_bus_ids.begin() + 1, ubb_asic_id);
3433
}
3534
return std::make_pair(0, 0);
@@ -44,7 +43,6 @@ TEST(Cluster, ReportSystemHealth) {
4443

4544
auto unique_chip_ids = tt::tt_metal::MetalContext::instance().get_cluster().get_unique_chip_ids();
4645
std::stringstream ss;
47-
ss << "Found " << unique_chip_ids.size() << " chips in cluster:" << std::endl;
4846
std::vector<std::uint32_t> read_vec;
4947
auto retrain_count_addr = tt::tt_metal::MetalContext::instance().hal().get_dev_addr(
5048
tt::tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt::tt_metal::HalL1MemAddrType::RETRAIN_COUNT);
@@ -54,6 +52,7 @@ TEST(Cluster, ReportSystemHealth) {
5452
unique_chip_ids[chip_id] = chip_id;
5553
}
5654
}
55+
ss << "Found " << unique_chip_ids.size() << " chips in cluster:" << std::endl;
5756

5857
auto cluster_type = cluster.get_cluster_type();
5958

@@ -73,27 +72,29 @@ TEST(Cluster, ReportSystemHealth) {
7372
std::stringstream eth_ss;
7473
cluster.read_core(read_vec, sizeof(uint32_t), virtual_eth_core, retrain_count_addr);
7574
eth_ss << " eth channel " << std::dec << (uint32_t)chan << " " << eth_core.str();
75+
std::string connection_type =
76+
cluster.is_external_cable(chip_id, eth_core) ? "(external connector)" : "(internal trace)";
7677
if (cluster.is_ethernet_link_up(chip_id, eth_core)) {
7778
if (eth_connections.at(chip_id).find(chan) != eth_connections.at(chip_id).end()) {
7879
const auto& [connected_chip_id, connected_eth_core] =
7980
cluster.get_connected_ethernet_core(std::make_tuple(chip_id, eth_core));
8081
std::cout << "Connected chip: " << connected_chip_id
8182
<< " connected eth core: " << connected_eth_core.str() << std::endl;
82-
eth_ss << " link UP, retrain: " << read_vec[0] << ", connected to chip " << connected_chip_id << " "
83-
<< connected_eth_core.str();
83+
eth_ss << " link UP " << connection_type << ", retrain: " << read_vec[0] << ", connected to chip "
84+
<< connected_chip_id << " " << connected_eth_core.str();
8485
} else {
8586
const auto& [connected_chip_unique_id, connected_eth_core] =
8687
cluster.get_connected_ethernet_core_to_remote_mmio_device(std::make_tuple(chip_id, eth_core));
8788
std::cout << "Connected unique chip: " << connected_chip_unique_id
8889
<< " connected eth core: " << connected_eth_core.str() << std::endl;
89-
eth_ss << " link UP, retrain: " << read_vec[0] << ", connected to chip " << connected_chip_unique_id
90-
<< " " << connected_eth_core.str();
90+
eth_ss << " link UP " << connection_type << ", retrain: " << read_vec[0] << ", connected to chip "
91+
<< connected_chip_unique_id << " " << connected_eth_core.str();
9192
}
9293
if (read_vec[0] > 0) {
9394
unexpected_system_states.push_back(chip_id_ss.str() + eth_ss.str());
9495
}
9596
} else {
96-
eth_ss << " link DOWN";
97+
eth_ss << " link DOWN/unconnected " << connection_type;
9798
unexpected_system_states.push_back(chip_id_ss.str() + eth_ss.str());
9899
}
99100
ss << eth_ss.str() << std::endl;
@@ -176,9 +177,13 @@ TEST(Cluster, TestMeshFullConnectivity) {
176177
auto [tray_id, ubb_asic_id] = get_ubb_ids(chip);
177178
chip_ss << " Tray: " << tray_id << " N" << ubb_asic_id;
178179
}
180+
const auto& soc_desc = cluster.get_soc_desc(chip);
179181
std::map<chip_id_t, int> num_connections_to_chip;
180182
for (const auto& [channel, remote_chip_and_channel] : connections) {
181-
num_connections_to_chip[std::get<0>(remote_chip_and_channel)]++;
183+
tt::umd::CoreCoord logical_active_eth = soc_desc.get_eth_core_for_channel(channel, CoordSystem::LOGICAL);
184+
if (cluster.is_ethernet_link_up(chip, logical_active_eth)) {
185+
num_connections_to_chip[std::get<0>(remote_chip_and_channel)]++;
186+
}
182187
}
183188
if (target_system_topology.has_value()) {
184189
if (*target_system_topology == FabricType::TORUS_2D) {

tt_metal/fabric/control_plane.cpp

Lines changed: 10 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@
2727
#include "core_coord.hpp"
2828
#include "fabric_host_interface.h"
2929
#include "hal_types.hpp"
30+
#include "impl/context/metal_context.hpp"
3031
#include "logger.hpp"
3132
#include "mesh_coord.hpp"
3233
#include "mesh_graph.hpp"
3334
#include "metal_soc_descriptor.h"
3435
#include "routing_table_generator.hpp"
35-
#include "impl/context/metal_context.hpp"
3636
#include <umd/device/tt_core_coordinates.h>
3737
#include <umd/device/tt_xy_pair.h>
3838
#include <umd/device/types/cluster_descriptor_types.h>
@@ -46,40 +46,14 @@ std::unordered_map<chip_id_t, std::vector<CoreCoord>> get_ethernet_cores_grouped
4646
return tt::tt_metal::MetalContext::instance().get_cluster().get_ethernet_cores_grouped_by_connected_chips(chip_id);
4747
}
4848

49-
// Get the physical chip ids for a mesh
50-
std::uint32_t get_ubb_asic_id(chip_id_t physical_chip_id) {
51-
auto unique_chip_id =
52-
tt::tt_metal::MetalContext::instance().get_cluster().get_unique_chip_ids().at(physical_chip_id);
53-
return ((unique_chip_id >> 56) & 0xFF);
54-
}
55-
56-
bool is_external_ubb_cable(chip_id_t physical_chip_id, CoreCoord eth_core) {
57-
auto chan_id = tt::tt_metal::MetalContext::instance()
58-
.get_cluster()
59-
.get_soc_desc(physical_chip_id)
60-
.logical_eth_core_to_chan_map.at(eth_core);
61-
auto ubb_asic_id = get_ubb_asic_id(physical_chip_id);
62-
bool is_external_cable = false;
63-
if (ubb_asic_id == 1) {
64-
// UBB 1 has external cables on channesl 0-7
65-
is_external_cable = (chan_id >= 0 and chan_id <= 7);
66-
} else if (ubb_asic_id >= 2 and ubb_asic_id <= 4) {
67-
// UBB 2 to 4 has external cables on channesl 0-3
68-
is_external_cable = (chan_id >= 0 and chan_id <= 3);
69-
} else if (ubb_asic_id == 5) {
70-
// UBB 5 has external cables on channesl 4-7
71-
is_external_cable = (chan_id >= 4 and chan_id <= 7);
72-
}
73-
return is_external_cable;
74-
}
75-
7649
bool is_chip_on_edge_of_mesh(
7750
chip_id_t physical_chip_id,
7851
int num_ports_per_side,
7952
const std::unordered_map<chip_id_t, std::vector<CoreCoord>>& ethernet_cores_grouped_by_connected_chips) {
53+
const auto& cluster = tt::tt_metal::MetalContext::instance().get_cluster();
8054
// Chip is on edge if it does not have full connections to four sides
81-
if (tt::tt_metal::MetalContext::instance().get_cluster().get_board_type(physical_chip_id) == BoardType::UBB) {
82-
auto ubb_asic_id = get_ubb_asic_id(physical_chip_id);
55+
if (cluster.get_board_type(physical_chip_id) == BoardType::UBB) {
56+
auto ubb_asic_id = cluster.get_ubb_asic_id(physical_chip_id);
8357
return (ubb_asic_id >= 2) and (ubb_asic_id <= 5);
8458
} else {
8559
int i = 0;
@@ -96,8 +70,9 @@ bool is_chip_on_corner_of_mesh(
9670
chip_id_t physical_chip_id,
9771
int num_ports_per_side,
9872
const std::unordered_map<chip_id_t, std::vector<CoreCoord>>& ethernet_cores_grouped_by_connected_chips) {
99-
if (tt::tt_metal::MetalContext::instance().get_cluster().get_board_type(physical_chip_id) == BoardType::UBB) {
100-
auto ubb_asic_id = get_ubb_asic_id(physical_chip_id);
73+
const auto& cluster = tt::tt_metal::MetalContext::instance().get_cluster();
74+
if (cluster.get_board_type(physical_chip_id) == BoardType::UBB) {
75+
auto ubb_asic_id = cluster.get_ubb_asic_id(physical_chip_id);
10176
return (ubb_asic_id == 1);
10277
} else {
10378
// Chip is a corner if it has exactly 2 fully connected sides
@@ -250,10 +225,8 @@ std::vector<chip_id_t> ControlPlane::get_mesh_physical_chip_ids(
250225
bool is_ubb = cluster.get_board_type(current_chip_id) == BoardType::UBB;
251226
for (const auto& [connected_chip_id, eth_ports] : eth_links) {
252227
// Do not include any corner to corner links on UBB
253-
if (is_ubb) {
254-
if (is_external_ubb_cable(current_chip_id, eth_ports[0])) {
255-
continue;
256-
}
228+
if (is_ubb && cluster.is_external_cable(current_chip_id, eth_ports[0])) {
229+
continue;
257230
}
258231
if (eth_ports.size() >= num_ports_per_side) {
259232
if (visited_physical_chips.find(connected_chip_id) == visited_physical_chips.end()) {
@@ -321,7 +294,7 @@ std::vector<chip_id_t> ControlPlane::get_mesh_physical_chip_ids(
321294
bool found_chip = false;
322295
bool is_ubb = cluster.get_board_type(physical_chip_id_from_north) == BoardType::UBB;
323296
for (const auto& [connected_chip_id, eth_ports] : eth_links_grouped_by_connected_chips) {
324-
if (is_ubb and is_external_ubb_cable(physical_chip_id_from_north, eth_ports[0])) {
297+
if (is_ubb && cluster.is_external_cable(physical_chip_id_from_north, eth_ports[0])) {
325298
continue;
326299
}
327300
if (visited_physical_chips.find(connected_chip_id) == visited_physical_chips.end() and

tt_metal/llrt/tt_cluster.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1422,6 +1422,39 @@ void Cluster::initialize_control_plane() {
14221422
global_control_plane_ = std::make_unique<tt::tt_fabric::GlobalControlPlane>(mesh_graph_desc_path.string());
14231423
}
14241424

1425+
std::uint32_t Cluster::get_ubb_asic_id(chip_id_t physical_chip_id) const {
1426+
auto unique_chip_id = this->get_unique_chip_ids().at(physical_chip_id);
1427+
return ((unique_chip_id >> 56) & 0xFF);
1428+
}
1429+
1430+
bool Cluster::is_external_cable(chip_id_t physical_chip_id, CoreCoord eth_core) const {
1431+
auto chan_id = this->get_soc_desc(physical_chip_id).logical_eth_core_to_chan_map.at(eth_core);
1432+
bool is_external_cable = false;
1433+
auto board_type = this->get_board_type(physical_chip_id);
1434+
if (board_type == BoardType::UBB) {
1435+
auto ubb_asic_id = get_ubb_asic_id(physical_chip_id);
1436+
if (ubb_asic_id == 1) {
1437+
// UBB 1 has external cables on channels 0-7
1438+
is_external_cable = (chan_id >= 0 and chan_id <= 7);
1439+
} else if (ubb_asic_id >= 2 and ubb_asic_id <= 4) {
1440+
// UBB 2 to 4 has external cables on channels 0-3
1441+
is_external_cable = (chan_id >= 0 and chan_id <= 3);
1442+
} else if (ubb_asic_id == 5) {
1443+
// UBB 5 has external cables on channels 4-7
1444+
is_external_cable = (chan_id >= 4 and chan_id <= 7);
1445+
}
1446+
} else if (board_type == BoardType::N300) {
1447+
// N300 has external cables on channels 8-9 on MMIO chips and channels 0-1 on non-MMIO chips
1448+
auto mmio_device_id = this->get_associated_mmio_device(physical_chip_id);
1449+
if (mmio_device_id == physical_chip_id) {
1450+
is_external_cable = (chan_id != 8 and chan_id != 9);
1451+
} else {
1452+
is_external_cable = (chan_id != 0 and chan_id != 1);
1453+
}
1454+
}
1455+
return is_external_cable;
1456+
}
1457+
14251458
} // namespace tt
14261459

14271460
std::ostream &operator<<(std::ostream &os, tt_target_dram const &dram) {

tt_metal/llrt/tt_cluster.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,12 @@ class Cluster {
340340

341341
const std::unordered_map<CoreCoord, int32_t>& get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const;
342342

343+
std::uint32_t get_ubb_asic_id(chip_id_t physical_chip_id) const;
344+
345+
// TODO: move to separate system descriptor class
346+
// return enum for connection type, Internal, QSFP, Other, Unknown
347+
bool is_external_cable(chip_id_t physical_chip_id, CoreCoord eth_core) const;
348+
343349
private:
344350
void detect_arch_and_target();
345351
void generate_cluster_descriptor();

0 commit comments

Comments
 (0)