From e711b62ab72e538b4ebc45df66c684aad5f48dbf Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 00:12:09 +0000
Subject: [PATCH 001/132] Initial plan


From c881bc5e16c8dd8ef9488a462c3a783c5db185f1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 00:17:18 +0000
Subject: [PATCH 002/132] Replace gtest/gtest.h with framework.hpp in all unit
 tests

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/framework.cc                 | 253 ++++++++++++++++++++++
 test/framework.hpp                | 336 ++++++++++++++++++++++++++++++
 test/perf/framework.cc            | 155 ++------------
 test/perf/framework.hpp           |  64 +-----
 test/unit/compile_tests.cu        |   2 +-
 test/unit/core_tests.cc           |   2 +-
 test/unit/errors_tests.cc         |   2 +-
 test/unit/fifo_tests.cu           |   2 +-
 test/unit/gpu_utils_tests.cc      |   2 +-
 test/unit/local_channel_tests.cu  |   2 +-
 test/unit/numa_tests.cc           |   2 +-
 test/unit/socket_tests.cc         |   2 +-
 test/unit/utils_internal_tests.cc |   2 +-
 test/unit/utils_tests.cc          |   2 +-
 14 files changed, 625 insertions(+), 203 deletions(-)
 create mode 100644 test/framework.cc
 create mode 100644 test/framework.hpp
diff --git a/test/framework.cc b/test/framework.cc
new file mode 100644
index 000000000..5fd096f12
--- /dev/null
+++ b/test/framework.cc
@@ -0,0 +1,253 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "framework.hpp"
+
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+namespace mscclpp {
+namespace test {
+
+// Global state
+static int g_mpi_rank = 0;
+static int g_mpi_size = 1;
+static bool g_mpi_initialized = false;
+static bool g_current_test_passed = true;
+static std::string g_current_test_failure_message;
+
+namespace utils {
+
+// Internal MPI helper functions (not exposed in header)
+void initializeMPI(int argc, char* argv[]) {
+  if (g_mpi_initialized) return;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &g_mpi_rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &g_mpi_size);
+  g_mpi_initialized = true;
+}
+
+static void finalizeMPI() {
+  if (!g_mpi_initialized) return;
+
+  MPI_Finalize();
+  g_mpi_initialized = false;
+}
+
+static int getMPIRank() { return g_mpi_rank; }
+
+static int getMPISize() { return g_mpi_size; }
+
+static bool isMainProcess() { return g_mpi_rank == 0; }
+
+// Public utility functions for test output
+bool isMainRank() { return g_mpi_rank == 0; }
+
+int getMPIRank() { return g_mpi_rank; }
+
+int getMPISize() { return g_mpi_size; }
+
+void cleanupMPI() { finalizeMPI(); }
+
+void reportFailure(const char* file, int line, const std::string& message) {
+  g_current_test_passed = false;
+  std::ostringstream oss;
+  oss << file << ":" << line << ": " << message;
+  if (!g_current_test_failure_message.empty()) {
+    g_current_test_failure_message += "\n";
+  }
+  g_current_test_failure_message += oss.str();
+  std::cerr << oss.str() << std::endl;
+}
+
+void reportSuccess() {
+  g_current_test_passed = true;
+  g_current_test_failure_message.clear();
+}
+
+// Timer implementation
+Timer::Timer() : is_running_(false) {}
+
+void Timer::start() {
+  start_time_ = std::chrono::high_resolution_clock::now();
+  is_running_ = true;
+}
+
+void Timer::stop() {
+  end_time_ = std::chrono::high_resolution_clock::now();
+  is_running_ = false;
+}
+
+double Timer::elapsedMicroseconds() const {
+  if (is_running_) {
+    auto now = std::chrono::high_resolution_clock::now();
+    return std::chrono::duration_cast<std::chrono::microseconds>(now - start_time_).count();
+  }
+  return std::chrono::duration_cast<std::chrono::microseconds>(end_time_ - start_time_).count();
+}
+
+double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; }
+
+double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1000000.0; }
+
+void cudaCheck(cudaError_t err, const char* file, int line) {
+  if (err != cudaSuccess) {
+    std::string msg =
+        std::string("CUDA error at ") + file + ":" + std::to_string(line) + " - " + cudaGetErrorString(err);
+    throw std::runtime_error(msg);
+  }
+}
+
+int runMultipleTests(
+    int argc, char* argv[],
+    const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests) {
+  int totalResult = 0;
+
+  // Initialize MPI once for all tests
+  initializeMPI(argc, argv);
+
+  try {
+    // Get MPI information
+    int rank = getMPIRank();
+    int size = getMPISize();
+    int local_rank = rank;  // For simplicity, assume local_rank = rank
+
+    for (const auto& test : tests) {
+      const std::string& testName = std::get<0>(test);
+      const std::string& testDescription = std::get<1>(test);
+      const std::function<void(int, int, int)>& testFunction = std::get<2>(test);
+
+      if (rank == 0) {
+        std::cout << "Running test: " << testName << std::endl;
+        if (!testDescription.empty()) {
+          std::cout << "  " << testDescription << std::endl;
+        }
+      }
+
+      // Don't clear results - accumulate them for all tests in the same file
+      // g_results.clear();  // Commented out to accumulate results
+
+      try {
+        // Run the individual test function with MPI information
+        testFunction(rank, size, local_rank);
+
+        // Synchronize before moving to next test
+        MPI_Barrier(MPI_COMM_WORLD);
+
+      } catch (const std::exception& e) {
+        if (rank == 0) {
+          std::cerr << "Error in test " << testName << ": " << e.what() << std::endl;
+        }
+        totalResult = 1;
+      }
+    }
+
+    // Don't cleanup MPI here - let the caller handle it
+    // finalizeMPI();
+
+  } catch (const std::exception& e) {
+    if (g_mpi_rank == 0) {
+      std::cerr << "Error: " << e.what() << std::endl;
+    }
+    finalizeMPI();
+    return 1;
+  }
+
+  return totalResult;
+}
+
+}  // namespace utils
+
+// TestRegistry implementation
+TestRegistry& TestRegistry::instance() {
+  static TestRegistry registry;
+  return registry;
+}
+
+void TestRegistry::registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory) {
+  TestInfo info;
+  info.suite_name = test_suite;
+  info.test_name = test_name;
+  info.factory = factory;
+  tests_.push_back(info);
+}
+
+int TestRegistry::runAllTests(int argc, char* argv[]) {
+  // Initialize MPI if not already initialized
+  if (!g_mpi_initialized) {
+    utils::initializeMPI(argc, argv);
+  }
+
+  int passed = 0;
+  int failed = 0;
+
+  if (g_mpi_rank == 0) {
+    std::cout << "[==========] Running " << tests_.size() << " tests.\n";
+  }
+
+  for (const auto& test_info : tests_) {
+    g_current_test_passed = true;
+    g_current_test_failure_message.clear();
+
+    if (g_mpi_rank == 0) {
+      std::cout << "[ RUN      ] " << test_info.suite_name << "." << test_info.test_name << std::endl;
+    }
+
+    TestCase* test_case = nullptr;
+    try {
+      test_case = test_info.factory();
+      test_case->SetUp();
+      test_case->TestBody();
+      test_case->TearDown();
+    } catch (const std::exception& e) {
+      g_current_test_passed = false;
+      if (g_current_test_failure_message.empty()) {
+        g_current_test_failure_message = e.what();
+      }
+    } catch (...) {
+      g_current_test_passed = false;
+      if (g_current_test_failure_message.empty()) {
+        g_current_test_failure_message = "Unknown exception";
+      }
+    }
+
+    delete test_case;
+
+    // Synchronize test status across all MPI processes
+    int local_passed = g_current_test_passed ? 1 : 0;
+    int global_passed = 1;
+    if (g_mpi_initialized) {
+      MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    } else {
+      global_passed = local_passed;
+    }
+
+    if (g_mpi_rank == 0) {
+      if (global_passed) {
+        std::cout << "[       OK ] " << test_info.suite_name << "." << test_info.test_name << std::endl;
+        passed++;
+      } else {
+        std::cout << "[  FAILED  ] " << test_info.suite_name << "." << test_info.test_name << std::endl;
+        failed++;
+      }
+    }
+  }
+
+  if (g_mpi_rank == 0) {
+    std::cout << "[==========] " << tests_.size() << " tests ran.\n";
+    if (passed > 0) {
+      std::cout << "[  PASSED  ] " << passed << " tests.\n";
+    }
+    if (failed > 0) {
+      std::cout << "[  FAILED  ] " << failed << " tests.\n";
+    }
+  }
+
+  return failed > 0 ? 1 : 0;
+}
+
+}  // namespace test
+}  // namespace mscclpp
diff --git a/test/framework.hpp b/test/framework.hpp
new file mode 100644
index 000000000..6d510382c
--- /dev/null
+++ b/test/framework.hpp
@@ -0,0 +1,336 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_TEST_FRAMEWORK_HPP_
+#define MSCCLPP_TEST_FRAMEWORK_HPP_
+
+#include <mpi.h>
+
+#include <chrono>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <mscclpp/gpu.hpp>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <vector>
+
+namespace mscclpp {
+namespace test {
+
+// Test result structure
+struct TestResult {
+  std::string test_name;
+  std::string test_category;
+  std::map<std::string, std::string> test_params;
+  int num_processes;
+  int process_rank;
+  std::string timestamp;
+  bool passed;
+  std::string failure_message;
+};
+
+// Test case base class
+class TestCase {
+ public:
+  virtual ~TestCase() = default;
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+  virtual void TestBody() = 0;
+};
+
+// Test registry and runner
+class TestRegistry {
+ public:
+  using TestFactory = std::function<TestCase*()>;
+  
+  static TestRegistry& instance();
+  
+  void registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory);
+  int runAllTests(int argc, char* argv[]);
+  
+ private:
+  TestRegistry() = default;
+  struct TestInfo {
+    std::string suite_name;
+    std::string test_name;
+    TestFactory factory;
+  };
+  std::vector<TestInfo> tests_;
+};
+
+// Simple utility functions for testing
+namespace utils {
+
+// Test execution utilities (for performance tests)
+int runMultipleTests(
+    int argc, char* argv[],
+    const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests);
+
+// MPI management
+void initializeMPI(int argc, char* argv[]);
+void cleanupMPI();
+bool isMainRank();
+int getMPIRank();
+int getMPISize();
+
+// Timing utilities
+class Timer {
+ public:
+  Timer();
+  void start();
+  void stop();
+  double elapsedMicroseconds() const;
+  double elapsedMilliseconds() const;
+  double elapsedSeconds() const;
+
+ private:
+  std::chrono::high_resolution_clock::time_point start_time_;
+  std::chrono::high_resolution_clock::time_point end_time_;
+  bool is_running_;
+};
+
+// CUDA utilities
+void cudaCheck(cudaError_t err, const char* file, int line);
+#define CUDA_CHECK(call) mscclpp::test::utils::cudaCheck(call, __FILE__, __LINE__)
+
+// Test assertion helpers
+void reportFailure(const char* file, int line, const std::string& message);
+void reportSuccess();
+
+}  // namespace utils
+
+}  // namespace test
+}  // namespace mscclpp
+
+// Test registration macros
+#define TEST(test_suite, test_name)                                                    \
+  class test_suite##_##test_name##_Test : public ::mscclpp::test::TestCase {          \
+   public:                                                                             \
+    test_suite##_##test_name##_Test() {}                                              \
+    void TestBody() override;                                                          \
+  };                                                                                   \
+  static bool test_suite##_##test_name##_registered = []() {                          \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                           \
+        #test_suite, #test_name,                                                       \
+        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }); \
+    return true;                                                                       \
+  }();                                                                                 \
+  void test_suite##_##test_name##_Test::TestBody()
+
+#define TEST_F(test_fixture, test_name)                                                \
+  class test_fixture##_##test_name##_Test : public test_fixture {                     \
+   public:                                                                             \
+    test_fixture##_##test_name##_Test() {}                                            \
+    void TestBody() override;                                                          \
+  };                                                                                   \
+  static bool test_fixture##_##test_name##_registered = []() {                        \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                           \
+        #test_fixture, #test_name,                                                     \
+        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }); \
+    return true;                                                                       \
+  }();                                                                                 \
+  void test_fixture##_##test_name##_Test::TestBody()
+
+// Test runner macro
+#define RUN_ALL_TESTS() ::mscclpp::test::TestRegistry::instance().runAllTests(argc, argv)
+
+// Assertion macros
+#define EXPECT_TRUE(condition)                                                         \
+  do {                                                                                 \
+    if (!(condition)) {                                                                \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
+                                             "Expected: " #condition " to be true");   \
+    }                                                                                  \
+  } while (0)
+
+#define EXPECT_FALSE(condition)                                                        \
+  do {                                                                                 \
+    if (condition) {                                                                   \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
+                                             "Expected: " #condition " to be false");  \
+    }                                                                                  \
+  } while (0)
+
+#define EXPECT_EQ(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 == v2)) {                                                                 \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " == " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+    }                                                                                  \
+  } while (0)
+
+#define EXPECT_NE(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 != v2)) {                                                                 \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " != " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+    }                                                                                  \
+  } while (0)
+
+#define EXPECT_LT(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 < v2)) {                                                                  \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " < " #val2 << "\n  Actual: " << v1 << " vs " << v2;  \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+    }                                                                                  \
+  } while (0)
+
+#define EXPECT_LE(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 <= v2)) {                                                                 \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " <= " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+    }                                                                                  \
+  } while (0)
+
+#define EXPECT_GT(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 > v2)) {                                                                  \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " > " #val2 << "\n  Actual: " << v1 << " vs " << v2;  \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+    }                                                                                  \
+  } while (0)
+
+#define EXPECT_GE(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 >= v2)) {                                                                 \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " >= " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_TRUE(condition)                                                         \
+  do {                                                                                 \
+    if (!(condition)) {                                                                \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
+                                             "Expected: " #condition " to be true");   \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_FALSE(condition)                                                        \
+  do {                                                                                 \
+    if (condition) {                                                                   \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
+                                             "Expected: " #condition " to be false");  \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_EQ(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 == v2)) {                                                                 \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " == " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_NE(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 != v2)) {                                                                 \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " != " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_LT(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 < v2)) {                                                                  \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " < " #val2 << "\n  Actual: " << v1 << " vs " << v2;  \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_LE(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 <= v2)) {                                                                 \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " <= " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_GT(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 > v2)) {                                                                  \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " > " #val2 << "\n  Actual: " << v1 << " vs " << v2;  \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_GE(val1, val2)                                                          \
+  do {                                                                                 \
+    auto v1 = (val1);                                                                  \
+    auto v2 = (val2);                                                                  \
+    if (!(v1 >= v2)) {                                                                 \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #val1 " >= " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define ASSERT_NO_THROW(statement)                                                     \
+  do {                                                                                 \
+    try {                                                                              \
+      statement;                                                                       \
+    } catch (const std::exception& e) {                                                \
+      std::ostringstream oss;                                                          \
+      oss << "Expected: " #statement " not to throw\n  Actual: threw " << e.what();   \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
+      throw std::runtime_error("Test assertion failed");                              \
+    } catch (...) {                                                                    \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
+                                             "Expected: " #statement " not to throw\n  Actual: threw unknown exception"); \
+      throw std::runtime_error("Test assertion failed");                              \
+    }                                                                                  \
+  } while (0)
+
+#define FAIL()                                                                         \
+  do {                                                                                 \
+    ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Test failed");         \
+    throw std::runtime_error("Test failed");                                          \
+  } while (0)
+
+#endif  // MSCCLPP_TEST_FRAMEWORK_HPP_
diff --git a/test/perf/framework.cc b/test/perf/framework.cc
index 85f7abd81..600257d16 100644
--- a/test/perf/framework.cc
+++ b/test/perf/framework.cc
@@ -11,43 +11,18 @@
 namespace mscclpp {
 namespace test {
 
-// Global state for results
-static std::vector<TestResult> g_results;
-static int g_mpi_rank = 0;
-static int g_mpi_size = 1;
-static bool g_mpi_initialized = false;
-
-namespace utils {
-
-// Internal MPI helper functions (not exposed in header)
-void initializeMPI(int argc, char* argv[]) {
-  if (g_mpi_initialized) return;
-
-  MPI_Init(&argc, &argv);
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_mpi_rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &g_mpi_size);
-  g_mpi_initialized = true;
-}
-
-static void finalizeMPI() {
-  if (!g_mpi_initialized) return;
-
-  MPI_Finalize();
-  g_mpi_initialized = false;
-}
-
-static int getMPIRank() { return g_mpi_rank; }
-
-static int getMPISize() { return g_mpi_size; }
-
-static bool isMainProcess() { return g_mpi_rank == 0; }
-
-// Public utility functions for test output
-bool isMainRank() { return g_mpi_rank == 0; }
-
-void cleanupMPI() { finalizeMPI(); }
-
-std::string getCurrentTimestamp() {
+// Global state for performance test results
+static std::vector<struct PerfTestResult {
+  std::string test_name;
+  std::string test_category;
+  std::map<std::string, std::string> test_params;
+  nlohmann::ordered_json metrics;
+  int num_processes;
+  int process_rank;
+  std::string timestamp;
+}> g_perf_results;
+
+static std::string getCurrentTimestamp() {
   auto now = std::chrono::system_clock::now();
   auto time_t = std::chrono::system_clock::to_time_t(now);
   std::stringstream ss;
@@ -57,16 +32,16 @@ std::string getCurrentTimestamp() {
 
 void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
                   const std::map<std::string, std::string>& test_params) {
-  TestResult result;
+  PerfTestResult result;
   result.test_name = test_name;
   result.test_category = test_category;
   result.test_params = test_params;
   result.metrics = metrics;
-  result.num_processes = g_mpi_size;
-  result.process_rank = g_mpi_rank;
+  result.num_processes = utils::getMPISize();
+  result.process_rank = utils::getMPIRank();
   result.timestamp = getCurrentTimestamp();
 
-  g_results.push_back(result);
+  g_perf_results.push_back(result);
 }
 
 void writeResultsToFile(const std::string& filename) {
@@ -75,7 +50,7 @@ void writeResultsToFile(const std::string& filename) {
     throw std::runtime_error("Cannot open output file: " + filename);
   }
 
-  for (const auto& result : g_results) {
+  for (const auto& result : g_perf_results) {
     nlohmann::ordered_json j;
     j["test_name"] = result.test_name;
     j["test_category"] = result.test_category;
@@ -90,11 +65,11 @@ void writeResultsToFile(const std::string& filename) {
 }
 
 void printResults(bool verbose) {
-  if (!isMainProcess()) return;
+  if (!utils::isMainRank()) return;
 
   std::cout << "\n=== Test Results ===" << std::endl;
 
-  for (const auto& result : g_results) {
+  for (const auto& result : g_perf_results) {
     std::cout << "\nTest: " << result.test_name << " (" << result.test_category << ")" << std::endl;
 
     if (verbose && !result.test_params.empty()) {
@@ -112,97 +87,5 @@ void printResults(bool verbose) {
   std::cout << std::endl;
 }
 
-// Timer implementation
-Timer::Timer() : is_running_(false) {}
-
-void Timer::start() {
-  start_time_ = std::chrono::high_resolution_clock::now();
-  is_running_ = true;
-}
-
-void Timer::stop() {
-  end_time_ = std::chrono::high_resolution_clock::now();
-  is_running_ = false;
-}
-
-double Timer::elapsedMicroseconds() const {
-  if (is_running_) {
-    auto now = std::chrono::high_resolution_clock::now();
-    return std::chrono::duration_cast<std::chrono::microseconds>(now - start_time_).count();
-  }
-  return std::chrono::duration_cast<std::chrono::microseconds>(end_time_ - start_time_).count();
-}
-
-double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; }
-
-double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1000000.0; }
-
-void cudaCheck(cudaError_t err, const char* file, int line) {
-  if (err != cudaSuccess) {
-    std::string msg =
-        std::string("CUDA error at ") + file + ":" + std::to_string(line) + " - " + cudaGetErrorString(err);
-    throw std::runtime_error(msg);
-  }
-}
-
-int runMultipleTests(
-    int argc, char* argv[],
-    const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests) {
-  int totalResult = 0;
-
-  // Initialize MPI once for all tests
-  initializeMPI(argc, argv);
-
-  try {
-    // Get MPI information
-    int rank = getMPIRank();
-    int size = getMPISize();
-    int local_rank = rank;  // For simplicity, assume local_rank = rank
-
-    for (const auto& test : tests) {
-      const std::string& testName = std::get<0>(test);
-      const std::string& testDescription = std::get<1>(test);
-      const std::function<void(int, int, int)>& testFunction = std::get<2>(test);
-
-      if (rank == 0) {
-        std::cout << "Running test: " << testName << std::endl;
-        if (!testDescription.empty()) {
-          std::cout << "  " << testDescription << std::endl;
-        }
-      }
-
-      // Don't clear results - accumulate them for all tests in the same file
-      // g_results.clear();  // Commented out to accumulate results
-
-      try {
-        // Run the individual test function with MPI information
-        testFunction(rank, size, local_rank);
-
-        // Synchronize before moving to next test
-        MPI_Barrier(MPI_COMM_WORLD);
-
-      } catch (const std::exception& e) {
-        if (rank == 0) {
-          std::cerr << "Error in test " << testName << ": " << e.what() << std::endl;
-        }
-        totalResult = 1;
-      }
-    }
-
-    // Don't cleanup MPI here - let the caller handle it
-    // finalizeMPI();
-
-  } catch (const std::exception& e) {
-    if (g_mpi_rank == 0) {
-      std::cerr << "Error: " << e.what() << std::endl;
-    }
-    finalizeMPI();
-    return 1;
-  }
-
-  return totalResult;
-}
-
-}  // namespace utils
 }  // namespace test
 }  // namespace mscclpp
diff --git a/test/perf/framework.hpp b/test/perf/framework.hpp
index e9b8c31f5..fe49be911 100644
--- a/test/perf/framework.hpp
+++ b/test/perf/framework.hpp
@@ -4,75 +4,25 @@
 #ifndef MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
 #define MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
 
-#include <mpi.h>
+// This file is kept for backwards compatibility with perf tests
+// The actual framework is now in test/framework.hpp
+
+#include "../framework.hpp"
 
-#include <chrono>
-#include <fstream>
-#include <functional>
-#include <map>
-#include <mscclpp/gpu.hpp>
 #include <nlohmann/json.hpp>
-#include <string>
-#include <tuple>
-#include <vector>
 
 namespace mscclpp {
 namespace test {
 
-// Test result structure
-struct TestResult {
-  std::string test_name;
-  std::string test_category;
-  std::map<std::string, std::string> test_params;
-  nlohmann::ordered_json metrics;
-  int num_processes;
-  int process_rank;
-  std::string timestamp;
-};
-
-// Simple utility functions for testing
-namespace utils {
-
-// Test execution utilities
-int runMultipleTests(
-    int argc, char* argv[],
-    const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests);
-
-// MPI management
-void initializeMPI(int argc, char* argv[]);
-void cleanupMPI();
-bool isMainRank();
+// Additional performance test utilities not in the base framework
 
-// Result recording
+// Result recording for performance tests
 void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
                   const std::map<std::string, std::string>& test_params = {});
 
-// Output utilities
+// Output utilities for performance tests
 void writeResultsToFile(const std::string& filename);
 void printResults(bool verbose = false);
-void cleanupMPI();
-
-// Timing utilities
-class Timer {
- public:
-  Timer();
-  void start();
-  void stop();
-  double elapsedMicroseconds() const;
-  double elapsedMilliseconds() const;
-  double elapsedSeconds() const;
-
- private:
-  std::chrono::high_resolution_clock::time_point start_time_;
-  std::chrono::high_resolution_clock::time_point end_time_;
-  bool is_running_;
-};
-
-// CUDA utilities
-void cudaCheck(cudaError_t err, const char* file, int line);
-#define CUDA_CHECK(call) cudaCheck(call, __FILE__, __LINE__)
-
-}  // namespace utils
 
 }  // namespace test
 }  // namespace mscclpp
diff --git a/test/unit/compile_tests.cu b/test/unit/compile_tests.cu
index 9db91a4f4..18046a1f8 100644
--- a/test/unit/compile_tests.cu
+++ b/test/unit/compile_tests.cu
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #undef NDEBUG
 #ifndef DEBUG_BUILD
diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc
index 32e6a1b57..1c8ee886e 100644
--- a/test/unit/core_tests.cc
+++ b/test/unit/core_tests.cc
@@ -2,7 +2,7 @@
 // Licensed under the MIT license.
 
 #include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <mscclpp/core.hpp>
 
diff --git a/test/unit/errors_tests.cc b/test/unit/errors_tests.cc
index f9faad199..8d6283d90 100644
--- a/test/unit/errors_tests.cc
+++ b/test/unit/errors_tests.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <mscclpp/errors.hpp>
 
diff --git a/test/unit/fifo_tests.cu b/test/unit/fifo_tests.cu
index b67a220d1..a0cf5447c 100644
--- a/test/unit/fifo_tests.cu
+++ b/test/unit/fifo_tests.cu
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <mscclpp/fifo.hpp>
 #include <mscclpp/gpu_utils.hpp>
diff --git a/test/unit/gpu_utils_tests.cc b/test/unit/gpu_utils_tests.cc
index f4aba0d75..dc4027a17 100644
--- a/test/unit/gpu_utils_tests.cc
+++ b/test/unit/gpu_utils_tests.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <mscclpp/gpu_utils.hpp>
 
diff --git a/test/unit/local_channel_tests.cu b/test/unit/local_channel_tests.cu
index 50ffc9ea5..d7cd4c658 100644
--- a/test/unit/local_channel_tests.cu
+++ b/test/unit/local_channel_tests.cu
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
diff --git a/test/unit/numa_tests.cc b/test/unit/numa_tests.cc
index dfa63a74a..31ba373cb 100644
--- a/test/unit/numa_tests.cc
+++ b/test/unit/numa_tests.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
diff --git a/test/unit/socket_tests.cc b/test/unit/socket_tests.cc
index 1ab592bae..cfd5bd4fd 100644
--- a/test/unit/socket_tests.cc
+++ b/test/unit/socket_tests.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <mscclpp/utils.hpp>
 #include <thread>
diff --git a/test/unit/utils_internal_tests.cc b/test/unit/utils_internal_tests.cc
index 5479a681a..73b03833d 100644
--- a/test/unit/utils_internal_tests.cc
+++ b/test/unit/utils_internal_tests.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <thread>
 
diff --git a/test/unit/utils_tests.cc b/test/unit/utils_tests.cc
index fa079b306..ae77892d2 100644
--- a/test/unit/utils_tests.cc
+++ b/test/unit/utils_tests.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gtest/gtest.h>
+#include "../framework.hpp"
 
 #include <mscclpp/errors.hpp>
 #include <mscclpp/utils.hpp>

From e227fdc1ef5777441c0ef2c8485a10eeb3cff32f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 00:21:04 +0000
Subject: [PATCH 003/132] Convert mp_unit tests from gtest to framework.hpp

- Modified test/mp_unit/mp_unit_tests.hpp to use ../framework.hpp instead of gtest/gtest.h
- Enhanced test/framework.hpp with GTest-compatible APIs:
  - Added Environment base class for global test setup/teardown
  - Added TestInfo and UnitTest classes for test metadata access
  - Added GTEST_SKIP macro support via SkipHelper class
  - Added namespace alias 'testing' for compatibility
  - Added InitGoogleTest and AddGlobalTestEnvironment helper functions
- Updated test/framework.cc with implementations for new classes
- All mp_unit test files now use framework.hpp through mp_unit_tests.hpp
- Formatting applied via lint.sh

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/executor_test.cc             |   7 +-
 test/framework.cc                 |  45 +++-
 test/framework.hpp                | 409 ++++++++++++++++++------------
 test/mp_unit/mp_unit_tests.hpp    |   3 +-
 test/perf/framework.cc            |   4 +-
 test/perf/framework.hpp           |   4 +-
 test/unit/core_tests.cc           |   3 +-
 test/unit/errors_tests.cc         |   4 +-
 test/unit/fifo_tests.cu           |   3 +-
 test/unit/gpu_utils_tests.cc      |   4 +-
 test/unit/local_channel_tests.cu  |   4 +-
 test/unit/numa_tests.cc           |   4 +-
 test/unit/socket_tests.cc         |   3 +-
 test/unit/utils_internal_tests.cc |   3 +-
 test/unit/utils_tests.cc          |   4 +-
 15 files changed, 310 insertions(+), 194 deletions(-)

diff --git a/test/executor_test.cc b/test/executor_test.cc
index 0e7869aba..cc7456590 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -93,11 +93,8 @@ double benchTime(int rank, std::shared_ptr<mscclpp::Bootstrap> bootstrap, std::s
 
 int main(int argc, char* argv[]) {
   if (argc != 5 && argc != 6) {
-    std::cerr << "Usage: " << argv[0] << " <buffer size>"
-              << " <execution plan path>"
-              << " <number of iterations>"
-              << " <number of graph iterations>"
-              << " (optional) <packet type>" << std::endl;
+    std::cerr << "Usage: " << argv[0] << " <buffer size>" << " <execution plan path>" << " <number of iterations>"
+              << " <number of graph iterations>" << " (optional) <packet type>" << std::endl;
     return 1;
   }
 
diff --git a/test/framework.cc b/test/framework.cc
index 5fd096f12..fc339b764 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -161,6 +161,12 @@ int runMultipleTests(
 
 }  // namespace utils
 
+// UnitTest implementation
+UnitTest* UnitTest::GetInstance() {
+  static UnitTest instance;
+  return &instance;
+}
+
 // TestRegistry implementation
 TestRegistry& TestRegistry::instance() {
   static TestRegistry registry;
@@ -168,19 +174,38 @@ TestRegistry& TestRegistry::instance() {
 }
 
 void TestRegistry::registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory) {
-  TestInfo info;
+  TestInfoInternal info;
   info.suite_name = test_suite;
   info.test_name = test_name;
   info.factory = factory;
   tests_.push_back(info);
 }
 
+void TestRegistry::addGlobalTestEnvironment(Environment* env) { environments_.push_back(env); }
+
+void TestRegistry::initGoogleTest(int* argc, char** argv) {
+  // Parse command-line arguments if needed
+  // For now, this is a no-op placeholder for compatibility
+}
+
 int TestRegistry::runAllTests(int argc, char* argv[]) {
   // Initialize MPI if not already initialized
   if (!g_mpi_initialized) {
     utils::initializeMPI(argc, argv);
   }
 
+  // Set up global test environments
+  for (auto* env : environments_) {
+    try {
+      env->SetUp();
+    } catch (const std::exception& e) {
+      if (g_mpi_rank == 0) {
+        std::cerr << "Failed to set up test environment: " << e.what() << std::endl;
+      }
+      return 1;
+    }
+  }
+
   int passed = 0;
   int failed = 0;
 
@@ -196,6 +221,10 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       std::cout << "[ RUN      ] " << test_info.suite_name << "." << test_info.test_name << std::endl;
     }
 
+    // Set current test info for UnitTest::GetInstance()->current_test_info()
+    TestInfo current_info(test_info.suite_name, test_info.test_name);
+    UnitTest::GetInstance()->set_current_test_info(&current_info);
+
     TestCase* test_case = nullptr;
     try {
       test_case = test_info.factory();
@@ -216,6 +245,9 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
 
     delete test_case;
 
+    // Clear current test info
+    UnitTest::GetInstance()->set_current_test_info(nullptr);
+
     // Synchronize test status across all MPI processes
     int local_passed = g_current_test_passed ? 1 : 0;
     int global_passed = 1;
@@ -246,6 +278,17 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     }
   }
 
+  // Tear down global test environments (in reverse order)
+  for (auto it = environments_.rbegin(); it != environments_.rend(); ++it) {
+    try {
+      (*it)->TearDown();
+    } catch (const std::exception& e) {
+      if (g_mpi_rank == 0) {
+        std::cerr << "Failed to tear down test environment: " << e.what() << std::endl;
+      }
+    }
+  }
+
   return failed > 0 ? 1 : 0;
 }
 
diff --git a/test/framework.hpp b/test/framework.hpp
index 6d510382c..1ef9aaeae 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -33,6 +33,12 @@ struct TestResult {
   std::string failure_message;
 };
 
+// Forward declarations
+class Environment;
+class TestCase;
+class TestInfo;
+class UnitTest;
+
 // Test case base class
 class TestCase {
  public:
@@ -42,24 +48,61 @@ class TestCase {
   virtual void TestBody() = 0;
 };
 
+// Environment base class (for global test setup/teardown)
+class Environment {
+ public:
+  virtual ~Environment() = default;
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+};
+
+// Test info class (for getting current test information)
+class TestInfo {
+ public:
+  TestInfo(const std::string& suite, const std::string& name) : test_suite_name_(suite), test_name_(name) {}
+
+  const char* test_suite_name() const { return test_suite_name_.c_str(); }
+  const char* name() const { return test_name_.c_str(); }
+
+ private:
+  std::string test_suite_name_;
+  std::string test_name_;
+};
+
+// UnitTest singleton (for getting test information)
+class UnitTest {
+ public:
+  static UnitTest* GetInstance();
+
+  const TestInfo* current_test_info() const { return current_test_info_; }
+  void set_current_test_info(const TestInfo* info) { current_test_info_ = info; }
+
+ private:
+  UnitTest() = default;
+  const TestInfo* current_test_info_ = nullptr;
+};
+
 // Test registry and runner
 class TestRegistry {
  public:
   using TestFactory = std::function<TestCase*()>;
-  
+
   static TestRegistry& instance();
-  
+
   void registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory);
+  void addGlobalTestEnvironment(Environment* env);
   int runAllTests(int argc, char* argv[]);
-  
+  void initGoogleTest(int* argc, char** argv);
+
  private:
   TestRegistry() = default;
-  struct TestInfo {
+  struct TestInfoInternal {
     std::string suite_name;
     std::string test_name;
     TestFactory factory;
   };
-  std::vector<TestInfo> tests_;
+  std::vector<TestInfoInternal> tests_;
+  std::vector<Environment*> environments_;
 };
 
 // Simple utility functions for testing
@@ -107,230 +150,266 @@ void reportSuccess();
 }  // namespace mscclpp
 
 // Test registration macros
-#define TEST(test_suite, test_name)                                                    \
-  class test_suite##_##test_name##_Test : public ::mscclpp::test::TestCase {          \
-   public:                                                                             \
-    test_suite##_##test_name##_Test() {}                                              \
-    void TestBody() override;                                                          \
-  };                                                                                   \
-  static bool test_suite##_##test_name##_registered = []() {                          \
-    ::mscclpp::test::TestRegistry::instance().registerTest(                           \
-        #test_suite, #test_name,                                                       \
+#define TEST(test_suite, test_name)                                                            \
+  class test_suite##_##test_name##_Test : public ::mscclpp::test::TestCase {                   \
+   public:                                                                                     \
+    test_suite##_##test_name##_Test() {}                                                       \
+    void TestBody() override;                                                                  \
+  };                                                                                           \
+  static bool test_suite##_##test_name##_registered = []() {                                   \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                                    \
+        #test_suite, #test_name,                                                               \
         []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }); \
-    return true;                                                                       \
-  }();                                                                                 \
+    return true;                                                                               \
+  }();                                                                                         \
   void test_suite##_##test_name##_Test::TestBody()
 
-#define TEST_F(test_fixture, test_name)                                                \
-  class test_fixture##_##test_name##_Test : public test_fixture {                     \
-   public:                                                                             \
-    test_fixture##_##test_name##_Test() {}                                            \
-    void TestBody() override;                                                          \
-  };                                                                                   \
-  static bool test_fixture##_##test_name##_registered = []() {                        \
-    ::mscclpp::test::TestRegistry::instance().registerTest(                           \
-        #test_fixture, #test_name,                                                     \
+#define TEST_F(test_fixture, test_name)                                                          \
+  class test_fixture##_##test_name##_Test : public test_fixture {                                \
+   public:                                                                                       \
+    test_fixture##_##test_name##_Test() {}                                                       \
+    void TestBody() override;                                                                    \
+  };                                                                                             \
+  static bool test_fixture##_##test_name##_registered = []() {                                   \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                                      \
+        #test_fixture, #test_name,                                                               \
         []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }); \
-    return true;                                                                       \
-  }();                                                                                 \
+    return true;                                                                                 \
+  }();                                                                                           \
   void test_fixture##_##test_name##_Test::TestBody()
 
 // Test runner macro
 #define RUN_ALL_TESTS() ::mscclpp::test::TestRegistry::instance().runAllTests(argc, argv)
 
 // Assertion macros
-#define EXPECT_TRUE(condition)                                                         \
-  do {                                                                                 \
-    if (!(condition)) {                                                                \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
-                                             "Expected: " #condition " to be true");   \
-    }                                                                                  \
+#define EXPECT_TRUE(condition)                                                                          \
+  do {                                                                                                  \
+    if (!(condition)) {                                                                                 \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be true"); \
+    }                                                                                                   \
   } while (0)
 
-#define EXPECT_FALSE(condition)                                                        \
-  do {                                                                                 \
-    if (condition) {                                                                   \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
-                                             "Expected: " #condition " to be false");  \
-    }                                                                                  \
+#define EXPECT_FALSE(condition)                                                                          \
+  do {                                                                                                   \
+    if (condition) {                                                                                     \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be false"); \
+    }                                                                                                    \
   } while (0)
 
-#define EXPECT_EQ(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 == v2)) {                                                                 \
-      std::ostringstream oss;                                                          \
+#define EXPECT_EQ(val1, val2)                                                         \
+  do {                                                                                \
+    auto v1 = (val1);                                                                 \
+    auto v2 = (val2);                                                                 \
+    if (!(v1 == v2)) {                                                                \
+      std::ostringstream oss;                                                         \
       oss << "Expected: " #val1 " == " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
       ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-    }                                                                                  \
+    }                                                                                 \
   } while (0)
 
-#define EXPECT_NE(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 != v2)) {                                                                 \
-      std::ostringstream oss;                                                          \
+#define EXPECT_NE(val1, val2)                                                         \
+  do {                                                                                \
+    auto v1 = (val1);                                                                 \
+    auto v2 = (val2);                                                                 \
+    if (!(v1 != v2)) {                                                                \
+      std::ostringstream oss;                                                         \
       oss << "Expected: " #val1 " != " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
       ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-    }                                                                                  \
+    }                                                                                 \
   } while (0)
 
-#define EXPECT_LT(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 < v2)) {                                                                  \
-      std::ostringstream oss;                                                          \
-      oss << "Expected: " #val1 " < " #val2 << "\n  Actual: " << v1 << " vs " << v2;  \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-    }                                                                                  \
+#define EXPECT_LT(val1, val2)                                                        \
+  do {                                                                               \
+    auto v1 = (val1);                                                                \
+    auto v2 = (val2);                                                                \
+    if (!(v1 < v2)) {                                                                \
+      std::ostringstream oss;                                                        \
+      oss << "Expected: " #val1 " < " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());          \
+    }                                                                                \
   } while (0)
 
-#define EXPECT_LE(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 <= v2)) {                                                                 \
-      std::ostringstream oss;                                                          \
+#define EXPECT_LE(val1, val2)                                                         \
+  do {                                                                                \
+    auto v1 = (val1);                                                                 \
+    auto v2 = (val2);                                                                 \
+    if (!(v1 <= v2)) {                                                                \
+      std::ostringstream oss;                                                         \
       oss << "Expected: " #val1 " <= " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
       ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-    }                                                                                  \
+    }                                                                                 \
   } while (0)
 
-#define EXPECT_GT(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 > v2)) {                                                                  \
-      std::ostringstream oss;                                                          \
-      oss << "Expected: " #val1 " > " #val2 << "\n  Actual: " << v1 << " vs " << v2;  \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-    }                                                                                  \
+#define EXPECT_GT(val1, val2)                                                        \
+  do {                                                                               \
+    auto v1 = (val1);                                                                \
+    auto v2 = (val2);                                                                \
+    if (!(v1 > v2)) {                                                                \
+      std::ostringstream oss;                                                        \
+      oss << "Expected: " #val1 " > " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());          \
+    }                                                                                \
   } while (0)
 
-#define EXPECT_GE(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 >= v2)) {                                                                 \
-      std::ostringstream oss;                                                          \
+#define EXPECT_GE(val1, val2)                                                         \
+  do {                                                                                \
+    auto v1 = (val1);                                                                 \
+    auto v2 = (val2);                                                                 \
+    if (!(v1 >= v2)) {                                                                \
+      std::ostringstream oss;                                                         \
       oss << "Expected: " #val1 " >= " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
       ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-    }                                                                                  \
+    }                                                                                 \
   } while (0)
 
-#define ASSERT_TRUE(condition)                                                         \
-  do {                                                                                 \
-    if (!(condition)) {                                                                \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
-                                             "Expected: " #condition " to be true");   \
-      throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+#define ASSERT_TRUE(condition)                                                                          \
+  do {                                                                                                  \
+    if (!(condition)) {                                                                                 \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be true"); \
+      throw std::runtime_error("Test assertion failed");                                                \
+    }                                                                                                   \
   } while (0)
 
-#define ASSERT_FALSE(condition)                                                        \
-  do {                                                                                 \
-    if (condition) {                                                                   \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
-                                             "Expected: " #condition " to be false");  \
-      throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+#define ASSERT_FALSE(condition)                                                                          \
+  do {                                                                                                   \
+    if (condition) {                                                                                     \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be false"); \
+      throw std::runtime_error("Test assertion failed");                                                 \
+    }                                                                                                    \
   } while (0)
 
-#define ASSERT_EQ(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 == v2)) {                                                                 \
-      std::ostringstream oss;                                                          \
+#define ASSERT_EQ(val1, val2)                                                         \
+  do {                                                                                \
+    auto v1 = (val1);                                                                 \
+    auto v2 = (val2);                                                                 \
+    if (!(v1 == v2)) {                                                                \
+      std::ostringstream oss;                                                         \
       oss << "Expected: " #val1 " == " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
       ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
       throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+    }                                                                                 \
   } while (0)
 
-#define ASSERT_NE(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 != v2)) {                                                                 \
-      std::ostringstream oss;                                                          \
+#define ASSERT_NE(val1, val2)                                                         \
+  do {                                                                                \
+    auto v1 = (val1);                                                                 \
+    auto v2 = (val2);                                                                 \
+    if (!(v1 != v2)) {                                                                \
+      std::ostringstream oss;                                                         \
       oss << "Expected: " #val1 " != " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
       ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
       throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+    }                                                                                 \
   } while (0)
 
-#define ASSERT_LT(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 < v2)) {                                                                  \
-      std::ostringstream oss;                                                          \
-      oss << "Expected: " #val1 " < " #val2 << "\n  Actual: " << v1 << " vs " << v2;  \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-      throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+#define ASSERT_LT(val1, val2)                                                        \
+  do {                                                                               \
+    auto v1 = (val1);                                                                \
+    auto v2 = (val2);                                                                \
+    if (!(v1 < v2)) {                                                                \
+      std::ostringstream oss;                                                        \
+      oss << "Expected: " #val1 " < " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());          \
+      throw std::runtime_error("Test assertion failed");                             \
+    }                                                                                \
   } while (0)
 
-#define ASSERT_LE(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 <= v2)) {                                                                 \
-      std::ostringstream oss;                                                          \
+#define ASSERT_LE(val1, val2)                                                         \
+  do {                                                                                \
+    auto v1 = (val1);                                                                 \
+    auto v2 = (val2);                                                                 \
+    if (!(v1 <= v2)) {                                                                \
+      std::ostringstream oss;                                                         \
       oss << "Expected: " #val1 " <= " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
       ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
       throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+    }                                                                                 \
   } while (0)
 
-#define ASSERT_GT(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 > v2)) {                                                                  \
-      std::ostringstream oss;                                                          \
-      oss << "Expected: " #val1 " > " #val2 << "\n  Actual: " << v1 << " vs " << v2;  \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-      throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+#define ASSERT_GT(val1, val2)                                                        \
+  do {                                                                               \
+    auto v1 = (val1);                                                                \
+    auto v2 = (val2);                                                                \
+    if (!(v1 > v2)) {                                                                \
+      std::ostringstream oss;                                                        \
+      oss << "Expected: " #val1 " > " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());          \
+      throw std::runtime_error("Test assertion failed");                             \
+    }                                                                                \
   } while (0)
 
-#define ASSERT_GE(val1, val2)                                                          \
-  do {                                                                                 \
-    auto v1 = (val1);                                                                  \
-    auto v2 = (val2);                                                                  \
-    if (!(v1 >= v2)) {                                                                 \
-      std::ostringstream oss;                                                          \
+#define ASSERT_GE(val1, val2)                                                         \
+  do {                                                                                \
+    auto v1 = (val1);                                                                 \
+    auto v2 = (val2);                                                                 \
+    if (!(v1 >= v2)) {                                                                \
+      std::ostringstream oss;                                                         \
       oss << "Expected: " #val1 " >= " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
       ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
       throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+    }                                                                                 \
   } while (0)
 
-#define ASSERT_NO_THROW(statement)                                                     \
-  do {                                                                                 \
-    try {                                                                              \
-      statement;                                                                       \
-    } catch (const std::exception& e) {                                                \
-      std::ostringstream oss;                                                          \
-      oss << "Expected: " #statement " not to throw\n  Actual: threw " << e.what();   \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());           \
-      throw std::runtime_error("Test assertion failed");                              \
-    } catch (...) {                                                                    \
-      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__,                       \
-                                             "Expected: " #statement " not to throw\n  Actual: threw unknown exception"); \
-      throw std::runtime_error("Test assertion failed");                              \
-    }                                                                                  \
+#define ASSERT_NO_THROW(statement)                                                                         \
+  do {                                                                                                     \
+    try {                                                                                                  \
+      statement;                                                                                           \
+    } catch (const std::exception& e) {                                                                    \
+      std::ostringstream oss;                                                                              \
+      oss << "Expected: " #statement " not to throw\n  Actual: threw " << e.what();                        \
+      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());                                \
+      throw std::runtime_error("Test assertion failed");                                                   \
+    } catch (...) {                                                                                        \
+      ::mscclpp::test::utils::reportFailure(                                                               \
+          __FILE__, __LINE__, "Expected: " #statement " not to throw\n  Actual: threw unknown exception"); \
+      throw std::runtime_error("Test assertion failed");                                                   \
+    }                                                                                                      \
   } while (0)
 
-#define FAIL()                                                                         \
-  do {                                                                                 \
-    ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Test failed");         \
-    throw std::runtime_error("Test failed");                                          \
+#define FAIL()                                                                \
+  do {                                                                        \
+    ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Test failed"); \
+    throw std::runtime_error("Test failed");                                  \
   } while (0)
 
+// Helper class for GTEST_SKIP functionality
+class SkipHelper {
+ public:
+  explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {}
+  template <typename T>
+  SkipHelper& operator<<(const T& value) {
+    message_ << value;
+    return *this;
+  }
+  ~SkipHelper() noexcept(false) {
+    std::string msg = message_.str();
+    if (!msg.empty()) {
+      ::mscclpp::test::utils::reportFailure(file_, line_, "Test skipped: " + msg);
+    } else {
+      ::mscclpp::test::utils::reportFailure(file_, line_, "Test skipped");
+    }
+    throw std::runtime_error("Test skipped");
+  }
+
+ private:
+  const char* file_;
+  int line_;
+  std::ostringstream message_;
+};
+
+#define GTEST_SKIP() ::SkipHelper(__FILE__, __LINE__)
+
+// Create a namespace alias for compatibility with GTest code
+namespace testing = ::mscclpp::test;
+
+// Helper functions for compatibility with GTest API
+inline void InitGoogleTest(int* argc, char** argv) {
+  ::mscclpp::test::TestRegistry::instance().initGoogleTest(argc, argv);
+}
+
+inline ::mscclpp::test::Environment* AddGlobalTestEnvironment(::mscclpp::test::Environment* env) {
+  ::mscclpp::test::TestRegistry::instance().addGlobalTestEnvironment(env);
+  return env;
+}
+
 #endif  // MSCCLPP_TEST_FRAMEWORK_HPP_
diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp
index 17046a576..8b1fab279 100644
--- a/test/mp_unit/mp_unit_tests.hpp
+++ b/test/mp_unit/mp_unit_tests.hpp
@@ -4,8 +4,6 @@
 #ifndef MSCCLPP_MP_UNIT_TESTS_HPP_
 #define MSCCLPP_MP_UNIT_TESTS_HPP_
 
-#include <gtest/gtest.h>
-
 #include <mscclpp/core.hpp>
 #include <mscclpp/executor.hpp>
 #include <mscclpp/memory_channel.hpp>
@@ -13,6 +11,7 @@
 #include <mscclpp/port_channel.hpp>
 #include <mscclpp/utils.hpp>
 
+#include "../framework.hpp"
 #include "ib.hpp"
 #include "utils_internal.hpp"
 
diff --git a/test/perf/framework.cc b/test/perf/framework.cc
index 600257d16..0b011cc5c 100644
--- a/test/perf/framework.cc
+++ b/test/perf/framework.cc
@@ -12,7 +12,7 @@ namespace mscclpp {
 namespace test {
 
 // Global state for performance test results
-static std::vector<struct PerfTestResult {
+static std::vector < struct PerfTestResult {
   std::string test_name;
   std::string test_category;
   std::map<std::string, std::string> test_params;
@@ -20,7 +20,7 @@ static std::vector<struct PerfTestResult {
   int num_processes;
   int process_rank;
   std::string timestamp;
-}> g_perf_results;
+} > g_perf_results;
 
 static std::string getCurrentTimestamp() {
   auto now = std::chrono::system_clock::now();
diff --git a/test/perf/framework.hpp b/test/perf/framework.hpp
index fe49be911..094d5cb13 100644
--- a/test/perf/framework.hpp
+++ b/test/perf/framework.hpp
@@ -7,10 +7,10 @@
 // This file is kept for backwards compatibility with perf tests
 // The actual framework is now in test/framework.hpp
 
-#include "../framework.hpp"
-
 #include <nlohmann/json.hpp>
 
+#include "../framework.hpp"
+
 namespace mscclpp {
 namespace test {
 
diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc
index 1c8ee886e..a2c39c1b4 100644
--- a/test/unit/core_tests.cc
+++ b/test/unit/core_tests.cc
@@ -2,10 +2,11 @@
 // Licensed under the MIT license.
 
 #include <gmock/gmock.h>
-#include "../framework.hpp"
 
 #include <mscclpp/core.hpp>
 
+#include "../framework.hpp"
+
 class LocalCommunicatorTest : public ::testing::Test {
  protected:
   void SetUp() override {
diff --git a/test/unit/errors_tests.cc b/test/unit/errors_tests.cc
index 8d6283d90..4cd68ee63 100644
--- a/test/unit/errors_tests.cc
+++ b/test/unit/errors_tests.cc
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "../framework.hpp"
-
 #include <mscclpp/errors.hpp>
 
+#include "../framework.hpp"
+
 TEST(ErrorsTest, SystemError) {
   mscclpp::Error error("test", mscclpp::ErrorCode::SystemError);
   EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::SystemError);
diff --git a/test/unit/fifo_tests.cu b/test/unit/fifo_tests.cu
index a0cf5447c..68e777d07 100644
--- a/test/unit/fifo_tests.cu
+++ b/test/unit/fifo_tests.cu
@@ -1,13 +1,12 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "../framework.hpp"
-
 #include <mscclpp/fifo.hpp>
 #include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/utils.hpp>
 
+#include "../framework.hpp"
 #include "utils_internal.hpp"
 
 #define ITER 10000  // should be larger than the FIFO size for proper testing
diff --git a/test/unit/gpu_utils_tests.cc b/test/unit/gpu_utils_tests.cc
index dc4027a17..c10f113c4 100644
--- a/test/unit/gpu_utils_tests.cc
+++ b/test/unit/gpu_utils_tests.cc
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "../framework.hpp"
-
 #include <mscclpp/gpu_utils.hpp>
 
+#include "../framework.hpp"
+
 TEST(GpuUtilsTest, StreamPool) {
   auto streamPool = mscclpp::gpuStreamPool();
   cudaStream_t s;
diff --git a/test/unit/local_channel_tests.cu b/test/unit/local_channel_tests.cu
index d7cd4c658..76060f97f 100644
--- a/test/unit/local_channel_tests.cu
+++ b/test/unit/local_channel_tests.cu
@@ -1,13 +1,13 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "../framework.hpp"
-
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/port_channel.hpp>
 #include <mscclpp/port_channel_device.hpp>
 
+#include "../framework.hpp"
+
 #define MAGIC_CONST 777
 
 __constant__ mscclpp::PortChannelDeviceHandle gPortChannel;
diff --git a/test/unit/numa_tests.cc b/test/unit/numa_tests.cc
index 31ba373cb..c27fde904 100644
--- a/test/unit/numa_tests.cc
+++ b/test/unit/numa_tests.cc
@@ -1,11 +1,11 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "../framework.hpp"
-
 #include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
 
+#include "../framework.hpp"
+
 TEST(NumaTest, Basic) {
   int num;
   MSCCLPP_CUDATHROW(cudaGetDeviceCount(&num));
diff --git a/test/unit/socket_tests.cc b/test/unit/socket_tests.cc
index cfd5bd4fd..6b7c19033 100644
--- a/test/unit/socket_tests.cc
+++ b/test/unit/socket_tests.cc
@@ -1,11 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "../framework.hpp"
-
 #include <mscclpp/utils.hpp>
 #include <thread>
 
+#include "../framework.hpp"
 #include "socket.h"
 #include "utils_internal.hpp"
 
diff --git a/test/unit/utils_internal_tests.cc b/test/unit/utils_internal_tests.cc
index 73b03833d..8526d9feb 100644
--- a/test/unit/utils_internal_tests.cc
+++ b/test/unit/utils_internal_tests.cc
@@ -1,10 +1,9 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include "../framework.hpp"
-
 #include <thread>
 
+#include "../framework.hpp"
 #include "utils_internal.hpp"
 
 TEST(UtilsInternalTest, getHostHash) {
diff --git a/test/unit/utils_tests.cc b/test/unit/utils_tests.cc
index ae77892d2..110550dac 100644
--- a/test/unit/utils_tests.cc
+++ b/test/unit/utils_tests.cc
@@ -1,12 +1,12 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "../framework.hpp"
-
 #include <mscclpp/errors.hpp>
 #include <mscclpp/utils.hpp>
 #include <thread>
 
+#include "../framework.hpp"
+
 TEST(UtilsTest, getHostName) {
   std::string hostname1 = mscclpp::getHostName(1024, '.');
   EXPECT_FALSE(hostname1.empty());

From 1e32e17c1e01809ce9a4227d1ed1860e7a094548 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 00:22:04 +0000
Subject: [PATCH 004/132] Address code review comments

- Remove duplicate static getMPIRank() and getMPISize() functions
- Add full namespace qualification to GTEST_SKIP macro

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/framework.cc  | 4 ----
 test/framework.hpp | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/test/framework.cc b/test/framework.cc
index fc339b764..28001ed36 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -37,10 +37,6 @@ static void finalizeMPI() {
   g_mpi_initialized = false;
 }
 
-static int getMPIRank() { return g_mpi_rank; }
-
-static int getMPISize() { return g_mpi_size; }
-
 static bool isMainProcess() { return g_mpi_rank == 0; }
 
 // Public utility functions for test output
diff --git a/test/framework.hpp b/test/framework.hpp
index 1ef9aaeae..c19580dcc 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -397,7 +397,7 @@ class SkipHelper {
   std::ostringstream message_;
 };
 
-#define GTEST_SKIP() ::SkipHelper(__FILE__, __LINE__)
+#define GTEST_SKIP() ::mscclpp::test::SkipHelper(__FILE__, __LINE__)
 
 // Create a namespace alias for compatibility with GTest code
 namespace testing = ::mscclpp::test;

From eafa6fbfaf13c0abb2f2dad3516b1d96ca3e22cf Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 00:24:03 +0000
Subject: [PATCH 005/132] Add custom test framework and code coverage support

- Move test framework from test/perf/ to test/ for shared use
- Add GTest-compatible macros (TEST, TEST_F, EXPECT_*, ASSERT_*, etc.)
- Remove GTest dependency from CMakeLists.txt
- Add test_framework library for unit and mp_unit tests
- Add code coverage support with lcov (MSCCLPP_ENABLE_COVERAGE option)
- Update perf tests to use shared framework

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 CMakeLists.txt           | 64 ++++++++++++++++++++++++++++++++++++++++
 test/CMakeLists.txt      | 19 ++++++------
 test/perf/CMakeLists.txt |  2 +-
 3 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6288dbb08..9bfef1ef7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
 option(MSCCLPP_USE_IB "Use InfiniBand." ON)
 option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
 option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
+option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
 set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
 
 if(MSCCLPP_BYPASS_GPU_CHECK)
@@ -98,6 +99,69 @@ else()
         message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
     endif()
 endif()
+
+# Code coverage setup
+if(MSCCLPP_ENABLE_COVERAGE)
+    if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+        message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
+    endif()
+    
+    if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+        message(STATUS "Code coverage enabled")
+        
+        # Add coverage flags to all targets
+        add_compile_options(--coverage -O0 -g)
+        add_link_options(--coverage)
+        
+        # Find lcov
+        find_program(LCOV_PATH lcov)
+        find_program(GENHTML_PATH genhtml)
+        
+        if(NOT LCOV_PATH)
+            message(WARNING "lcov not found. Install lcov to generate coverage reports.")
+        endif()
+        
+        if(NOT GENHTML_PATH)
+            message(WARNING "genhtml not found. Install lcov to generate HTML coverage reports.")
+        endif()
+        
+        if(LCOV_PATH AND GENHTML_PATH)
+            # Add coverage target
+            add_custom_target(coverage
+                COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
+                COMMAND ${LCOV_PATH} --directory . --zerocounters
+                
+                COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
+                COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
+                
+                COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
+                COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
+                
+                COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
+                COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
+                
+                COMMAND ${CMAKE_COMMAND} -E echo "Generating HTML report..."
+                COMMAND ${GENHTML_PATH} coverage.info --output-directory coverage_html
+                
+                COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage_html/index.html"
+                
+                WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+                COMMENT "Generating code coverage report"
+            )
+            
+            # Add coverage clean target
+            add_custom_target(coverage-clean
+                COMMAND ${CMAKE_COMMAND} -E remove_directory coverage_html
+                COMMAND ${CMAKE_COMMAND} -E remove coverage.info
+                COMMAND ${LCOV_PATH} --directory . --zerocounters
+                WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+                COMMENT "Cleaning coverage data"
+            )
+        endif()
+    else()
+        message(WARNING "Code coverage is only supported with GCC or Clang compilers")
+    endif()
+endif()
 if(MSCCLPP_GPU_ARCHS)
     string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
     string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 6452ebf8f..7c4e9684e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -7,7 +7,6 @@ set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads
 if(MSCCLPP_USE_IB)
     list(APPEND TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
 endif()
-set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main)
 set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
 set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include)
 
@@ -38,25 +37,25 @@ add_test_executable(executor_test executor_test.cc)
 configure_file(run_mpi_test.sh.in run_mpi_test.sh)
 
 include(CTest)
-include(FetchContent)
-FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip)
-option(INSTALL_GTEST OFF)
-FetchContent_MakeAvailable(googletest)
-include(GoogleTest)
+
+# Build test framework library
+add_library(test_framework STATIC framework.cc)
+target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(test_framework PUBLIC MPI::MPI_CXX)
 
 # Unit tests
 add_executable(unit_tests)
-target_link_libraries(unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST})
+target_link_libraries(unit_tests ${TEST_LIBS_COMMON} test_framework)
 target_include_directories(unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
 add_subdirectory(unit)
-gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST)
+add_test(NAME unit_tests COMMAND unit_tests)
 
 # Multi-process unit tests
 add_executable(mp_unit_tests)
-target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST} MPI::MPI_CXX)
+target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} test_framework MPI::MPI_CXX)
 target_include_directories(mp_unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
 add_subdirectory(mp_unit)
-gtest_discover_tests(mp_unit_tests DISCOVERY_MODE PRE_TEST)
+add_test(NAME mp_unit_tests COMMAND ${CMAKE_CURRENT_BINARY_DIR}/run_mpi_test.sh mp_unit_tests 2)
 
 # mscclpp-test
 add_subdirectory(mscclpp-test)
diff --git a/test/perf/CMakeLists.txt b/test/perf/CMakeLists.txt
index 6a16c0345..caee29f07 100644
--- a/test/perf/CMakeLists.txt
+++ b/test/perf/CMakeLists.txt
@@ -22,7 +22,7 @@ function(add_perf_test_executable name sources)
         set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
     endif()
     add_executable(${name} ${sources})
-    target_link_libraries(${name} ${PERF_TEST_LIBS_COMMON})
+    target_link_libraries(${name} ${PERF_TEST_LIBS_COMMON} test_framework)
     
     # Link nlohmann_json - use the target from main project
     target_link_libraries(${name} nlohmann_json::nlohmann_json)

From 3d8a2e7349a94053c27c734404c7519bb1dbceb8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 00:25:43 +0000
Subject: [PATCH 006/132] Add --gtest_filter support to framework

Support --gtest_filter command line argument for test filtering,
compatible with Azure pipeline configurations.

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/framework.cc | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/test/framework.cc b/test/framework.cc
index 28001ed36..f072a075b 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -190,6 +190,18 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     utils::initializeMPI(argc, argv);
   }
 
+  // Parse command line arguments for test filter
+  std::string filter = "";
+  for (int i = 1; i < argc; ++i) {
+    std::string arg = argv[i];
+    if (arg.find("--gtest_filter=") == 0) {
+      filter = arg.substr(15);  // Length of "--gtest_filter="
+    } else if (arg == "--gtest_filter" && i + 1 < argc) {
+      filter = argv[i + 1];
+      ++i;
+    }
+  }
+
   // Set up global test environments
   for (auto* env : environments_) {
     try {
@@ -204,17 +216,40 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
 
   int passed = 0;
   int failed = 0;
+  int skipped = 0;
+
+  // Count tests to run
+  int total_to_run = 0;
+  for (const auto& test_info : tests_) {
+    std::string full_name = test_info.suite_name + "." + test_info.test_name;
+    if (!filter.empty() && full_name.find(filter) == std::string::npos) {
+      skipped++;
+      continue;
+    }
+    total_to_run++;
+  }
 
   if (g_mpi_rank == 0) {
-    std::cout << "[==========] Running " << tests_.size() << " tests.\n";
+    std::cout << "[==========] Running " << total_to_run << " tests";
+    if (skipped > 0) {
+      std::cout << " (" << skipped << " skipped by filter)";
+    }
+    std::cout << ".\n";
   }
 
   for (const auto& test_info : tests_) {
+    std::string full_name = test_info.suite_name + "." + test_info.test_name;
+
+    // Apply filter
+    if (!filter.empty() && full_name.find(filter) == std::string::npos) {
+      continue;
+    }
+
     g_current_test_passed = true;
     g_current_test_failure_message.clear();
 
     if (g_mpi_rank == 0) {
-      std::cout << "[ RUN      ] " << test_info.suite_name << "." << test_info.test_name << std::endl;
+      std::cout << "[ RUN      ] " << full_name << std::endl;
     }
 
     // Set current test info for UnitTest::GetInstance()->current_test_info()
@@ -255,17 +290,17 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
 
     if (g_mpi_rank == 0) {
       if (global_passed) {
-        std::cout << "[       OK ] " << test_info.suite_name << "." << test_info.test_name << std::endl;
+        std::cout << "[       OK ] " << full_name << std::endl;
         passed++;
       } else {
-        std::cout << "[  FAILED  ] " << test_info.suite_name << "." << test_info.test_name << std::endl;
+        std::cout << "[  FAILED  ] " << full_name << std::endl;
         failed++;
       }
     }
   }
 
   if (g_mpi_rank == 0) {
-    std::cout << "[==========] " << tests_.size() << " tests ran.\n";
+    std::cout << "[==========] " << total_to_run << " tests ran.\n";
     if (passed > 0) {
       std::cout << "[  PASSED  ] " << passed << " tests.\n";
     }

From a10aff559f7c521e92835490e5ff7c2381d5f0e3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 00:28:00 +0000
Subject: [PATCH 007/132] Address code review feedback

- Move PerfTestResult struct definition outside vector declaration
- Move getCurrentTimestamp to anonymous namespace
- Add documentation for GTEST_SKIP macro explaining RAII pattern

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/framework.hpp     |  7 +++++++
 test/perf/framework.cc | 13 +++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test/framework.hpp b/test/framework.hpp
index c19580dcc..4b953e379 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -373,6 +373,11 @@ void reportSuccess();
   } while (0)
 
 // Helper class for GTEST_SKIP functionality
+// This class uses RAII (Resource Acquisition Is Initialization) pattern:
+// - The constructor records file and line information
+// - The stream operator (<<) allows appending a skip message
+// - The destructor throws an exception to skip the test
+// This enables usage like: GTEST_SKIP() << "Reason for skipping";
 class SkipHelper {
  public:
   explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {}
@@ -397,6 +402,8 @@ class SkipHelper {
   std::ostringstream message_;
 };
 
+// Test skip macro - throws exception to skip test execution
+// Usage: GTEST_SKIP() << "Optional skip message";
 #define GTEST_SKIP() ::mscclpp::test::SkipHelper(__FILE__, __LINE__)
 
 // Create a namespace alias for compatibility with GTest code
diff --git a/test/perf/framework.cc b/test/perf/framework.cc
index 0b011cc5c..45a0c28c8 100644
--- a/test/perf/framework.cc
+++ b/test/perf/framework.cc
@@ -11,8 +11,8 @@
 namespace mscclpp {
 namespace test {
 
-// Global state for performance test results
-static std::vector < struct PerfTestResult {
+// Performance test result structure
+struct PerfTestResult {
   std::string test_name;
   std::string test_category;
   std::map<std::string, std::string> test_params;
@@ -20,15 +20,20 @@ static std::vector < struct PerfTestResult {
   int num_processes;
   int process_rank;
   std::string timestamp;
-} > g_perf_results;
+};
+
+// Global state for performance test results
+static std::vector<PerfTestResult> g_perf_results;
 
-static std::string getCurrentTimestamp() {
+namespace {
+std::string getCurrentTimestamp() {
   auto now = std::chrono::system_clock::now();
   auto time_t = std::chrono::system_clock::to_time_t(now);
   std::stringstream ss;
   ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%S");
   return ss.str();
 }
+}  // namespace
 
 void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
                   const std::map<std::string, std::string>& test_params) {

From 1818709de0a5a0ebdbf73083690c60f8f8929a81 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 00:59:34 +0000
Subject: [PATCH 008/132] Fix CodeQL workflow by disabling test builds

The recent removal of GTest and introduction of custom test framework
requires MPI dependency which is not needed for CodeQL analysis.
Disable test building in CodeQL workflows to fix the build failures.

CodeQL only needs to analyze the core library code, not the tests.

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 .github/workflows/codeql-analysis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index b423e3265..6982c69c5 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -62,7 +62,7 @@ jobs:
     - name: Build
       run: |
         rm -rf build && mkdir build && cd build
-        cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
+        cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF ..
         make -j
 
     - name: Perform CodeQL Analysis
@@ -107,7 +107,7 @@ jobs:
     - name: Build
       run: |
         rm -rf build && mkdir build && cd build
-        CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
+        CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF ..
         make -j
 
     - name: Perform CodeQL Analysis

From 5657e4a321eb67a8d23dabe8ec8bb5c20fc356cd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 01:02:59 +0000
Subject: [PATCH 009/132] Initial plan for fixing test build with GPU bypass

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 build_test/CMakeCache.txt                     | 390 ++++++++
 .../CMakeFiles/3.31.6/CMakeCXXCompiler.cmake  | 101 ++
 .../3.31.6/CMakeDetermineCompilerABI_CXX.bin  | Bin 0 -> 15992 bytes
 .../CMakeFiles/3.31.6/CMakeSystem.cmake       |  15 +
 .../CompilerIdCXX/CMakeCXXCompilerId.cpp      | 919 ++++++++++++++++++
 .../CMakeFiles/3.31.6/CompilerIdCXX/a.out     | Bin 0 -> 16096 bytes
 build_test/CMakeFiles/CMakeConfigureLog.yaml  | 294 ++++++
 build_test/CMakeFiles/cmake.check_cache       |   1 +
 build_test/include/mscclpp/version.hpp        |  13 +
 9 files changed, 1733 insertions(+)
 create mode 100644 build_test/CMakeCache.txt
 create mode 100644 build_test/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
 create mode 100755 build_test/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin
 create mode 100644 build_test/CMakeFiles/3.31.6/CMakeSystem.cmake
 create mode 100644 build_test/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
 create mode 100755 build_test/CMakeFiles/3.31.6/CompilerIdCXX/a.out
 create mode 100644 build_test/CMakeFiles/CMakeConfigureLog.yaml
 create mode 100644 build_test/CMakeFiles/cmake.check_cache
 create mode 100644 build_test/include/mscclpp/version.hpp

diff --git a/build_test/CMakeCache.txt b/build_test/CMakeCache.txt
new file mode 100644
index 000000000..cc9de9e11
--- /dev/null
+++ b/build_test/CMakeCache.txt
@@ -0,0 +1,390 @@
+# This is the CMakeCache file.
+# For build in directory: /home/runner/work/mscclpp/mscclpp/build_test
+# It was generated by CMake: /usr/local/bin/cmake
+# You can edit this file to change values found and used by cmake.
+# If you do not want to change any of the values, simply exit the editor.
+# If you do want to change a value, simply edit, save, and exit the editor.
+# The syntax for the file is as follows:
+# KEY:TYPE=VALUE
+# KEY is the name of a variable in the cache.
+# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
+# VALUE is the current value for the KEY.
+
+########################
+# EXTERNAL cache entries
+########################
+
+//Path to a program.
+CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
+
+//Path to a program.
+CMAKE_AR:FILEPATH=/usr/bin/ar
+
+//Choose the type of build, options are: None Debug Release RelWithDebInfo
+// MinSizeRel ...
+CMAKE_BUILD_TYPE:STRING=
+
+//Enable/Disable color output during build.
+CMAKE_COLOR_MAKEFILE:BOOL=ON
+
+//CXX compiler
+CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++
+
+//A wrapper around 'ar' adding the appropriate '--plugin' option
+// for the GCC compiler
+CMAKE_CXX_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar-13
+
+//A wrapper around 'ranlib' adding the appropriate '--plugin' option
+// for the GCC compiler
+CMAKE_CXX_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib-13
+
+//Flags used by the CXX compiler during all build types.
+CMAKE_CXX_FLAGS:STRING=
+
+//Flags used by the CXX compiler during DEBUG builds.
+CMAKE_CXX_FLAGS_DEBUG:STRING=-g
+
+//Flags used by the CXX compiler during MINSIZEREL builds.
+CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
+
+//Flags used by the CXX compiler during RELEASE builds.
+CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
+
+//Flags used by the CXX compiler during RELWITHDEBINFO builds.
+CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
+
+//Path to a program.
+CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
+
+//Flags used by the linker during all build types.
+CMAKE_EXE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during DEBUG builds.
+CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during MINSIZEREL builds.
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during RELEASE builds.
+CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during RELWITHDEBINFO builds.
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Enable/Disable output of compile commands during generation.
+CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
+
+//Value Computed by CMake.
+CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/pkgRedirects
+
+//Install path prefix, prepended onto install directories.
+CMAKE_INSTALL_PREFIX:PATH=/usr/local
+
+//Path to a program.
+CMAKE_LINKER:FILEPATH=/usr/bin/ld
+
+//Path to a program.
+CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
+
+//Flags used by the linker during the creation of modules during
+// all build types.
+CMAKE_MODULE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of modules during
+// DEBUG builds.
+CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of modules during
+// MINSIZEREL builds.
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of modules during
+// RELEASE builds.
+CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of modules during
+// RELWITHDEBINFO builds.
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_NM:FILEPATH=/usr/bin/nm
+
+//Path to a program.
+CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
+
+//Path to a program.
+CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
+
+//Value Computed by CMake
+CMAKE_PROJECT_DESCRIPTION:STATIC=
+
+//Value Computed by CMake
+CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
+
+//Value Computed by CMake
+CMAKE_PROJECT_NAME:STATIC=mscclpp
+
+//Path to a program.
+CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
+
+//Path to a program.
+CMAKE_READELF:FILEPATH=/usr/bin/readelf
+
+//Flags used by the linker during the creation of shared libraries
+// during all build types.
+CMAKE_SHARED_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during DEBUG builds.
+CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during MINSIZEREL builds.
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during RELEASE builds.
+CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during RELWITHDEBINFO builds.
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//If set, runtime paths are not added when installing shared libraries,
+// but are added when building.
+CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
+
+//If set, runtime paths are not added when using shared libraries.
+CMAKE_SKIP_RPATH:BOOL=NO
+
+//Flags used by the linker during the creation of static libraries
+// during all build types.
+CMAKE_STATIC_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during DEBUG builds.
+CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during MINSIZEREL builds.
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during RELEASE builds.
+CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during RELWITHDEBINFO builds.
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_STRIP:FILEPATH=/usr/bin/strip
+
+//Path to a program.
+CMAKE_TAPI:FILEPATH=CMAKE_TAPI-NOTFOUND
+
+//If this value is on, makefiles will be generated without the
+// .SILENT directive, and all commands will be echoed to the console
+// during the make.  This is useful for debugging only. With Visual
+// Studio IDE projects all commands are done without /nologo.
+CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
+
+//Path to a program.
+CUDAToolkit_NVCC_EXECUTABLE:FILEPATH=CUDAToolkit_NVCC_EXECUTABLE-NOTFOUND
+
+//Path to a file.
+CUDAToolkit_SENTINEL_FILE:FILEPATH=CUDAToolkit_SENTINEL_FILE-NOTFOUND
+
+//Git command line client
+GIT_EXECUTABLE:FILEPATH=/usr/bin/git
+
+//Build collective algorithms
+MSCCLPP_BUILD_EXT_COLLECTIVES:BOOL=ON
+
+//Build NCCL interfaces
+MSCCLPP_BUILD_EXT_NCCL:BOOL=ON
+
+//Build Python bindings
+MSCCLPP_BUILD_PYTHON_BINDINGS:BOOL=ON
+
+//Build tests
+MSCCLPP_BUILD_TESTS:BOOL=ON
+
+//Bypass GPU check.
+MSCCLPP_BYPASS_GPU_CHECK:BOOL=ON
+
+//Enable code coverage
+MSCCLPP_ENABLE_COVERAGE:BOOL=OFF
+
+//Enable tracing
+MSCCLPP_ENABLE_TRACE:BOOL=OFF
+
+//Specify GPU architectures with delimiters (comma, space, or semicolon).
+MSCCLPP_GPU_ARCHS:STRING=
+
+//Set NPKIT flags
+MSCCLPP_NPKIT_FLAGS:BOOL=OFF
+
+//Use NVIDIA/CUDA.
+MSCCLPP_USE_CUDA:BOOL=ON
+
+//Use InfiniBand.
+MSCCLPP_USE_IB:BOOL=ON
+
+//Use AMD/ROCm.
+MSCCLPP_USE_ROCM:BOOL=OFF
+
+//Value Computed by CMake
+mscclpp_BINARY_DIR:STATIC=/home/runner/work/mscclpp/mscclpp/build_test
+
+//Value Computed by CMake
+mscclpp_IS_TOP_LEVEL:STATIC=ON
+
+//Value Computed by CMake
+mscclpp_SOURCE_DIR:STATIC=/home/runner/work/mscclpp/mscclpp
+
+
+########################
+# INTERNAL cache entries
+########################
+
+//ADVANCED property for variable: CMAKE_ADDR2LINE
+CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_AR
+CMAKE_AR-ADVANCED:INTERNAL=1
+//This is the directory where this CMakeCache.txt was created
+CMAKE_CACHEFILE_DIR:INTERNAL=/home/runner/work/mscclpp/mscclpp/build_test
+//Major version of cmake used to create the current loaded cache
+CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
+//Minor version of cmake used to create the current loaded cache
+CMAKE_CACHE_MINOR_VERSION:INTERNAL=31
+//Patch version of cmake used to create the current loaded cache
+CMAKE_CACHE_PATCH_VERSION:INTERNAL=6
+//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
+CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
+//Path to CMake executable.
+CMAKE_COMMAND:INTERNAL=/usr/local/bin/cmake
+//Path to cpack program executable.
+CMAKE_CPACK_COMMAND:INTERNAL=/usr/local/bin/cpack
+//Path to ctest program executable.
+CMAKE_CTEST_COMMAND:INTERNAL=/usr/local/bin/ctest
+//ADVANCED property for variable: CMAKE_CXX_COMPILER
+CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_COMPILER_AR
+CMAKE_CXX_COMPILER_AR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_COMPILER_RANLIB
+CMAKE_CXX_COMPILER_RANLIB-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS
+CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
+CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
+CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
+CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
+CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_DLLTOOL
+CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
+//Path to cache edit program executable.
+CMAKE_EDIT_COMMAND:INTERNAL=/usr/local/bin/ccmake
+//Executable file format
+CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
+CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
+CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
+CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
+CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
+//Name of external makefile project generator.
+CMAKE_EXTRA_GENERATOR:INTERNAL=
+//Name of generator.
+CMAKE_GENERATOR:INTERNAL=Unix Makefiles
+//Generator instance identifier.
+CMAKE_GENERATOR_INSTANCE:INTERNAL=
+//Name of generator platform.
+CMAKE_GENERATOR_PLATFORM:INTERNAL=
+//Name of generator toolset.
+CMAKE_GENERATOR_TOOLSET:INTERNAL=
+//Source directory with the top level CMakeLists.txt file for this
+// project
+CMAKE_HOME_DIRECTORY:INTERNAL=/home/runner/work/mscclpp/mscclpp
+//Install .so files without execute permission.
+CMAKE_INSTALL_SO_NO_EXE:INTERNAL=1
+//ADVANCED property for variable: CMAKE_LINKER
+CMAKE_LINKER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
+CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
+CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
+CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
+CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_NM
+CMAKE_NM-ADVANCED:INTERNAL=1
+//number of local generators
+CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
+//ADVANCED property for variable: CMAKE_OBJCOPY
+CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_OBJDUMP
+CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
+//Platform information initialized
+CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_RANLIB
+CMAKE_RANLIB-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_READELF
+CMAKE_READELF-ADVANCED:INTERNAL=1
+//Path to CMake installation.
+CMAKE_ROOT:INTERNAL=/usr/local/share/cmake-3.31
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
+CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
+CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
+CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
+CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_RPATH
+CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
+CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
+CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
+CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STRIP
+CMAKE_STRIP-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_TAPI
+CMAKE_TAPI-ADVANCED:INTERNAL=1
+//uname command
+CMAKE_UNAME:INTERNAL=/usr/bin/uname
+//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
+CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
+//Details about finding Git
+FIND_PACKAGE_MESSAGE_DETAILS_Git:INTERNAL=[/usr/bin/git][v2.52.0()]
+//ADVANCED property for variable: GIT_EXECUTABLE
+GIT_EXECUTABLE-ADVANCED:INTERNAL=1
+//linker supports push/pop state
+_CMAKE_CXX_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
+//linker supports push/pop state
+_CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
+
diff --git a/build_test/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake b/build_test/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
new file mode 100644
index 000000000..14f6ae31d
--- /dev/null
+++ b/build_test/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
@@ -0,0 +1,101 @@
+set(CMAKE_CXX_COMPILER "/usr/bin/c++")
+set(CMAKE_CXX_COMPILER_ARG1 "")
+set(CMAKE_CXX_COMPILER_ID "GNU")
+set(CMAKE_CXX_COMPILER_VERSION "13.3.0")
+set(CMAKE_CXX_COMPILER_VERSION_INTERNAL "")
+set(CMAKE_CXX_COMPILER_WRAPPER "")
+set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "17")
+set(CMAKE_CXX_EXTENSIONS_COMPUTED_DEFAULT "ON")
+set(CMAKE_CXX_STANDARD_LATEST "23")
+set(CMAKE_CXX_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters;cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates;cxx_std_17;cxx_std_20;cxx_std_23")
+set(CMAKE_CXX98_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters")
+set(CMAKE_CXX11_COMPILE_FEATURES "cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
+set(CMAKE_CXX14_COMPILE_FEATURES "cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
+set(CMAKE_CXX17_COMPILE_FEATURES "cxx_std_17")
+set(CMAKE_CXX20_COMPILE_FEATURES "cxx_std_20")
+set(CMAKE_CXX23_COMPILE_FEATURES "cxx_std_23")
+set(CMAKE_CXX26_COMPILE_FEATURES "")
+
+set(CMAKE_CXX_PLATFORM_ID "Linux")
+set(CMAKE_CXX_SIMULATE_ID "")
+set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "GNU")
+set(CMAKE_CXX_SIMULATE_VERSION "")
+
+
+
+
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_CXX_COMPILER_AR "/usr/bin/gcc-ar-13")
+set(CMAKE_RANLIB "/usr/bin/ranlib")
+set(CMAKE_CXX_COMPILER_RANLIB "/usr/bin/gcc-ranlib-13")
+set(CMAKE_LINKER "/usr/bin/ld")
+set(CMAKE_LINKER_LINK "")
+set(CMAKE_LINKER_LLD "")
+set(CMAKE_CXX_COMPILER_LINKER "/usr/bin/ld")
+set(CMAKE_CXX_COMPILER_LINKER_ID "GNU")
+set(CMAKE_CXX_COMPILER_LINKER_VERSION 2.42)
+set(CMAKE_CXX_COMPILER_LINKER_FRONTEND_VARIANT GNU)
+set(CMAKE_MT "")
+set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
+set(CMAKE_COMPILER_IS_GNUCXX 1)
+set(CMAKE_CXX_COMPILER_LOADED 1)
+set(CMAKE_CXX_COMPILER_WORKS TRUE)
+set(CMAKE_CXX_ABI_COMPILED TRUE)
+
+set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
+
+set(CMAKE_CXX_COMPILER_ID_RUN 1)
+set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;m;mm;mpp;CPP;ixx;cppm;ccm;cxxm;c++m)
+set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
+
+foreach (lang IN ITEMS C OBJC OBJCXX)
+  if (CMAKE_${lang}_COMPILER_ID_RUN)
+    foreach(extension IN LISTS CMAKE_${lang}_SOURCE_FILE_EXTENSIONS)
+      list(REMOVE_ITEM CMAKE_CXX_SOURCE_FILE_EXTENSIONS ${extension})
+    endforeach()
+  endif()
+endforeach()
+
+set(CMAKE_CXX_LINKER_PREFERENCE 30)
+set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
+set(CMAKE_CXX_LINKER_DEPFILE_SUPPORTED )
+
+# Save compiler ABI information.
+set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
+set(CMAKE_CXX_COMPILER_ABI "ELF")
+set(CMAKE_CXX_BYTE_ORDER "LITTLE_ENDIAN")
+set(CMAKE_CXX_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
+
+if(CMAKE_CXX_SIZEOF_DATA_PTR)
+  set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_CXX_COMPILER_ABI)
+  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
+endif()
+
+if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
+  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
+endif()
+
+set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
+if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
+  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
+endif()
+
+
+
+
+
+set(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "/usr/include/c++/13;/usr/include/x86_64-linux-gnu/c++/13;/usr/include/c++/13/backward;/usr/lib/gcc/x86_64-linux-gnu/13/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include")
+set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "stdc++;m;gcc_s;gcc;c;gcc_s;gcc")
+set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-linux-gnu/13;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib")
+set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
+set(CMAKE_CXX_COMPILER_CLANG_RESOURCE_DIR "")
+
+set(CMAKE_CXX_COMPILER_IMPORT_STD "")
+### Imported target for C++23 standard library
+set(CMAKE_CXX23_COMPILER_IMPORT_STD_NOT_FOUND_MESSAGE "Unsupported generator: Unix Makefiles")
+
+
+
diff --git a/build_test/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin b/build_test/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin
new file mode 100755
index 0000000000000000000000000000000000000000..e90f3f71d98d8b48fdca37fdc4f6d991fd1db519
GIT binary patch
literal 15992
zcmeHOYit}>6~4Q9xipD4Y0{XaG)rkv(&C9<F-{<K9ebTw<k%r`pb)}j);qRGtar`s
zEVT=iKzP&&K>;D4KR{7b9#VzWB18~B+O2|G6~rSFg`oZkluAK_))g&sA!Ipc?)lc^
z(YodJ1Btn-o$sFSoOAD;bMNflnYs7l>A`_`ET)i_sdp%rQVGqZMA7qB$q=Mek6J^=
zH>g|GN|KlRoYto_kXENl@x|CA{4zrJYvD`-yhYPggHC86Bl|6t=2mD8P|10)pRW=b
zJn#{z00_QbUs7re;fVMFgMJ*FxmN8rw|6lnB`(_q;m0ETDMQ;+cjzQomHL2)C&z@p
zJrd6_wn;I-u-}CEg|T1!fLsTs!_RrSf2Y2K;&&$L7o)=X7ELQ4>U$UY`Ee2bYXQ3X
zkkq$SKO`jnKnbtfnRm0@T|4u+*1TJ&Ot((=bhmbQ8ReqU;aAP=O466d)c&C(ii)W+
zCt+0a6Iw=jtlJ=Zw*TRV!E;T|eD<lsJAcyk*c}_b`>XiRpJy9xH~X*+CoT^|gk{ci
zoou7y@d?Vw*e1N_{A|)EmN>BA`Ubi_;*t$`YYD!v1b-9pw>2n7Sr$cf)GB*+$+ISH
zw?NG3v~7*K1v~HF>nK)pe7n{D!OXrstHbCpcGdHpUCPRg9I$du$r*Rco>Lk*(3dY3
zoDn;lcc`rK$znlDx3p<PLylm~|LC5Ik<9JIc&Ti5Z{Vo&_+##SU-&YGIZnTLI^jCT
z^^;tu`FXj%!C#gFn^Ia29`dETG|zp=eS&m3zz6&NN`S{0W1qPI&*KMaKETUQB2*DZ
z5r`rXMIeem6oDuLQ3Rq0{2xc)&&{{~)jWB%$vm~<H#?OwKV9|WwO^Pgf7Eork4kOV
zIihRZ9;9RQ)|6uV+O|hY8f)I#uY9@vPnp?^A24TsXP*51+`*A_d$s*3^Yq>yQvtP&
zWiowf%xK>FDZf18A0Wm&z2b`uyXU=)RQ0<#PgUPgyWG6>1RGuuBzxDl-<4(9aowDq
zGarBcF7xsEWoGON^Wt@H0~N4M3TUcb*6o5nxA(+eR;$XLN6eFZ<D4~TpYv9mr}nNS
z;mVF$t#&0xhbLD2o$k70$H=!{Kl}gT9#V4V2>H!^?5a6ix%_1M8aMM)`l|U=^Yq52
z*HU=CzdX_WXf>9;ChP`2&1YD1etEq4d|30_Mw*R(43%{4*afcI@1uIJaMe+YA`nF&
zia->BC<0Lgq6kD0h$0Y0Ac{Z~fhYq1d<6LY*Q=$>(7^DXGQFQGj#;@WuXMDn=UC8w
zC^I~e-Q&$zPO0eRj+Qd}to=jjO#e`?^6h;8?2PAF#S*={J35#d85vAl>7o8i?+{t|
zdOPbLrF97G5ZkisZT#+y-({V7p;kLic$V;f!iNb>!UyJRwX=kr_?;@J*u95TY&sF!
zvU*k18G50{Jg*%%PCjpDgZ@?i8@byl+eP2)#QVhB#K78?cQ)U6Ptyr?*XG@Kbl&d2
zzGVOR(>DP-%5&l}J^H>#{70BbuT6X=-nV9DyhJrK5v3>sQ3Rq0L=lK05Je!0Koo%}
z0#O8_2>fqE0P7X8J`rmV{hJ<Y;%YQg)-SFR`9WFd_<E7C4swggxb@jAGS)-#{SqhW
zU%p-|viz_tV#M0S3BKW@q}Q}6bxHKE)3mx@@J7KF!Ht3dtc|S7`o~qGXp@T2j;ipq
z*wara?^cmv_qUpEFU85Hu8XV}lhX_C1-<V{x2FF2&B^(^A~M<~#sBvJ>%;%U60t6I
ze_!98<n|-kO2Mln+dGX;qph{O;)@;kb#xhRT|0z+^$K}hEmtqr!d4vb7->Ey0ZEDh
zuN!V;&;1csYt@vDM=@7P;m?NnPT?`WVV|K)Otq*)N;4SuyvjO8PYW<!wN|N*Qikir
z^#Y#9VNBhmF#f@Ri!zPc|Cn!|P|2jW#CUyL_>}M%cP|TnTzCQ1LJf|oggPMvtrGCl
zQgPen+pkv#-zbIwXw=S5-=10*8c%O0Ua58Ub^0h~*tfq~;W`8F5Z`Eh`6r1_!YF{>
z@%c?kr2-^nzfOEYZL0SdwBI0peY{!W_Xzw$VjnK&2Y&gmTEHiXUl-q`Fz%uGCG%9X
zN@_+fWA!ZY2^v2wDOhUc{UYmWoTOwN`p=q3bw%tk-r)6;*zb_vQ~wzfDPJL;+Y`25
z5wAA|MfkXt_}dmSTG&JU`Z)bchOP^Bc(mlT8%0_vPfyz{&mLDql)cK>m@%prR@GbH
zq&3Rx>dR!AD_Z0EV%E-EIj>kMTXtnyjTR@T@{Z@^jJC!WyrSQ=>{7|5hk^yKG^55!
z_M~IwDwC5l<Pwl9vh)_2_8qW4==9xvcOTW_=ABaSzKk(CHKnZg4Yqf?g|VU)coxZQ
zhh`U^Fj`r6oa)WFHtjGV{chhYpwGLWmv;gtJ-!7+g&H?-sP};Xbkd?t1pV(F>OGL@
zBbs(&SZPzVX8$2&?H?T8*E?tp4-6bmk60tU`{<!28HV;aq_CCYwYD!fIoq?9A37?9
z1-+MngvA>htX#QhP1uDTZ+gfKlU2?wSe3GqQ+!HfpDmZgS9V#@MhSl2%4ftoC>m~y
zSiBdb-fZ51;dc`4M=H-udUlr3D`}iS&MnY(j45Rlik@SP7b?b7sW|17yqN%%t+=$8
z#?1*u{o2Z7&^Mp3%M;4T%@n8#jb2G>KJ1jrZn3aPut-;O@-{mtgGZ1urt<n=j29{6
zIn#9HVMvxmKeC21Ap>tBNB)qszaD|w19>Xko^(g4IovS@1yva|^e1UVH@NElb&BUr
zbjjDBzK8e0Vcvw2**2KoL;}xk=yLbdQv1C`U7vqJ?xsx8KfLdYpOXg@eh0zv|7p-4
z|L4FY3<bmf?;-v#G&e%~F&_k?e#{3kA49P=Wq2+Kf6NzwXT*@($gzVz=6No0JOzP2
z=AS_RpAV*R{69oWp8LTc^F1Ku(P%&HfcKF<&m|#aJ_&4-%ERqPn@&@PV+w!FZ-G@Y
zME&9O{|f2(oS?7&U&#Lk=JisHUl;O>U!!l(KPi4d5$i6Hf#*X0ZK43e4h294J{0m#
zi2|4lbr}3m-XkG@%qM`j?}2@I{GJzo#9t-FQt<O40)&RB^t^DP|IUa3kl%p?Q@H-0
zl9Epm^;eVH8u%qG){p3a5Wl7j&mnPNg83}=Nrvqq1D_?|=72xu&-1NBQi7e97G&@*
zkb=h^>aWi`4ee3olcU7rpA-DhkKZJYP2i7tXmuxBE0yw(3kUcE=SdaxuRFA9AJl^q
z;0O6SWtc<#n71XwKWs0j19!EI2<F7R&cpxCI-@i24<h<LXqu7&zby^p>-c8+qCNQi
l<NGkQJ?MXhZ=fipLWQGVt>rm#WB={^$3kg!$RQ-Ee*hEc8gl>u

literal 0
HcmV?d00001

diff --git a/build_test/CMakeFiles/3.31.6/CMakeSystem.cmake b/build_test/CMakeFiles/3.31.6/CMakeSystem.cmake
new file mode 100644
index 000000000..b2715a602
--- /dev/null
+++ b/build_test/CMakeFiles/3.31.6/CMakeSystem.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_HOST_SYSTEM "Linux-6.11.0-1018-azure")
+set(CMAKE_HOST_SYSTEM_NAME "Linux")
+set(CMAKE_HOST_SYSTEM_VERSION "6.11.0-1018-azure")
+set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
+
+
+
+set(CMAKE_SYSTEM "Linux-6.11.0-1018-azure")
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_VERSION "6.11.0-1018-azure")
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+
+set(CMAKE_CROSSCOMPILING "FALSE")
+
+set(CMAKE_SYSTEM_LOADED 1)
diff --git a/build_test/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp b/build_test/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
new file mode 100644
index 000000000..3b6e114ca
--- /dev/null
+++ b/build_test/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
@@ -0,0 +1,919 @@
+/* This source file must have a .cpp extension so that all C++ compilers
+   recognize the extension without flags.  Borland does not know .cxx for
+   example.  */
+#ifndef __cplusplus
+# error "A C compiler has been selected for C++."
+#endif
+
+#if !defined(__has_include)
+/* If the compiler does not have __has_include, pretend the answer is
+   always no.  */
+#  define __has_include(x) 0
+#endif
+
+
+/* Version number components: V=Version, R=Revision, P=Patch
+   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
+
+#if defined(__INTEL_COMPILER) || defined(__ICC)
+# define COMPILER_ID "Intel"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# if defined(__GNUC__)
+#  define SIMULATE_ID "GNU"
+# endif
+  /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
+     except that a few beta releases use the old format with V=2021.  */
+# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
+#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
+#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
+#  if defined(__INTEL_COMPILER_UPDATE)
+#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
+#  else
+#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
+#  endif
+# else
+#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
+#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
+   /* The third version component from --version is an update index,
+      but no macro is provided for it.  */
+#  define COMPILER_VERSION_PATCH DEC(0)
+# endif
+# if defined(__INTEL_COMPILER_BUILD_DATE)
+   /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
+#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
+# endif
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# if defined(__GNUC__)
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+# elif defined(__GNUG__)
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+# endif
+# if defined(__GNUC_MINOR__)
+#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
+# define COMPILER_ID "IntelLLVM"
+#if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_ID "GNU"
+#endif
+/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
+ * later.  Look for 6 digit vs. 8 digit version number to decide encoding.
+ * VVVV is no smaller than the current year when a version is released.
+ */
+#if __INTEL_LLVM_COMPILER < 1000000L
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER    % 10)
+#else
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER     % 100)
+#endif
+#if defined(_MSC_VER)
+  /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+#elif defined(__GNUG__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+#endif
+#if defined(__GNUC_MINOR__)
+# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+#endif
+#if defined(__GNUC_PATCHLEVEL__)
+# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+#endif
+
+#elif defined(__PATHCC__)
+# define COMPILER_ID "PathScale"
+# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
+# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
+# if defined(__PATHCC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
+# endif
+
+#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
+# define COMPILER_ID "Embarcadero"
+# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
+# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
+# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
+
+#elif defined(__BORLANDC__)
+# define COMPILER_ID "Borland"
+  /* __BORLANDC__ = 0xVRR */
+# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
+# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
+
+#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
+# define COMPILER_ID "Watcom"
+   /* __WATCOMC__ = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__WATCOMC__)
+# define COMPILER_ID "OpenWatcom"
+   /* __WATCOMC__ = VVRP + 1100 */
+# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__SUNPRO_CC)
+# define COMPILER_ID "SunPro"
+# if __SUNPRO_CC >= 0x5100
+   /* __SUNPRO_CC = 0xVRRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
+# else
+   /* __SUNPRO_CC = 0xVRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
+# endif
+
+#elif defined(__HP_aCC)
+# define COMPILER_ID "HP"
+  /* __HP_aCC = VVRRPP */
+# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
+# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__HP_aCC     % 100)
+
+#elif defined(__DECCXX)
+# define COMPILER_ID "Compaq"
+  /* __DECCXX_VER = VVRRTPPPP */
+# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
+# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000  % 100)
+# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER         % 10000)
+
+#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
+# define COMPILER_ID "zOS"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__open_xl__) && defined(__clang__)
+# define COMPILER_ID "IBMClang"
+# define COMPILER_VERSION_MAJOR DEC(__open_xl_version__)
+# define COMPILER_VERSION_MINOR DEC(__open_xl_release__)
+# define COMPILER_VERSION_PATCH DEC(__open_xl_modification__)
+# define COMPILER_VERSION_TWEAK DEC(__open_xl_ptf_fix_level__)
+
+
+#elif defined(__ibmxl__) && defined(__clang__)
+# define COMPILER_ID "XLClang"
+# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
+# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
+# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
+# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
+
+
+#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
+# define COMPILER_ID "XL"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
+# define COMPILER_ID "VisualAge"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__NVCOMPILER)
+# define COMPILER_ID "NVHPC"
+# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
+# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
+# if defined(__NVCOMPILER_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
+# endif
+
+#elif defined(__PGI)
+# define COMPILER_ID "PGI"
+# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
+# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
+# if defined(__PGIC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
+# endif
+
+#elif defined(__clang__) && defined(__cray__)
+# define COMPILER_ID "CrayClang"
+# define COMPILER_VERSION_MAJOR DEC(__cray_major__)
+# define COMPILER_VERSION_MINOR DEC(__cray_minor__)
+# define COMPILER_VERSION_PATCH DEC(__cray_patchlevel__)
+# define COMPILER_VERSION_INTERNAL_STR __clang_version__
+
+
+#elif defined(_CRAYC)
+# define COMPILER_ID "Cray"
+# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
+# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
+
+#elif defined(__TI_COMPILER_VERSION__)
+# define COMPILER_ID "TI"
+  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
+# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
+# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
+# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
+
+#elif defined(__CLANG_FUJITSU)
+# define COMPILER_ID "FujitsuClang"
+# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# define COMPILER_VERSION_INTERNAL_STR __clang_version__
+
+
+#elif defined(__FUJITSU)
+# define COMPILER_ID "Fujitsu"
+# if defined(__FCC_version__)
+#   define COMPILER_VERSION __FCC_version__
+# elif defined(__FCC_major__)
+#   define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+#   define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+#   define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# endif
+# if defined(__fcc_version)
+#   define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
+# elif defined(__FCC_VERSION)
+#   define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
+# endif
+
+
+#elif defined(__ghs__)
+# define COMPILER_ID "GHS"
+/* __GHS_VERSION_NUMBER = VVVVRP */
+# ifdef __GHS_VERSION_NUMBER
+# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
+# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER      % 10)
+# endif
+
+#elif defined(__TASKING__)
+# define COMPILER_ID "Tasking"
+  # define COMPILER_VERSION_MAJOR DEC(__VERSION__/1000)
+  # define COMPILER_VERSION_MINOR DEC(__VERSION__ % 100)
+# define COMPILER_VERSION_INTERNAL DEC(__VERSION__)
+
+#elif defined(__ORANGEC__)
+# define COMPILER_ID "OrangeC"
+# define COMPILER_VERSION_MAJOR DEC(__ORANGEC_MAJOR__)
+# define COMPILER_VERSION_MINOR DEC(__ORANGEC_MINOR__)
+# define COMPILER_VERSION_PATCH DEC(__ORANGEC_PATCHLEVEL__)
+
+#elif defined(__SCO_VERSION__)
+# define COMPILER_ID "SCO"
+
+#elif defined(__ARMCC_VERSION) && !defined(__clang__)
+# define COMPILER_ID "ARMCC"
+#if __ARMCC_VERSION >= 1000000
+  /* __ARMCC_VERSION = VRRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
+#else
+  /* __ARMCC_VERSION = VRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
+#endif
+
+
+#elif defined(__clang__) && defined(__apple_build_version__)
+# define COMPILER_ID "AppleClang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
+
+#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
+# define COMPILER_ID "ARMClang"
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION/100   % 100)
+# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
+
+#elif defined(__clang__) && defined(__ti__)
+# define COMPILER_ID "TIClang"
+  # define COMPILER_VERSION_MAJOR DEC(__ti_major__)
+  # define COMPILER_VERSION_MINOR DEC(__ti_minor__)
+  # define COMPILER_VERSION_PATCH DEC(__ti_patchlevel__)
+# define COMPILER_VERSION_INTERNAL DEC(__ti_version__)
+
+#elif defined(__clang__)
+# define COMPILER_ID "Clang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+
+#elif defined(__LCC__) && (defined(__GNUC__) || defined(__GNUG__) || defined(__MCST__))
+# define COMPILER_ID "LCC"
+# define COMPILER_VERSION_MAJOR DEC(__LCC__ / 100)
+# define COMPILER_VERSION_MINOR DEC(__LCC__ % 100)
+# if defined(__LCC_MINOR__)
+#  define COMPILER_VERSION_PATCH DEC(__LCC_MINOR__)
+# endif
+# if defined(__GNUC__) && defined(__GNUC_MINOR__)
+#  define SIMULATE_ID "GNU"
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+#  if defined(__GNUC_PATCHLEVEL__)
+#   define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+#  endif
+# endif
+
+#elif defined(__GNUC__) || defined(__GNUG__)
+# define COMPILER_ID "GNU"
+# if defined(__GNUC__)
+#  define COMPILER_VERSION_MAJOR DEC(__GNUC__)
+# else
+#  define COMPILER_VERSION_MAJOR DEC(__GNUG__)
+# endif
+# if defined(__GNUC_MINOR__)
+#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif defined(_MSC_VER)
+# define COMPILER_ID "MSVC"
+  /* _MSC_VER = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
+# if defined(_MSC_FULL_VER)
+#  if _MSC_VER >= 1400
+    /* _MSC_FULL_VER = VVRRPPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
+#  else
+    /* _MSC_FULL_VER = VVRRPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
+#  endif
+# endif
+# if defined(_MSC_BUILD)
+#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
+# endif
+
+#elif defined(_ADI_COMPILER)
+# define COMPILER_ID "ADSP"
+#if defined(__VERSIONNUM__)
+  /* __VERSIONNUM__ = 0xVVRRPPTT */
+#  define COMPILER_VERSION_MAJOR DEC(__VERSIONNUM__ >> 24 & 0xFF)
+#  define COMPILER_VERSION_MINOR DEC(__VERSIONNUM__ >> 16 & 0xFF)
+#  define COMPILER_VERSION_PATCH DEC(__VERSIONNUM__ >> 8 & 0xFF)
+#  define COMPILER_VERSION_TWEAK DEC(__VERSIONNUM__ & 0xFF)
+#endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# define COMPILER_ID "IAR"
+# if defined(__VER__) && defined(__ICCARM__)
+#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
+#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
+#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
+#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
+#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
+#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
+#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
+#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# endif
+
+
+/* These compilers are either not known or too old to define an
+  identification macro.  Try to identify the platform and guess that
+  it is the native compiler.  */
+#elif defined(__hpux) || defined(__hpua)
+# define COMPILER_ID "HP"
+
+#else /* unknown compiler */
+# define COMPILER_ID ""
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
+#ifdef SIMULATE_ID
+char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
+#endif
+
+#ifdef __QNXNTO__
+char const* qnxnto = "INFO" ":" "qnxnto[]";
+#endif
+
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
+#endif
+
+#define STRINGIFY_HELPER(X) #X
+#define STRINGIFY(X) STRINGIFY_HELPER(X)
+
+/* Identify known platforms by name.  */
+#if defined(__linux) || defined(__linux__) || defined(linux)
+# define PLATFORM_ID "Linux"
+
+#elif defined(__MSYS__)
+# define PLATFORM_ID "MSYS"
+
+#elif defined(__CYGWIN__)
+# define PLATFORM_ID "Cygwin"
+
+#elif defined(__MINGW32__)
+# define PLATFORM_ID "MinGW"
+
+#elif defined(__APPLE__)
+# define PLATFORM_ID "Darwin"
+
+#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
+# define PLATFORM_ID "Windows"
+
+#elif defined(__FreeBSD__) || defined(__FreeBSD)
+# define PLATFORM_ID "FreeBSD"
+
+#elif defined(__NetBSD__) || defined(__NetBSD)
+# define PLATFORM_ID "NetBSD"
+
+#elif defined(__OpenBSD__) || defined(__OPENBSD)
+# define PLATFORM_ID "OpenBSD"
+
+#elif defined(__sun) || defined(sun)
+# define PLATFORM_ID "SunOS"
+
+#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
+# define PLATFORM_ID "AIX"
+
+#elif defined(__hpux) || defined(__hpux__)
+# define PLATFORM_ID "HP-UX"
+
+#elif defined(__HAIKU__)
+# define PLATFORM_ID "Haiku"
+
+#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
+# define PLATFORM_ID "BeOS"
+
+#elif defined(__QNX__) || defined(__QNXNTO__)
+# define PLATFORM_ID "QNX"
+
+#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
+# define PLATFORM_ID "Tru64"
+
+#elif defined(__riscos) || defined(__riscos__)
+# define PLATFORM_ID "RISCos"
+
+#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
+# define PLATFORM_ID "SINIX"
+
+#elif defined(__UNIX_SV__)
+# define PLATFORM_ID "UNIX_SV"
+
+#elif defined(__bsdos__)
+# define PLATFORM_ID "BSDOS"
+
+#elif defined(_MPRAS) || defined(MPRAS)
+# define PLATFORM_ID "MP-RAS"
+
+#elif defined(__osf) || defined(__osf__)
+# define PLATFORM_ID "OSF1"
+
+#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
+# define PLATFORM_ID "SCO_SV"
+
+#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
+# define PLATFORM_ID "ULTRIX"
+
+#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
+# define PLATFORM_ID "Xenix"
+
+#elif defined(__WATCOMC__)
+# if defined(__LINUX__)
+#  define PLATFORM_ID "Linux"
+
+# elif defined(__DOS__)
+#  define PLATFORM_ID "DOS"
+
+# elif defined(__OS2__)
+#  define PLATFORM_ID "OS2"
+
+# elif defined(__WINDOWS__)
+#  define PLATFORM_ID "Windows3x"
+
+# elif defined(__VXWORKS__)
+#  define PLATFORM_ID "VxWorks"
+
+# else /* unknown platform */
+#  define PLATFORM_ID
+# endif
+
+#elif defined(__INTEGRITY)
+# if defined(INT_178B)
+#  define PLATFORM_ID "Integrity178"
+
+# else /* regular Integrity */
+#  define PLATFORM_ID "Integrity"
+# endif
+
+# elif defined(_ADI_COMPILER)
+#  define PLATFORM_ID "ADSP"
+
+#else /* unknown platform */
+# define PLATFORM_ID
+
+#endif
+
+/* For windows compilers MSVC and Intel we can determine
+   the architecture of the compiler being used.  This is because
+   the compilers do not have flags that can change the architecture,
+   but rather depend on which compiler is being used
+*/
+#if defined(_WIN32) && defined(_MSC_VER)
+# if defined(_M_IA64)
+#  define ARCHITECTURE_ID "IA64"
+
+# elif defined(_M_ARM64EC)
+#  define ARCHITECTURE_ID "ARM64EC"
+
+# elif defined(_M_X64) || defined(_M_AMD64)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# elif defined(_M_ARM64)
+#  define ARCHITECTURE_ID "ARM64"
+
+# elif defined(_M_ARM)
+#  if _M_ARM == 4
+#   define ARCHITECTURE_ID "ARMV4I"
+#  elif _M_ARM == 5
+#   define ARCHITECTURE_ID "ARMV5I"
+#  else
+#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
+#  endif
+
+# elif defined(_M_MIPS)
+#  define ARCHITECTURE_ID "MIPS"
+
+# elif defined(_M_SH)
+#  define ARCHITECTURE_ID "SHx"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__WATCOMC__)
+# if defined(_M_I86)
+#  define ARCHITECTURE_ID "I86"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# if defined(__ICCARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__ICCRX__)
+#  define ARCHITECTURE_ID "RX"
+
+# elif defined(__ICCRH850__)
+#  define ARCHITECTURE_ID "RH850"
+
+# elif defined(__ICCRL78__)
+#  define ARCHITECTURE_ID "RL78"
+
+# elif defined(__ICCRISCV__)
+#  define ARCHITECTURE_ID "RISCV"
+
+# elif defined(__ICCAVR__)
+#  define ARCHITECTURE_ID "AVR"
+
+# elif defined(__ICC430__)
+#  define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__ICCV850__)
+#  define ARCHITECTURE_ID "V850"
+
+# elif defined(__ICC8051__)
+#  define ARCHITECTURE_ID "8051"
+
+# elif defined(__ICCSTM8__)
+#  define ARCHITECTURE_ID "STM8"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__ghs__)
+# if defined(__PPC64__)
+#  define ARCHITECTURE_ID "PPC64"
+
+# elif defined(__ppc__)
+#  define ARCHITECTURE_ID "PPC"
+
+# elif defined(__ARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__x86_64__)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(__i386__)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__clang__) && defined(__ti__)
+# if defined(__ARM_ARCH)
+#  define ARCHITECTURE_ID "ARM"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__TI_COMPILER_VERSION__)
+# if defined(__TI_ARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__MSP430__)
+#  define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__TMS320C28XX__)
+#  define ARCHITECTURE_ID "TMS320C28x"
+
+# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
+#  define ARCHITECTURE_ID "TMS320C6x"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+# elif defined(__ADSPSHARC__)
+#  define ARCHITECTURE_ID "SHARC"
+
+# elif defined(__ADSPBLACKFIN__)
+#  define ARCHITECTURE_ID "Blackfin"
+
+#elif defined(__TASKING__)
+
+# if defined(__CTC__) || defined(__CPTC__)
+#  define ARCHITECTURE_ID "TriCore"
+
+# elif defined(__CMCS__)
+#  define ARCHITECTURE_ID "MCS"
+
+# elif defined(__CARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__CARC__)
+#  define ARCHITECTURE_ID "ARC"
+
+# elif defined(__C51__)
+#  define ARCHITECTURE_ID "8051"
+
+# elif defined(__CPCP__)
+#  define ARCHITECTURE_ID "PCP"
+
+# else
+#  define ARCHITECTURE_ID ""
+# endif
+
+#else
+#  define ARCHITECTURE_ID
+#endif
+
+/* Convert integer to decimal digit literals.  */
+#define DEC(n)                   \
+  ('0' + (((n) / 10000000)%10)), \
+  ('0' + (((n) / 1000000)%10)),  \
+  ('0' + (((n) / 100000)%10)),   \
+  ('0' + (((n) / 10000)%10)),    \
+  ('0' + (((n) / 1000)%10)),     \
+  ('0' + (((n) / 100)%10)),      \
+  ('0' + (((n) / 10)%10)),       \
+  ('0' +  ((n) % 10))
+
+/* Convert integer to hex digit literals.  */
+#define HEX(n)             \
+  ('0' + ((n)>>28 & 0xF)), \
+  ('0' + ((n)>>24 & 0xF)), \
+  ('0' + ((n)>>20 & 0xF)), \
+  ('0' + ((n)>>16 & 0xF)), \
+  ('0' + ((n)>>12 & 0xF)), \
+  ('0' + ((n)>>8  & 0xF)), \
+  ('0' + ((n)>>4  & 0xF)), \
+  ('0' + ((n)     & 0xF))
+
+/* Construct a string literal encoding the version number. */
+#ifdef COMPILER_VERSION
+char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
+
+/* Construct a string literal encoding the version number components. */
+#elif defined(COMPILER_VERSION_MAJOR)
+char const info_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
+  COMPILER_VERSION_MAJOR,
+# ifdef COMPILER_VERSION_MINOR
+  '.', COMPILER_VERSION_MINOR,
+#  ifdef COMPILER_VERSION_PATCH
+   '.', COMPILER_VERSION_PATCH,
+#   ifdef COMPILER_VERSION_TWEAK
+    '.', COMPILER_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct a string literal encoding the internal version number. */
+#ifdef COMPILER_VERSION_INTERNAL
+char const info_version_internal[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
+  'i','n','t','e','r','n','a','l','[',
+  COMPILER_VERSION_INTERNAL,']','\0'};
+#elif defined(COMPILER_VERSION_INTERNAL_STR)
+char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
+#endif
+
+/* Construct a string literal encoding the version number components. */
+#ifdef SIMULATE_VERSION_MAJOR
+char const info_simulate_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
+  SIMULATE_VERSION_MAJOR,
+# ifdef SIMULATE_VERSION_MINOR
+  '.', SIMULATE_VERSION_MINOR,
+#  ifdef SIMULATE_VERSION_PATCH
+   '.', SIMULATE_VERSION_PATCH,
+#   ifdef SIMULATE_VERSION_TWEAK
+    '.', SIMULATE_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
+char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
+
+
+
+#define CXX_STD_98 199711L
+#define CXX_STD_11 201103L
+#define CXX_STD_14 201402L
+#define CXX_STD_17 201703L
+#define CXX_STD_20 202002L
+#define CXX_STD_23 202302L
+
+#if defined(__INTEL_COMPILER) && defined(_MSVC_LANG)
+#  if _MSVC_LANG > CXX_STD_17
+#    define CXX_STD _MSVC_LANG
+#  elif _MSVC_LANG == CXX_STD_17 && defined(__cpp_aggregate_paren_init)
+#    define CXX_STD CXX_STD_20
+#  elif _MSVC_LANG > CXX_STD_14 && __cplusplus > CXX_STD_17
+#    define CXX_STD CXX_STD_20
+#  elif _MSVC_LANG > CXX_STD_14
+#    define CXX_STD CXX_STD_17
+#  elif defined(__INTEL_CXX11_MODE__) && defined(__cpp_aggregate_nsdmi)
+#    define CXX_STD CXX_STD_14
+#  elif defined(__INTEL_CXX11_MODE__)
+#    define CXX_STD CXX_STD_11
+#  else
+#    define CXX_STD CXX_STD_98
+#  endif
+#elif defined(_MSC_VER) && defined(_MSVC_LANG)
+#  if _MSVC_LANG > __cplusplus
+#    define CXX_STD _MSVC_LANG
+#  else
+#    define CXX_STD __cplusplus
+#  endif
+#elif defined(__NVCOMPILER)
+#  if __cplusplus == CXX_STD_17 && defined(__cpp_aggregate_paren_init)
+#    define CXX_STD CXX_STD_20
+#  else
+#    define CXX_STD __cplusplus
+#  endif
+#elif defined(__INTEL_COMPILER) || defined(__PGI)
+#  if __cplusplus == CXX_STD_11 && defined(__cpp_namespace_attributes)
+#    define CXX_STD CXX_STD_17
+#  elif __cplusplus == CXX_STD_11 && defined(__cpp_aggregate_nsdmi)
+#    define CXX_STD CXX_STD_14
+#  else
+#    define CXX_STD __cplusplus
+#  endif
+#elif (defined(__IBMCPP__) || defined(__ibmxl__)) && defined(__linux__)
+#  if __cplusplus == CXX_STD_11 && defined(__cpp_aggregate_nsdmi)
+#    define CXX_STD CXX_STD_14
+#  else
+#    define CXX_STD __cplusplus
+#  endif
+#elif __cplusplus == 1 && defined(__GXX_EXPERIMENTAL_CXX0X__)
+#  define CXX_STD CXX_STD_11
+#else
+#  define CXX_STD __cplusplus
+#endif
+
+const char* info_language_standard_default = "INFO" ":" "standard_default["
+#if CXX_STD > CXX_STD_23
+  "26"
+#elif CXX_STD > CXX_STD_20
+  "23"
+#elif CXX_STD > CXX_STD_17
+  "20"
+#elif CXX_STD > CXX_STD_14
+  "17"
+#elif CXX_STD > CXX_STD_11
+  "14"
+#elif CXX_STD >= CXX_STD_11
+  "11"
+#else
+  "98"
+#endif
+"]";
+
+const char* info_language_extensions_default = "INFO" ":" "extensions_default["
+#if (defined(__clang__) || defined(__GNUC__) || defined(__xlC__) ||           \
+     defined(__TI_COMPILER_VERSION__)) &&                                     \
+  !defined(__STRICT_ANSI__)
+  "ON"
+#else
+  "OFF"
+#endif
+"]";
+
+/*--------------------------------------------------------------------------*/
+
+int main(int argc, char* argv[])
+{
+  int require = 0;
+  require += info_compiler[argc];
+  require += info_platform[argc];
+  require += info_arch[argc];
+#ifdef COMPILER_VERSION_MAJOR
+  require += info_version[argc];
+#endif
+#ifdef COMPILER_VERSION_INTERNAL
+  require += info_version_internal[argc];
+#endif
+#ifdef SIMULATE_ID
+  require += info_simulate[argc];
+#endif
+#ifdef SIMULATE_VERSION_MAJOR
+  require += info_simulate_version[argc];
+#endif
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+  require += info_cray[argc];
+#endif
+  require += info_language_standard_default[argc];
+  require += info_language_extensions_default[argc];
+  (void)argv;
+  return require;
+}
diff --git a/build_test/CMakeFiles/3.31.6/CompilerIdCXX/a.out b/build_test/CMakeFiles/3.31.6/CompilerIdCXX/a.out
new file mode 100755
index 0000000000000000000000000000000000000000..c8ced32cf082708045baa23211fbf858c298928d
GIT binary patch
literal 16096
zcmeHOeQX>@6`woj!=X-macg3d(k!8=99nPAj^nz8kaO&_*T^4f;*@}ER%_qdcj7+G
z-X66pNQ2TsjBC`;3i?Npq6&ckRRRf$sMO%Js8y?i5($YQ0Wu#EK}uUAK4e1V<Gq>p
z*6ZaQ1oRIi_F3LH@Ap1t_RZ|x?C#9N$-eGrBqErq#0LdRiI_qXq&Ryw6@Vo~yVwlJ
zcZ*xa29VcDOz9JffmYF_=xSa~colH;YrsMUeyf6^21VRL<mk5+rLjRk%mtkX`mIL=
z$wB^Ws(?A`z4|nC2GZow<ByRabH5)pWwA-wFCJLU4a&=5;_Qc_JOy3ZLw6`5K2P;A
z=X_#L@V}k%8RT&a!#wDhCchx>B0uI>2h!2YZt6d&?=bnjuE{VW$nR3HV9xd32Y%GG
zWN~B0-F$@VTdN;plz--wUa>cu8EtFbn@u%kGx^d~(^Pv~Q(LQEEa)w=Vr-WN|2U?4
z295~`GmjXhQAAHFnd71E7Sf~r3)WM^-*Yd|tslBNKJntNUw+`kwO7yv+l@YGgM{&T
zh@gyRtP^ciK0X5_8r#4x+CRxjV2uO%)m6}S0;W~K%{B1+8u-nC@2U_-m?mU&%q+T=
z<C-}ulLusM$}-0@c`KWF$QG!^{I-dnzTQKfW{cjU@Au04T7}s=)NiJ2$DYU(UE3Mz
z@5~nR_K-E2wIS9-u8^nbrZTN)h#8E?Kh;wakg>fyUP{|Dn=tD*{t)}_nJ+<_qj1Ml
z#Md!jKiXD>FVXeQ_yPs2PAEO&EXM-4rYXCI0PYa31@O-i-Wb52AUqzxpC$a#K_Lmp
z4vqz;1s{%MjOmIG=dq2tMIVmimTAd{%lj=WLLO!y%s`ldFau!*!VH8N2s7|Mk%2$e
z-geD6b+y`<UH|jFLKu(EyV3Fm<J6C;Uy|)B?|%m1^6sy~v36%dpnZAwIgrL{cXkOW
zH^0$4bMa%w%x{cSzgs*!lx&`Fe$|*e@EQat*B8O`&*OUS&PQZCz|R9>%&mVO**!~c
zJyd-^mZ9oR<%QavC(-aF;$VM9+VB57vOUYj%%XAr&4b4Ir79!xvT<?Qy#)g7rU2FD
z1=TM0$M&8)&<|=+y7QQE>Od5W#>{26#+W^@0fZ}i%H{Hv6dYcbVIm{o>(!6`e|Qj-
zSU3iLGoQX{%#;>hNnXch8ngAU!IS!I@~ZKa5xG$NoTxoFA4y&Z{P{KTZ&t!pfVui-
zw?LYoTNm@9JW|OTqPvyw+2r*R=r(Ms>{G87v8f@283;2FW+2Q!n1L_@VFtnsgc%4k
z5N06E!2fdw@cY+|sCS@y@ZPaPZZea#oniPYIkMV%mEQcM?G!VG{BT@S^FCb_;$9&>
zBBaM;)^f)SPHwmlzpfH!Ib-QzD#Lfee9CfC@WF4~DrMc_=DSH_Pq}s;YbkoV!2#K-
z$d0P_H$wC9d(_Zd<?;i-Q^4`fg9{v9SBR0ta`|cC_$?MG^3V|xnTkbr)NHJN96pF4
zj%yAY!Tt_3=-Md1<lPR%R`_3hvs{+ImRR?eh7Z-=^kDT#ad7)R@7s4fenyo3Snnma
zLl6jKy72!4i2Dr$l3QY*jdpI{5IqYuBM?%UfiMGM2Eq)483;2FW+2Q!n1L_@VFupb
z4DfnIUZ2Qo0Oi9AR8_;((fY;BB>$AwIlhZzUI)2@WPXI%PBO2D#OEF)*8gR>TtNBT
zw3v|B2&VC&4G7mIB3&Z=JCrC+6TgXg1Mzy|%*aj5(>lbBq=-{R+>UlSaaimriR0Zy
zGTZ&VtlA6a5?Ur%EhdK#+$(zN36GcZ{1)ka{zfv#qwsGZ<MrYHWkg<=s%a_^uRG;+
zro66{*OB&gcHXNs9vdy?-I4|m`tXF`)K-#W%ZZj&J>I&9;2Sp#yJ4O9V>xJr{SpDq
zW7MG<8Q}WjO7_@qQL#l#(zqpap%H#IfbS!muLHL4g+fF$i1vg+uzg6l8ao0{_dKp8
z2!~I>Ki13F72~I&5D_;EzD^kbIut6k|D3dsiG-#sTNHx`mF+J89)XqIr{6<{K2|CI
zucSR(ErId!d+E2;TZhkKu1WiMde;%-F-S-q3qIZixaO0&cwFM!gh()=crV~FvCYdf
zYYzin7p)b1zhV4-vJb`?lkwSVg*$+6jcyY>u37Ui;!v~D6hfD&_=3c@iQxL{rwI?P
zr+xwO7>tudf+H*b0N`~n9uhR(<U1r#y-0ClWY7153lxXP8%O&E#o0smUHQ%kl(;_y
z&nsyE2E}g-#IK2Zr^=xvzXR}Hs}Lo00A3e`yKLZk=>dEz^p}=UcHDk(bj)#^^#ZKG
zw?;FjYfT6Mif(CqTptrFtMyGcXO7`|{UTVV3g$$%FluGZlv{9$rd65}_>M7ayLL*C
zSGK^N0vXeC9BbON^R6>3#vLnXo2gPRHw`X6$plMxm1$?c^>MrN`0-A9li8cn$0jF*
z`O&`SmP~%Uz;7-gPWO?H{-l{4=rUm+LDxqHI{JG%0ftwfX3`+7(RD<aJ$-|RI{M7P
z?(U<>A#<qXP+t-}g4-Mtyqn=)?O?D|mTL)lmJkI6wVeTk)q5MvRIy;D;q@r)d*~em
zt5ha$mWp;t$W!5Wt4hjR`H7M>VVnQ_-c&#y$%o(YLS>`HB2`SgG+?6zr9+1I0tR2v
z-eA|o>a8ALN^paR>?_q&eE%ziUYyRk)+lh-Q9RA1Odj@qObR_;aBY1eU(zR?!ldoE
z(>`dllz~k<nG``ChkBcEP)hT(RZI&#HJyhl6n7n^p%>Sy1QT?Qowd+G=s2W=KABYq
zeWCyb7ji0e9G75Oko~9IX&Q;?6!^2G{MC?D9$bdtRxUFJ&B5;1A^Spy-pIiauW)((
z+Yrvr;MU;1<qz(+<M|l}Mq59<7X+L`!R0S$t$k&r_U3skw?V=0AKYJt@74Xp_hZKJ
z_t@{x^8w}>8xjxte;Dw;!W@j-&+|^^TtCk{z55!)vw-8All^&K%KUM%!!}~>*q`T<
z8NhG~!~Q(aWqulTehTLQ6QIO7Cj0Zek~z=Ux&3U%`~>*poRwvsw=$1Y<-zuIo93W^
zIc0yIM>FSnG}j+I|1X0to)hc6-xd0O;pYc1kreE|uK?=z*T|1KiR8WVv&Hx`0slBD
zn6n)RV43;10{#h7F#lqp!`P4GeJ9}0^BU&-e8u*`^Z!2ibN+=!mc(Brkr}}(iXTD=
zo5=pJlL7O)JWEvw*8gLG{r*ej&-}@NKleYwKZ63SY4!F+@_d;0V+QS6X8v37t@Ziy
z{ClYhKp?hL(u&OZTcE(PM~@LJ^Iup$i!@LDhvOfK{kR{$1{j*KKR;K_??r1N67slm
zV1MRIpz`~B4sqqvzTzrN?8opj6cFS3dEVDf{y}>>9d;L003b%@9?t%EdWb5pzn}Bi
z@tdY8Am0b^I>u)eZV%u8HUY+M_xmUCV=B;nf#6)P(&C)6vi}+UVF9WMI0QuT55M$T
ASpWb4

literal 0
HcmV?d00001

diff --git a/build_test/CMakeFiles/CMakeConfigureLog.yaml b/build_test/CMakeFiles/CMakeConfigureLog.yaml
new file mode 100644
index 000000000..5bbed262c
--- /dev/null
+++ b/build_test/CMakeFiles/CMakeConfigureLog.yaml
@@ -0,0 +1,294 @@
+
+---
+events:
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineSystem.cmake:205 (message)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      The system is: Linux - 6.11.0-1018-azure - x86_64
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerId.cmake:17 (message)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerId.cmake:64 (__determine_compiler_id_test)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCXXCompiler.cmake:126 (CMAKE_DETERMINE_COMPILER_ID)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      Compiling the CXX compiler identification source file "CMakeCXXCompilerId.cpp" succeeded.
+      Compiler: /usr/bin/c++ 
+      Build flags: 
+      Id flags:  
+      
+      The output was:
+      0
+      
+      
+      Compilation of the CXX compiler identification source "CMakeCXXCompilerId.cpp" produced "a.out"
+      
+      The CXX compiler identification is GNU, found in:
+        /home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/3.31.6/CompilerIdCXX/a.out
+      
+  -
+    kind: "try_compile-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:74 (try_compile)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
+      - "CMakeLists.txt:5 (project)"
+    checks:
+      - "Detecting CXX compiler ABI info"
+    directories:
+      source: "/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3"
+      binary: "/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3"
+    cmakeVariables:
+      CMAKE_CXX_FLAGS: ""
+      CMAKE_CXX_FLAGS_DEBUG: "-g"
+      CMAKE_CXX_SCAN_FOR_MODULES: "OFF"
+      CMAKE_EXE_LINKER_FLAGS: ""
+    buildResult:
+      variable: "CMAKE_CXX_ABI_COMPILED"
+      cached: true
+      stdout: |
+        Change Dir: '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3'
+        
+        Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_ba2ae/fast
+        /usr/bin/gmake  -f CMakeFiles/cmTC_ba2ae.dir/build.make CMakeFiles/cmTC_ba2ae.dir/build
+        gmake[1]: Entering directory '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3'
+        Building CXX object CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o
+        /usr/bin/c++   -v -o CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -c /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp
+        Using built-in specs.
+        COLLECT_GCC=/usr/bin/c++
+        OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
+        OFFLOAD_TARGET_DEFAULT=1
+        Target: x86_64-linux-gnu
+        Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
+        Thread model: posix
+        Supported LTO compression algorithms: zlib zstd
+        gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) 
+        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/'
+         /usr/libexec/gcc/x86_64-linux-gnu/13/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -D_GNU_SOURCE /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp -quiet -dumpdir CMakeFiles/cmTC_ba2ae.dir/ -dumpbase CMakeCXXCompilerABI.cpp.cpp -dumpbase-ext .cpp -mtune=generic -march=x86-64 -version -fasynchronous-unwind-tables -fstack-protector-strong -Wformat -Wformat-security -fstack-clash-protection -fcf-protection -o /tmp/cckrLaf7.s
+        GNU C++17 (Ubuntu 13.3.0-6ubuntu2~24.04) version 13.3.0 (x86_64-linux-gnu)
+        	compiled by GNU C version 13.3.0, GMP version 6.3.0, MPFR version 4.2.1, MPC version 1.3.1, isl version isl-0.26-GMP
+        
+        GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
+        ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/13"
+        ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"
+        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed/x86_64-linux-gnu"
+        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed"
+        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/../../../../x86_64-linux-gnu/include"
+        #include "..." search starts here:
+        #include <...> search starts here:
+         /usr/include/c++/13
+         /usr/include/x86_64-linux-gnu/c++/13
+         /usr/include/c++/13/backward
+         /usr/lib/gcc/x86_64-linux-gnu/13/include
+         /usr/local/include
+         /usr/include/x86_64-linux-gnu
+         /usr/include
+        End of search list.
+        Compiler executable checksum: c81c05345ce537099dafd5580045814a
+        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/'
+         as -v --64 -o CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o /tmp/cckrLaf7.s
+        GNU assembler version 2.42 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.42
+        COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/
+        LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/
+        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.'
+        Linking CXX executable cmTC_ba2ae
+        /usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_ba2ae.dir/link.txt --verbose=1
+        Using built-in specs.
+        COLLECT_GCC=/usr/bin/c++
+        COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper
+        OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
+        OFFLOAD_TARGET_DEFAULT=1
+        Target: x86_64-linux-gnu
+        Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
+        Thread model: posix
+        Supported LTO compression algorithms: zlib zstd
+        gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) 
+        COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/
+        LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/
+        COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_ba2ae' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_ba2ae.'
+         /usr/libexec/gcc/x86_64-linux-gnu/13/collect2 -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/cczMQRrO.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_ba2ae /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o
+        collect2 version 13.3.0
+        /usr/bin/ld -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/cczMQRrO.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_ba2ae /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o
+        GNU ld (GNU Binutils for Ubuntu) 2.42
+        COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_ba2ae' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_ba2ae.'
+        /usr/bin/c++  -v -Wl,-v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -o cmTC_ba2ae
+        gmake[1]: Leaving directory '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3'
+        
+      exitCode: 0
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:182 (message)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      Parsed CXX implicit include dir info: rv=done
+        found start of include info
+        found start of implicit include info
+          add: [/usr/include/c++/13]
+          add: [/usr/include/x86_64-linux-gnu/c++/13]
+          add: [/usr/include/c++/13/backward]
+          add: [/usr/lib/gcc/x86_64-linux-gnu/13/include]
+          add: [/usr/local/include]
+          add: [/usr/include/x86_64-linux-gnu]
+          add: [/usr/include]
+        end of search list found
+        collapse include dir [/usr/include/c++/13] ==> [/usr/include/c++/13]
+        collapse include dir [/usr/include/x86_64-linux-gnu/c++/13] ==> [/usr/include/x86_64-linux-gnu/c++/13]
+        collapse include dir [/usr/include/c++/13/backward] ==> [/usr/include/c++/13/backward]
+        collapse include dir [/usr/lib/gcc/x86_64-linux-gnu/13/include] ==> [/usr/lib/gcc/x86_64-linux-gnu/13/include]
+        collapse include dir [/usr/local/include] ==> [/usr/local/include]
+        collapse include dir [/usr/include/x86_64-linux-gnu] ==> [/usr/include/x86_64-linux-gnu]
+        collapse include dir [/usr/include] ==> [/usr/include]
+        implicit include dirs: [/usr/include/c++/13;/usr/include/x86_64-linux-gnu/c++/13;/usr/include/c++/13/backward;/usr/lib/gcc/x86_64-linux-gnu/13/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include]
+      
+      
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:218 (message)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      Parsed CXX implicit link information:
+        link line regex: [^( *|.*[/\\])(ld[0-9]*(\\.[a-z]+)?|CMAKE_LINK_STARTFILE-NOTFOUND|([^/\\]+-)?ld|collect2)[^/\\]*( |$)]
+        linker tool regex: [^[ 	]*(->|")?[ 	]*(([^"]*[/\\])?(ld[0-9]*(\\.[a-z]+)?))("|,| |$)]
+        ignore line: [Change Dir: '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3']
+        ignore line: []
+        ignore line: [Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_ba2ae/fast]
+        ignore line: [/usr/bin/gmake  -f CMakeFiles/cmTC_ba2ae.dir/build.make CMakeFiles/cmTC_ba2ae.dir/build]
+        ignore line: [gmake[1]: Entering directory '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3']
+        ignore line: [Building CXX object CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o]
+        ignore line: [/usr/bin/c++   -v -o CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -c /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp]
+        ignore line: [Using built-in specs.]
+        ignore line: [COLLECT_GCC=/usr/bin/c++]
+        ignore line: [OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa]
+        ignore line: [OFFLOAD_TARGET_DEFAULT=1]
+        ignore line: [Target: x86_64-linux-gnu]
+        ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c ada c++ go d fortran objc obj-c++ m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32 m64 mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2]
+        ignore line: [Thread model: posix]
+        ignore line: [Supported LTO compression algorithms: zlib zstd]
+        ignore line: [gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) ]
+        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/']
+        ignore line: [ /usr/libexec/gcc/x86_64-linux-gnu/13/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -D_GNU_SOURCE /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp -quiet -dumpdir CMakeFiles/cmTC_ba2ae.dir/ -dumpbase CMakeCXXCompilerABI.cpp.cpp -dumpbase-ext .cpp -mtune=generic -march=x86-64 -version -fasynchronous-unwind-tables -fstack-protector-strong -Wformat -Wformat-security -fstack-clash-protection -fcf-protection -o /tmp/cckrLaf7.s]
+        ignore line: [GNU C++17 (Ubuntu 13.3.0-6ubuntu2~24.04) version 13.3.0 (x86_64-linux-gnu)]
+        ignore line: [	compiled by GNU C version 13.3.0  GMP version 6.3.0  MPFR version 4.2.1  MPC version 1.3.1  isl version isl-0.26-GMP]
+        ignore line: []
+        ignore line: [GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072]
+        ignore line: [ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/13"]
+        ignore line: [ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"]
+        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed/x86_64-linux-gnu"]
+        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed"]
+        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/../../../../x86_64-linux-gnu/include"]
+        ignore line: [#include "..." search starts here:]
+        ignore line: [#include <...> search starts here:]
+        ignore line: [ /usr/include/c++/13]
+        ignore line: [ /usr/include/x86_64-linux-gnu/c++/13]
+        ignore line: [ /usr/include/c++/13/backward]
+        ignore line: [ /usr/lib/gcc/x86_64-linux-gnu/13/include]
+        ignore line: [ /usr/local/include]
+        ignore line: [ /usr/include/x86_64-linux-gnu]
+        ignore line: [ /usr/include]
+        ignore line: [End of search list.]
+        ignore line: [Compiler executable checksum: c81c05345ce537099dafd5580045814a]
+        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/']
+        ignore line: [ as -v --64 -o CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o /tmp/cckrLaf7.s]
+        ignore line: [GNU assembler version 2.42 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.42]
+        ignore line: [COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/]
+        ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/]
+        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.']
+        ignore line: [Linking CXX executable cmTC_ba2ae]
+        ignore line: [/usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_ba2ae.dir/link.txt --verbose=1]
+        ignore line: [Using built-in specs.]
+        ignore line: [COLLECT_GCC=/usr/bin/c++]
+        ignore line: [COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper]
+        ignore line: [OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa]
+        ignore line: [OFFLOAD_TARGET_DEFAULT=1]
+        ignore line: [Target: x86_64-linux-gnu]
+        ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c ada c++ go d fortran objc obj-c++ m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32 m64 mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2]
+        ignore line: [Thread model: posix]
+        ignore line: [Supported LTO compression algorithms: zlib zstd]
+        ignore line: [gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) ]
+        ignore line: [COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/]
+        ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/]
+        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_ba2ae' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_ba2ae.']
+        link line: [ /usr/libexec/gcc/x86_64-linux-gnu/13/collect2 -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/cczMQRrO.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_ba2ae /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
+          arg [/usr/libexec/gcc/x86_64-linux-gnu/13/collect2] ==> ignore
+          arg [-plugin] ==> ignore
+          arg [/usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so] ==> ignore
+          arg [-plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper] ==> ignore
+          arg [-plugin-opt=-fresolution=/tmp/cczMQRrO.res] ==> ignore
+          arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
+          arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
+          arg [-plugin-opt=-pass-through=-lc] ==> ignore
+          arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
+          arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
+          arg [--build-id] ==> ignore
+          arg [--eh-frame-hdr] ==> ignore
+          arg [-m] ==> ignore
+          arg [elf_x86_64] ==> ignore
+          arg [--hash-style=gnu] ==> ignore
+          arg [--as-needed] ==> ignore
+          arg [-dynamic-linker] ==> ignore
+          arg [/lib64/ld-linux-x86-64.so.2] ==> ignore
+          arg [-pie] ==> ignore
+          arg [-znow] ==> ignore
+          arg [-zrelro] ==> ignore
+          arg [-o] ==> ignore
+          arg [cmTC_ba2ae] ==> ignore
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o]
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o]
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o]
+          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13]
+          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu]
+          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib]
+          arg [-L/lib/x86_64-linux-gnu] ==> dir [/lib/x86_64-linux-gnu]
+          arg [-L/lib/../lib] ==> dir [/lib/../lib]
+          arg [-L/usr/lib/x86_64-linux-gnu] ==> dir [/usr/lib/x86_64-linux-gnu]
+          arg [-L/usr/lib/../lib] ==> dir [/usr/lib/../lib]
+          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../..] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../..]
+          arg [-v] ==> ignore
+          arg [CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o] ==> ignore
+          arg [-lstdc++] ==> lib [stdc++]
+          arg [-lm] ==> lib [m]
+          arg [-lgcc_s] ==> lib [gcc_s]
+          arg [-lgcc] ==> lib [gcc]
+          arg [-lc] ==> lib [c]
+          arg [-lgcc_s] ==> lib [gcc_s]
+          arg [-lgcc] ==> lib [gcc]
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o]
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
+        ignore line: [collect2 version 13.3.0]
+        ignore line: [/usr/bin/ld -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/cczMQRrO.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_ba2ae /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
+        linker tool for 'CXX': /usr/bin/ld
+        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o] ==> [/usr/lib/x86_64-linux-gnu/Scrt1.o]
+        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o] ==> [/usr/lib/x86_64-linux-gnu/crti.o]
+        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o] ==> [/usr/lib/x86_64-linux-gnu/crtn.o]
+        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13] ==> [/usr/lib/gcc/x86_64-linux-gnu/13]
+        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
+        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib] ==> [/usr/lib]
+        collapse library dir [/lib/x86_64-linux-gnu] ==> [/lib/x86_64-linux-gnu]
+        collapse library dir [/lib/../lib] ==> [/lib]
+        collapse library dir [/usr/lib/x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
+        collapse library dir [/usr/lib/../lib] ==> [/usr/lib]
+        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../..] ==> [/usr/lib]
+        implicit libs: [stdc++;m;gcc_s;gcc;c;gcc_s;gcc]
+        implicit objs: [/usr/lib/x86_64-linux-gnu/Scrt1.o;/usr/lib/x86_64-linux-gnu/crti.o;/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o;/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o;/usr/lib/x86_64-linux-gnu/crtn.o]
+        implicit dirs: [/usr/lib/gcc/x86_64-linux-gnu/13;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib]
+        implicit fwks: []
+      
+      
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/Internal/CMakeDetermineLinkerId.cmake:40 (message)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:255 (cmake_determine_linker_id)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      Running the CXX compiler's linker: "/usr/bin/ld" "-v"
+      GNU ld (GNU Binutils for Ubuntu) 2.42
+...
diff --git a/build_test/CMakeFiles/cmake.check_cache b/build_test/CMakeFiles/cmake.check_cache
new file mode 100644
index 000000000..3dccd7317
--- /dev/null
+++ b/build_test/CMakeFiles/cmake.check_cache
@@ -0,0 +1 @@
+# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/build_test/include/mscclpp/version.hpp b/build_test/include/mscclpp/version.hpp
new file mode 100644
index 000000000..4c1c9a14a
--- /dev/null
+++ b/build_test/include/mscclpp/version.hpp
@@ -0,0 +1,13 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_VERSION_HPP_
+#define MSCCLPP_VERSION_HPP_
+
+#define MSCCLPP_MAJOR 0
+#define MSCCLPP_MINOR 8
+#define MSCCLPP_PATCH 0
+#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
+#define MSCCLPP_GIT_COMMIT "1818709de0a5"
+
+#endif  // MSCCLPP_VERSION_HPP_

From 0eae34c53dcaf32b1fc204920e037ea69b46f91f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 01:13:29 +0000
Subject: [PATCH 010/132] Fix test framework for building with Docker

- Make MPI REQUIRED for test builds (clearer error messages)
- Add project include directories to test_framework library
- Fix core_tests.cc to use custom framework correctly
- Fix mp_unit_tests.hpp to use mscclpp::test namespace
- Add FAIL() macro with streaming support for test messages
- Building tests now works in Docker environment with GPU bypass

Tests can now be built using:
docker run --rm -v $(pwd):/workspace -w /workspace \
  ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 bash -c \
  "mkdir build && cd build && cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON \
   -DMSCCLPP_USE_CUDA=ON .. && make -j"

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/CMakeLists.txt            |  4 ++--
 test/framework.hpp             | 33 ++++++++++++++++++++++++++++-----
 test/mp_unit/mp_unit_tests.hpp |  4 ++--
 test/unit/core_tests.cc        |  6 ++----
 4 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7c4e9684e..20c469d70 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-find_package(MPI)
+find_package(MPI REQUIRED)
 
 set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads)
 if(MSCCLPP_USE_IB)
@@ -40,7 +40,7 @@ include(CTest)
 
 # Build test framework library
 add_library(test_framework STATIC framework.cc)
-target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${TEST_INC_COMMON})
 target_link_libraries(test_framework PUBLIC MPI::MPI_CXX)
 
 # Unit tests
diff --git a/test/framework.hpp b/test/framework.hpp
index 4b953e379..cfd9ecf6f 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -366,11 +366,34 @@ void reportSuccess();
     }                                                                                                      \
   } while (0)
 
-#define FAIL()                                                                \
-  do {                                                                        \
-    ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Test failed"); \
-    throw std::runtime_error("Test failed");                                  \
-  } while (0)
+// Helper class for FAIL functionality with message streaming support
+class FailHelper {
+ public:
+  explicit FailHelper(const char* file, int line) : file_(file), line_(line) {}
+  template <typename T>
+  FailHelper& operator<<(const T& value) {
+    message_ << value;
+    return *this;
+  }
+  ~FailHelper() noexcept(false) {
+    std::string msg = message_.str();
+    if (!msg.empty()) {
+      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed: " + msg);
+    } else {
+      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed");
+    }
+    throw std::runtime_error("Test failed");
+  }
+
+ private:
+  const char* file_;
+  int line_;
+  std::ostringstream message_;
+};
+
+// Test fail macro - throws exception to fail test execution
+// Usage: FAIL() << "Optional fail message";
+#define FAIL() ::mscclpp::test::FailHelper(__FILE__, __LINE__)
 
 // Helper class for GTEST_SKIP functionality
 // This class uses RAII (Resource Acquisition Is Initialization) pattern:
diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp
index 8b1fab279..bcf880ae2 100644
--- a/test/mp_unit/mp_unit_tests.hpp
+++ b/test/mp_unit/mp_unit_tests.hpp
@@ -15,7 +15,7 @@
 #include "ib.hpp"
 #include "utils_internal.hpp"
 
-class MultiProcessTestEnv : public ::testing::Environment {
+class MultiProcessTestEnv : public ::mscclpp::test::Environment {
  public:
   MultiProcessTestEnv(int argc, const char** argv);
 
@@ -36,7 +36,7 @@ mscclpp::Transport ibIdToTransport(int id);
 int rankToLocalRank(int rank);
 int rankToNode(int rank);
 
-class MultiProcessTest : public ::testing::Test {
+class MultiProcessTest : public ::mscclpp::test::TestCase {
  protected:
   void TearDown() override;
 };
diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc
index a2c39c1b4..13437872d 100644
--- a/test/unit/core_tests.cc
+++ b/test/unit/core_tests.cc
@@ -1,13 +1,11 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <gmock/gmock.h>
+#include "../framework.hpp"
 
 #include <mscclpp/core.hpp>
 
-#include "../framework.hpp"
-
-class LocalCommunicatorTest : public ::testing::Test {
+class LocalCommunicatorTest : public ::mscclpp::test::TestCase {
  protected:
   void SetUp() override {
     bootstrap = std::make_shared<mscclpp::TcpBootstrap>(0, 1);

From 48235835871849f20677a7cd5e6f74499206eb82 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 01:17:08 +0000
Subject: [PATCH 011/132] Move FailHelper and SkipHelper into mscclpp::test
 namespace

- Move helper classes inside namespace for proper access
- Remove duplicate class definitions outside namespace
- Tests can now build in Docker with CUDA toolkit installed
- Remaining issues: ErrorCode and TransportFlags need operator<< for EXPECT_EQ

Successfully building with Docker:
docker run --rm -v $(pwd):/workspace -w /workspace \
  ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 bash -c \
  "mkdir build && cd build && cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON \
   -DMSCCLPP_USE_CUDA=ON .. && make -j4"

Note: Some unit tests (errors_tests.cc, core_tests.cc) need operator<<
defined for ErrorCode and TransportFlags to compile with custom framework.

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/framework.hpp | 110 ++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/test/framework.hpp b/test/framework.hpp
index cfd9ecf6f..f93471e94 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -146,6 +146,61 @@ void reportSuccess();
 
 }  // namespace utils
 
+// Helper class for FAIL functionality with message streaming support
+class FailHelper {
+ public:
+  explicit FailHelper(const char* file, int line) : file_(file), line_(line) {}
+  template <typename T>
+  FailHelper& operator<<(const T& value) {
+    message_ << value;
+    return *this;
+  }
+  ~FailHelper() noexcept(false) {
+    std::string msg = message_.str();
+    if (!msg.empty()) {
+      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed: " + msg);
+    } else {
+      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed");
+    }
+    throw std::runtime_error("Test failed");
+  }
+
+ private:
+  const char* file_;
+  int line_;
+  std::ostringstream message_;
+};
+
+// Helper class for GTEST_SKIP functionality
+// This class uses RAII (Resource Acquisition Is Initialization) pattern:
+// - The constructor records file and line information
+// - The stream operator (<<) allows appending a skip message
+// - The destructor throws an exception to skip the test
+// This enables usage like: GTEST_SKIP() << "Reason for skipping";
+class SkipHelper {
+ public:
+  explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {}
+  template <typename T>
+  SkipHelper& operator<<(const T& value) {
+    message_ << value;
+    return *this;
+  }
+  ~SkipHelper() noexcept(false) {
+    std::string msg = message_.str();
+    if (!msg.empty()) {
+      ::mscclpp::test::utils::reportFailure(file_, line_, "Test skipped: " + msg);
+    } else {
+      ::mscclpp::test::utils::reportFailure(file_, line_, "Test skipped");
+    }
+    throw std::runtime_error("Test skipped");
+  }
+
+ private:
+  const char* file_;
+  int line_;
+  std::ostringstream message_;
+};
+
 }  // namespace test
 }  // namespace mscclpp
 
@@ -366,65 +421,10 @@ void reportSuccess();
     }                                                                                                      \
   } while (0)
 
-// Helper class for FAIL functionality with message streaming support
-class FailHelper {
- public:
-  explicit FailHelper(const char* file, int line) : file_(file), line_(line) {}
-  template <typename T>
-  FailHelper& operator<<(const T& value) {
-    message_ << value;
-    return *this;
-  }
-  ~FailHelper() noexcept(false) {
-    std::string msg = message_.str();
-    if (!msg.empty()) {
-      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed: " + msg);
-    } else {
-      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed");
-    }
-    throw std::runtime_error("Test failed");
-  }
-
- private:
-  const char* file_;
-  int line_;
-  std::ostringstream message_;
-};
-
 // Test fail macro - throws exception to fail test execution
 // Usage: FAIL() << "Optional fail message";
 #define FAIL() ::mscclpp::test::FailHelper(__FILE__, __LINE__)
 
-// Helper class for GTEST_SKIP functionality
-// This class uses RAII (Resource Acquisition Is Initialization) pattern:
-// - The constructor records file and line information
-// - The stream operator (<<) allows appending a skip message
-// - The destructor throws an exception to skip the test
-// This enables usage like: GTEST_SKIP() << "Reason for skipping";
-class SkipHelper {
- public:
-  explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {}
-  template <typename T>
-  SkipHelper& operator<<(const T& value) {
-    message_ << value;
-    return *this;
-  }
-  ~SkipHelper() noexcept(false) {
-    std::string msg = message_.str();
-    if (!msg.empty()) {
-      ::mscclpp::test::utils::reportFailure(file_, line_, "Test skipped: " + msg);
-    } else {
-      ::mscclpp::test::utils::reportFailure(file_, line_, "Test skipped");
-    }
-    throw std::runtime_error("Test skipped");
-  }
-
- private:
-  const char* file_;
-  int line_;
-  std::ostringstream message_;
-};
-
 // Test skip macro - throws exception to skip test execution
 // Usage: GTEST_SKIP() << "Optional skip message";
 #define GTEST_SKIP() ::mscclpp::test::SkipHelper(__FILE__, __LINE__)

From 403b2fb3dbed7e7abddcb8ae57f42426ba9d646d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 01:55:40 +0000
Subject: [PATCH 012/132] Remove unnecessary CMake build artifacts from PR

- Remove build_test/ directory containing CMake cache and build files
- Update .gitignore to exclude build_*/ pattern to prevent future accidents

These CMake artifacts (CMakeCache.txt, CMakeFiles/, generated headers)
were accidentally committed and should never be in version control.

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 .gitignore                                    |   1 +
 build_test/CMakeCache.txt                     | 390 --------
 .../CMakeFiles/3.31.6/CMakeCXXCompiler.cmake  | 101 --
 .../3.31.6/CMakeDetermineCompilerABI_CXX.bin  | Bin 15992 -> 0 bytes
 .../CMakeFiles/3.31.6/CMakeSystem.cmake       |  15 -
 .../CompilerIdCXX/CMakeCXXCompilerId.cpp      | 919 ------------------
 .../CMakeFiles/3.31.6/CompilerIdCXX/a.out     | Bin 16096 -> 0 bytes
 build_test/CMakeFiles/CMakeConfigureLog.yaml  | 294 ------
 build_test/CMakeFiles/cmake.check_cache       |   1 -
 build_test/include/mscclpp/version.hpp        |  13 -
 10 files changed, 1 insertion(+), 1733 deletions(-)
 delete mode 100644 build_test/CMakeCache.txt
 delete mode 100644 build_test/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
 delete mode 100755 build_test/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin
 delete mode 100644 build_test/CMakeFiles/3.31.6/CMakeSystem.cmake
 delete mode 100644 build_test/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
 delete mode 100755 build_test/CMakeFiles/3.31.6/CompilerIdCXX/a.out
 delete mode 100644 build_test/CMakeFiles/CMakeConfigureLog.yaml
 delete mode 100644 build_test/CMakeFiles/cmake.check_cache
 delete mode 100644 build_test/include/mscclpp/version.hpp

diff --git a/.gitignore b/.gitignore
index ed3b94c41..cf946377d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .vscode/
 build/
+build_*/
 __pycache__
 .*.swp
 *.so
diff --git a/build_test/CMakeCache.txt b/build_test/CMakeCache.txt
deleted file mode 100644
index cc9de9e11..000000000
--- a/build_test/CMakeCache.txt
+++ /dev/null
@@ -1,390 +0,0 @@
-# This is the CMakeCache file.
-# For build in directory: /home/runner/work/mscclpp/mscclpp/build_test
-# It was generated by CMake: /usr/local/bin/cmake
-# You can edit this file to change values found and used by cmake.
-# If you do not want to change any of the values, simply exit the editor.
-# If you do want to change a value, simply edit, save, and exit the editor.
-# The syntax for the file is as follows:
-# KEY:TYPE=VALUE
-# KEY is the name of a variable in the cache.
-# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
-# VALUE is the current value for the KEY.
-
-########################
-# EXTERNAL cache entries
-########################
-
-//Path to a program.
-CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
-
-//Path to a program.
-CMAKE_AR:FILEPATH=/usr/bin/ar
-
-//Choose the type of build, options are: None Debug Release RelWithDebInfo
-// MinSizeRel ...
-CMAKE_BUILD_TYPE:STRING=
-
-//Enable/Disable color output during build.
-CMAKE_COLOR_MAKEFILE:BOOL=ON
-
-//CXX compiler
-CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++
-
-//A wrapper around 'ar' adding the appropriate '--plugin' option
-// for the GCC compiler
-CMAKE_CXX_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar-13
-
-//A wrapper around 'ranlib' adding the appropriate '--plugin' option
-// for the GCC compiler
-CMAKE_CXX_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib-13
-
-//Flags used by the CXX compiler during all build types.
-CMAKE_CXX_FLAGS:STRING=
-
-//Flags used by the CXX compiler during DEBUG builds.
-CMAKE_CXX_FLAGS_DEBUG:STRING=-g
-
-//Flags used by the CXX compiler during MINSIZEREL builds.
-CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
-
-//Flags used by the CXX compiler during RELEASE builds.
-CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
-
-//Flags used by the CXX compiler during RELWITHDEBINFO builds.
-CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
-
-//Path to a program.
-CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
-
-//Flags used by the linker during all build types.
-CMAKE_EXE_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during DEBUG builds.
-CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during MINSIZEREL builds.
-CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during RELEASE builds.
-CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during RELWITHDEBINFO builds.
-CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Enable/Disable output of compile commands during generation.
-CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
-
-//Value Computed by CMake.
-CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/pkgRedirects
-
-//Install path prefix, prepended onto install directories.
-CMAKE_INSTALL_PREFIX:PATH=/usr/local
-
-//Path to a program.
-CMAKE_LINKER:FILEPATH=/usr/bin/ld
-
-//Path to a program.
-CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
-
-//Flags used by the linker during the creation of modules during
-// all build types.
-CMAKE_MODULE_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of modules during
-// DEBUG builds.
-CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of modules during
-// MINSIZEREL builds.
-CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of modules during
-// RELEASE builds.
-CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of modules during
-// RELWITHDEBINFO builds.
-CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Path to a program.
-CMAKE_NM:FILEPATH=/usr/bin/nm
-
-//Path to a program.
-CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
-
-//Path to a program.
-CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
-
-//Value Computed by CMake
-CMAKE_PROJECT_DESCRIPTION:STATIC=
-
-//Value Computed by CMake
-CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
-
-//Value Computed by CMake
-CMAKE_PROJECT_NAME:STATIC=mscclpp
-
-//Path to a program.
-CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
-
-//Path to a program.
-CMAKE_READELF:FILEPATH=/usr/bin/readelf
-
-//Flags used by the linker during the creation of shared libraries
-// during all build types.
-CMAKE_SHARED_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during DEBUG builds.
-CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during MINSIZEREL builds.
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during RELEASE builds.
-CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during RELWITHDEBINFO builds.
-CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//If set, runtime paths are not added when installing shared libraries,
-// but are added when building.
-CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
-
-//If set, runtime paths are not added when using shared libraries.
-CMAKE_SKIP_RPATH:BOOL=NO
-
-//Flags used by the linker during the creation of static libraries
-// during all build types.
-CMAKE_STATIC_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during DEBUG builds.
-CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during MINSIZEREL builds.
-CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during RELEASE builds.
-CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during RELWITHDEBINFO builds.
-CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Path to a program.
-CMAKE_STRIP:FILEPATH=/usr/bin/strip
-
-//Path to a program.
-CMAKE_TAPI:FILEPATH=CMAKE_TAPI-NOTFOUND
-
-//If this value is on, makefiles will be generated without the
-// .SILENT directive, and all commands will be echoed to the console
-// during the make.  This is useful for debugging only. With Visual
-// Studio IDE projects all commands are done without /nologo.
-CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
-
-//Path to a program.
-CUDAToolkit_NVCC_EXECUTABLE:FILEPATH=CUDAToolkit_NVCC_EXECUTABLE-NOTFOUND
-
-//Path to a file.
-CUDAToolkit_SENTINEL_FILE:FILEPATH=CUDAToolkit_SENTINEL_FILE-NOTFOUND
-
-//Git command line client
-GIT_EXECUTABLE:FILEPATH=/usr/bin/git
-
-//Build collective algorithms
-MSCCLPP_BUILD_EXT_COLLECTIVES:BOOL=ON
-
-//Build NCCL interfaces
-MSCCLPP_BUILD_EXT_NCCL:BOOL=ON
-
-//Build Python bindings
-MSCCLPP_BUILD_PYTHON_BINDINGS:BOOL=ON
-
-//Build tests
-MSCCLPP_BUILD_TESTS:BOOL=ON
-
-//Bypass GPU check.
-MSCCLPP_BYPASS_GPU_CHECK:BOOL=ON
-
-//Enable code coverage
-MSCCLPP_ENABLE_COVERAGE:BOOL=OFF
-
-//Enable tracing
-MSCCLPP_ENABLE_TRACE:BOOL=OFF
-
-//Specify GPU architectures with delimiters (comma, space, or semicolon).
-MSCCLPP_GPU_ARCHS:STRING=
-
-//Set NPKIT flags
-MSCCLPP_NPKIT_FLAGS:BOOL=OFF
-
-//Use NVIDIA/CUDA.
-MSCCLPP_USE_CUDA:BOOL=ON
-
-//Use InfiniBand.
-MSCCLPP_USE_IB:BOOL=ON
-
-//Use AMD/ROCm.
-MSCCLPP_USE_ROCM:BOOL=OFF
-
-//Value Computed by CMake
-mscclpp_BINARY_DIR:STATIC=/home/runner/work/mscclpp/mscclpp/build_test
-
-//Value Computed by CMake
-mscclpp_IS_TOP_LEVEL:STATIC=ON
-
-//Value Computed by CMake
-mscclpp_SOURCE_DIR:STATIC=/home/runner/work/mscclpp/mscclpp
-
-
-########################
-# INTERNAL cache entries
-########################
-
-//ADVANCED property for variable: CMAKE_ADDR2LINE
-CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_AR
-CMAKE_AR-ADVANCED:INTERNAL=1
-//This is the directory where this CMakeCache.txt was created
-CMAKE_CACHEFILE_DIR:INTERNAL=/home/runner/work/mscclpp/mscclpp/build_test
-//Major version of cmake used to create the current loaded cache
-CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
-//Minor version of cmake used to create the current loaded cache
-CMAKE_CACHE_MINOR_VERSION:INTERNAL=31
-//Patch version of cmake used to create the current loaded cache
-CMAKE_CACHE_PATCH_VERSION:INTERNAL=6
-//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
-CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
-//Path to CMake executable.
-CMAKE_COMMAND:INTERNAL=/usr/local/bin/cmake
-//Path to cpack program executable.
-CMAKE_CPACK_COMMAND:INTERNAL=/usr/local/bin/cpack
-//Path to ctest program executable.
-CMAKE_CTEST_COMMAND:INTERNAL=/usr/local/bin/ctest
-//ADVANCED property for variable: CMAKE_CXX_COMPILER
-CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_COMPILER_AR
-CMAKE_CXX_COMPILER_AR-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_COMPILER_RANLIB
-CMAKE_CXX_COMPILER_RANLIB-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS
-CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
-CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
-CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
-CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
-CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_DLLTOOL
-CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
-//Path to cache edit program executable.
-CMAKE_EDIT_COMMAND:INTERNAL=/usr/local/bin/ccmake
-//Executable file format
-CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
-CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
-CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
-CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
-CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
-CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
-//Name of external makefile project generator.
-CMAKE_EXTRA_GENERATOR:INTERNAL=
-//Name of generator.
-CMAKE_GENERATOR:INTERNAL=Unix Makefiles
-//Generator instance identifier.
-CMAKE_GENERATOR_INSTANCE:INTERNAL=
-//Name of generator platform.
-CMAKE_GENERATOR_PLATFORM:INTERNAL=
-//Name of generator toolset.
-CMAKE_GENERATOR_TOOLSET:INTERNAL=
-//Source directory with the top level CMakeLists.txt file for this
-// project
-CMAKE_HOME_DIRECTORY:INTERNAL=/home/runner/work/mscclpp/mscclpp
-//Install .so files without execute permission.
-CMAKE_INSTALL_SO_NO_EXE:INTERNAL=1
-//ADVANCED property for variable: CMAKE_LINKER
-CMAKE_LINKER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
-CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
-CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
-CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
-CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
-CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_NM
-CMAKE_NM-ADVANCED:INTERNAL=1
-//number of local generators
-CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
-//ADVANCED property for variable: CMAKE_OBJCOPY
-CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_OBJDUMP
-CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
-//Platform information initialized
-CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_RANLIB
-CMAKE_RANLIB-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_READELF
-CMAKE_READELF-ADVANCED:INTERNAL=1
-//Path to CMake installation.
-CMAKE_ROOT:INTERNAL=/usr/local/share/cmake-3.31
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
-CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
-CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
-CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
-CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SKIP_RPATH
-CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
-CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
-CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
-CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
-CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STRIP
-CMAKE_STRIP-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_TAPI
-CMAKE_TAPI-ADVANCED:INTERNAL=1
-//uname command
-CMAKE_UNAME:INTERNAL=/usr/bin/uname
-//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
-CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
-//Details about finding Git
-FIND_PACKAGE_MESSAGE_DETAILS_Git:INTERNAL=[/usr/bin/git][v2.52.0()]
-//ADVANCED property for variable: GIT_EXECUTABLE
-GIT_EXECUTABLE-ADVANCED:INTERNAL=1
-//linker supports push/pop state
-_CMAKE_CXX_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
-//linker supports push/pop state
-_CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
-
diff --git a/build_test/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake b/build_test/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
deleted file mode 100644
index 14f6ae31d..000000000
--- a/build_test/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
+++ /dev/null
@@ -1,101 +0,0 @@
-set(CMAKE_CXX_COMPILER "/usr/bin/c++")
-set(CMAKE_CXX_COMPILER_ARG1 "")
-set(CMAKE_CXX_COMPILER_ID "GNU")
-set(CMAKE_CXX_COMPILER_VERSION "13.3.0")
-set(CMAKE_CXX_COMPILER_VERSION_INTERNAL "")
-set(CMAKE_CXX_COMPILER_WRAPPER "")
-set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "17")
-set(CMAKE_CXX_EXTENSIONS_COMPUTED_DEFAULT "ON")
-set(CMAKE_CXX_STANDARD_LATEST "23")
-set(CMAKE_CXX_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters;cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates;cxx_std_17;cxx_std_20;cxx_std_23")
-set(CMAKE_CXX98_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters")
-set(CMAKE_CXX11_COMPILE_FEATURES "cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
-set(CMAKE_CXX14_COMPILE_FEATURES "cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
-set(CMAKE_CXX17_COMPILE_FEATURES "cxx_std_17")
-set(CMAKE_CXX20_COMPILE_FEATURES "cxx_std_20")
-set(CMAKE_CXX23_COMPILE_FEATURES "cxx_std_23")
-set(CMAKE_CXX26_COMPILE_FEATURES "")
-
-set(CMAKE_CXX_PLATFORM_ID "Linux")
-set(CMAKE_CXX_SIMULATE_ID "")
-set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "GNU")
-set(CMAKE_CXX_SIMULATE_VERSION "")
-
-
-
-
-set(CMAKE_AR "/usr/bin/ar")
-set(CMAKE_CXX_COMPILER_AR "/usr/bin/gcc-ar-13")
-set(CMAKE_RANLIB "/usr/bin/ranlib")
-set(CMAKE_CXX_COMPILER_RANLIB "/usr/bin/gcc-ranlib-13")
-set(CMAKE_LINKER "/usr/bin/ld")
-set(CMAKE_LINKER_LINK "")
-set(CMAKE_LINKER_LLD "")
-set(CMAKE_CXX_COMPILER_LINKER "/usr/bin/ld")
-set(CMAKE_CXX_COMPILER_LINKER_ID "GNU")
-set(CMAKE_CXX_COMPILER_LINKER_VERSION 2.42)
-set(CMAKE_CXX_COMPILER_LINKER_FRONTEND_VARIANT GNU)
-set(CMAKE_MT "")
-set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
-set(CMAKE_COMPILER_IS_GNUCXX 1)
-set(CMAKE_CXX_COMPILER_LOADED 1)
-set(CMAKE_CXX_COMPILER_WORKS TRUE)
-set(CMAKE_CXX_ABI_COMPILED TRUE)
-
-set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
-
-set(CMAKE_CXX_COMPILER_ID_RUN 1)
-set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;m;mm;mpp;CPP;ixx;cppm;ccm;cxxm;c++m)
-set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
-
-foreach (lang IN ITEMS C OBJC OBJCXX)
-  if (CMAKE_${lang}_COMPILER_ID_RUN)
-    foreach(extension IN LISTS CMAKE_${lang}_SOURCE_FILE_EXTENSIONS)
-      list(REMOVE_ITEM CMAKE_CXX_SOURCE_FILE_EXTENSIONS ${extension})
-    endforeach()
-  endif()
-endforeach()
-
-set(CMAKE_CXX_LINKER_PREFERENCE 30)
-set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
-set(CMAKE_CXX_LINKER_DEPFILE_SUPPORTED )
-
-# Save compiler ABI information.
-set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
-set(CMAKE_CXX_COMPILER_ABI "ELF")
-set(CMAKE_CXX_BYTE_ORDER "LITTLE_ENDIAN")
-set(CMAKE_CXX_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-
-if(CMAKE_CXX_SIZEOF_DATA_PTR)
-  set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
-endif()
-
-if(CMAKE_CXX_COMPILER_ABI)
-  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
-endif()
-
-if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
-  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-endif()
-
-set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
-if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
-  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
-endif()
-
-
-
-
-
-set(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "/usr/include/c++/13;/usr/include/x86_64-linux-gnu/c++/13;/usr/include/c++/13/backward;/usr/lib/gcc/x86_64-linux-gnu/13/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include")
-set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "stdc++;m;gcc_s;gcc;c;gcc_s;gcc")
-set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-linux-gnu/13;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib")
-set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
-set(CMAKE_CXX_COMPILER_CLANG_RESOURCE_DIR "")
-
-set(CMAKE_CXX_COMPILER_IMPORT_STD "")
-### Imported target for C++23 standard library
-set(CMAKE_CXX23_COMPILER_IMPORT_STD_NOT_FOUND_MESSAGE "Unsupported generator: Unix Makefiles")
-
-
-
diff --git a/build_test/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin b/build_test/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin
deleted file mode 100755
index e90f3f71d98d8b48fdca37fdc4f6d991fd1db519..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 15992
zcmeHOYit}>6~4Q9xipD4Y0{XaG)rkv(&C9<F-{<K9ebTw<k%r`pb)}j);qRGtar`s
zEVT=iKzP&&K>;D4KR{7b9#VzWB18~B+O2|G6~rSFg`oZkluAK_))g&sA!Ipc?)lc^
z(YodJ1Btn-o$sFSoOAD;bMNflnYs7l>A`_`ET)i_sdp%rQVGqZMA7qB$q=Mek6J^=
zH>g|GN|KlRoYto_kXENl@x|CA{4zrJYvD`-yhYPggHC86Bl|6t=2mD8P|10)pRW=b
zJn#{z00_QbUs7re;fVMFgMJ*FxmN8rw|6lnB`(_q;m0ETDMQ;+cjzQomHL2)C&z@p
zJrd6_wn;I-u-}CEg|T1!fLsTs!_RrSf2Y2K;&&$L7o)=X7ELQ4>U$UY`Ee2bYXQ3X
zkkq$SKO`jnKnbtfnRm0@T|4u+*1TJ&Ot((=bhmbQ8ReqU;aAP=O466d)c&C(ii)W+
zCt+0a6Iw=jtlJ=Zw*TRV!E;T|eD<lsJAcyk*c}_b`>XiRpJy9xH~X*+CoT^|gk{ci
zoou7y@d?Vw*e1N_{A|)EmN>BA`Ubi_;*t$`YYD!v1b-9pw>2n7Sr$cf)GB*+$+ISH
zw?NG3v~7*K1v~HF>nK)pe7n{D!OXrstHbCpcGdHpUCPRg9I$du$r*Rco>Lk*(3dY3
zoDn;lcc`rK$znlDx3p<PLylm~|LC5Ik<9JIc&Ti5Z{Vo&_+##SU-&YGIZnTLI^jCT
z^^;tu`FXj%!C#gFn^Ia29`dETG|zp=eS&m3zz6&NN`S{0W1qPI&*KMaKETUQB2*DZ
z5r`rXMIeem6oDuLQ3Rq0{2xc)&&{{~)jWB%$vm~<H#?OwKV9|WwO^Pgf7Eork4kOV
zIihRZ9;9RQ)|6uV+O|hY8f)I#uY9@vPnp?^A24TsXP*51+`*A_d$s*3^Yq>yQvtP&
zWiowf%xK>FDZf18A0Wm&z2b`uyXU=)RQ0<#PgUPgyWG6>1RGuuBzxDl-<4(9aowDq
zGarBcF7xsEWoGON^Wt@H0~N4M3TUcb*6o5nxA(+eR;$XLN6eFZ<D4~TpYv9mr}nNS
z;mVF$t#&0xhbLD2o$k70$H=!{Kl}gT9#V4V2>H!^?5a6ix%_1M8aMM)`l|U=^Yq52
z*HU=CzdX_WXf>9;ChP`2&1YD1etEq4d|30_Mw*R(43%{4*afcI@1uIJaMe+YA`nF&
zia->BC<0Lgq6kD0h$0Y0Ac{Z~fhYq1d<6LY*Q=$>(7^DXGQFQGj#;@WuXMDn=UC8w
zC^I~e-Q&$zPO0eRj+Qd}to=jjO#e`?^6h;8?2PAF#S*={J35#d85vAl>7o8i?+{t|
zdOPbLrF97G5ZkisZT#+y-({V7p;kLic$V;f!iNb>!UyJRwX=kr_?;@J*u95TY&sF!
zvU*k18G50{Jg*%%PCjpDgZ@?i8@byl+eP2)#QVhB#K78?cQ)U6Ptyr?*XG@Kbl&d2
zzGVOR(>DP-%5&l}J^H>#{70BbuT6X=-nV9DyhJrK5v3>sQ3Rq0L=lK05Je!0Koo%}
z0#O8_2>fqE0P7X8J`rmV{hJ<Y;%YQg)-SFR`9WFd_<E7C4swggxb@jAGS)-#{SqhW
zU%p-|viz_tV#M0S3BKW@q}Q}6bxHKE)3mx@@J7KF!Ht3dtc|S7`o~qGXp@T2j;ipq
z*wara?^cmv_qUpEFU85Hu8XV}lhX_C1-<V{x2FF2&B^(^A~M<~#sBvJ>%;%U60t6I
ze_!98<n|-kO2Mln+dGX;qph{O;)@;kb#xhRT|0z+^$K}hEmtqr!d4vb7->Ey0ZEDh
zuN!V;&;1csYt@vDM=@7P;m?NnPT?`WVV|K)Otq*)N;4SuyvjO8PYW<!wN|N*Qikir
z^#Y#9VNBhmF#f@Ri!zPc|Cn!|P|2jW#CUyL_>}M%cP|TnTzCQ1LJf|oggPMvtrGCl
zQgPen+pkv#-zbIwXw=S5-=10*8c%O0Ua58Ub^0h~*tfq~;W`8F5Z`Eh`6r1_!YF{>
z@%c?kr2-^nzfOEYZL0SdwBI0peY{!W_Xzw$VjnK&2Y&gmTEHiXUl-q`Fz%uGCG%9X
zN@_+fWA!ZY2^v2wDOhUc{UYmWoTOwN`p=q3bw%tk-r)6;*zb_vQ~wzfDPJL;+Y`25
z5wAA|MfkXt_}dmSTG&JU`Z)bchOP^Bc(mlT8%0_vPfyz{&mLDql)cK>m@%prR@GbH
zq&3Rx>dR!AD_Z0EV%E-EIj>kMTXtnyjTR@T@{Z@^jJC!WyrSQ=>{7|5hk^yKG^55!
z_M~IwDwC5l<Pwl9vh)_2_8qW4==9xvcOTW_=ABaSzKk(CHKnZg4Yqf?g|VU)coxZQ
zhh`U^Fj`r6oa)WFHtjGV{chhYpwGLWmv;gtJ-!7+g&H?-sP};Xbkd?t1pV(F>OGL@
zBbs(&SZPzVX8$2&?H?T8*E?tp4-6bmk60tU`{<!28HV;aq_CCYwYD!fIoq?9A37?9
z1-+MngvA>htX#QhP1uDTZ+gfKlU2?wSe3GqQ+!HfpDmZgS9V#@MhSl2%4ftoC>m~y
zSiBdb-fZ51;dc`4M=H-udUlr3D`}iS&MnY(j45Rlik@SP7b?b7sW|17yqN%%t+=$8
z#?1*u{o2Z7&^Mp3%M;4T%@n8#jb2G>KJ1jrZn3aPut-;O@-{mtgGZ1urt<n=j29{6
zIn#9HVMvxmKeC21Ap>tBNB)qszaD|w19>Xko^(g4IovS@1yva|^e1UVH@NElb&BUr
zbjjDBzK8e0Vcvw2**2KoL;}xk=yLbdQv1C`U7vqJ?xsx8KfLdYpOXg@eh0zv|7p-4
z|L4FY3<bmf?;-v#G&e%~F&_k?e#{3kA49P=Wq2+Kf6NzwXT*@($gzVz=6No0JOzP2
z=AS_RpAV*R{69oWp8LTc^F1Ku(P%&HfcKF<&m|#aJ_&4-%ERqPn@&@PV+w!FZ-G@Y
zME&9O{|f2(oS?7&U&#Lk=JisHUl;O>U!!l(KPi4d5$i6Hf#*X0ZK43e4h294J{0m#
zi2|4lbr}3m-XkG@%qM`j?}2@I{GJzo#9t-FQt<O40)&RB^t^DP|IUa3kl%p?Q@H-0
zl9Epm^;eVH8u%qG){p3a5Wl7j&mnPNg83}=Nrvqq1D_?|=72xu&-1NBQi7e97G&@*
zkb=h^>aWi`4ee3olcU7rpA-DhkKZJYP2i7tXmuxBE0yw(3kUcE=SdaxuRFA9AJl^q
z;0O6SWtc<#n71XwKWs0j19!EI2<F7R&cpxCI-@i24<h<LXqu7&zby^p>-c8+qCNQi
l<NGkQJ?MXhZ=fipLWQGVt>rm#WB={^$3kg!$RQ-Ee*hEc8gl>u

diff --git a/build_test/CMakeFiles/3.31.6/CMakeSystem.cmake b/build_test/CMakeFiles/3.31.6/CMakeSystem.cmake
deleted file mode 100644
index b2715a602..000000000
--- a/build_test/CMakeFiles/3.31.6/CMakeSystem.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-set(CMAKE_HOST_SYSTEM "Linux-6.11.0-1018-azure")
-set(CMAKE_HOST_SYSTEM_NAME "Linux")
-set(CMAKE_HOST_SYSTEM_VERSION "6.11.0-1018-azure")
-set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
-
-
-
-set(CMAKE_SYSTEM "Linux-6.11.0-1018-azure")
-set(CMAKE_SYSTEM_NAME "Linux")
-set(CMAKE_SYSTEM_VERSION "6.11.0-1018-azure")
-set(CMAKE_SYSTEM_PROCESSOR "x86_64")
-
-set(CMAKE_CROSSCOMPILING "FALSE")
-
-set(CMAKE_SYSTEM_LOADED 1)
diff --git a/build_test/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp b/build_test/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
deleted file mode 100644
index 3b6e114ca..000000000
--- a/build_test/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
+++ /dev/null
@@ -1,919 +0,0 @@
-/* This source file must have a .cpp extension so that all C++ compilers
-   recognize the extension without flags.  Borland does not know .cxx for
-   example.  */
-#ifndef __cplusplus
-# error "A C compiler has been selected for C++."
-#endif
-
-#if !defined(__has_include)
-/* If the compiler does not have __has_include, pretend the answer is
-   always no.  */
-#  define __has_include(x) 0
-#endif
-
-
-/* Version number components: V=Version, R=Revision, P=Patch
-   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
-
-#if defined(__INTEL_COMPILER) || defined(__ICC)
-# define COMPILER_ID "Intel"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# if defined(__GNUC__)
-#  define SIMULATE_ID "GNU"
-# endif
-  /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
-     except that a few beta releases use the old format with V=2021.  */
-# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
-#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
-#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
-#  if defined(__INTEL_COMPILER_UPDATE)
-#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
-#  else
-#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
-#  endif
-# else
-#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
-#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
-   /* The third version component from --version is an update index,
-      but no macro is provided for it.  */
-#  define COMPILER_VERSION_PATCH DEC(0)
-# endif
-# if defined(__INTEL_COMPILER_BUILD_DATE)
-   /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
-#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
-# endif
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# if defined(__GNUC__)
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-# elif defined(__GNUG__)
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
-# endif
-# if defined(__GNUC_MINOR__)
-#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
-# define COMPILER_ID "IntelLLVM"
-#if defined(_MSC_VER)
-# define SIMULATE_ID "MSVC"
-#endif
-#if defined(__GNUC__)
-# define SIMULATE_ID "GNU"
-#endif
-/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
- * later.  Look for 6 digit vs. 8 digit version number to decide encoding.
- * VVVV is no smaller than the current year when a version is released.
- */
-#if __INTEL_LLVM_COMPILER < 1000000L
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER    % 10)
-#else
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER     % 100)
-#endif
-#if defined(_MSC_VER)
-  /* _MSC_VER = VVRR */
-# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-#endif
-#if defined(__GNUC__)
-# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-#elif defined(__GNUG__)
-# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
-#endif
-#if defined(__GNUC_MINOR__)
-# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-#endif
-#if defined(__GNUC_PATCHLEVEL__)
-# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-#endif
-
-#elif defined(__PATHCC__)
-# define COMPILER_ID "PathScale"
-# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
-# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
-# if defined(__PATHCC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
-# endif
-
-#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
-# define COMPILER_ID "Embarcadero"
-# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
-# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
-# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
-
-#elif defined(__BORLANDC__)
-# define COMPILER_ID "Borland"
-  /* __BORLANDC__ = 0xVRR */
-# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
-# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
-
-#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
-# define COMPILER_ID "Watcom"
-   /* __WATCOMC__ = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__WATCOMC__)
-# define COMPILER_ID "OpenWatcom"
-   /* __WATCOMC__ = VVRP + 1100 */
-# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__SUNPRO_CC)
-# define COMPILER_ID "SunPro"
-# if __SUNPRO_CC >= 0x5100
-   /* __SUNPRO_CC = 0xVRRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
-# else
-   /* __SUNPRO_CC = 0xVRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
-# endif
-
-#elif defined(__HP_aCC)
-# define COMPILER_ID "HP"
-  /* __HP_aCC = VVRRPP */
-# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
-# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__HP_aCC     % 100)
-
-#elif defined(__DECCXX)
-# define COMPILER_ID "Compaq"
-  /* __DECCXX_VER = VVRRTPPPP */
-# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
-# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000  % 100)
-# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER         % 10000)
-
-#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
-# define COMPILER_ID "zOS"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__open_xl__) && defined(__clang__)
-# define COMPILER_ID "IBMClang"
-# define COMPILER_VERSION_MAJOR DEC(__open_xl_version__)
-# define COMPILER_VERSION_MINOR DEC(__open_xl_release__)
-# define COMPILER_VERSION_PATCH DEC(__open_xl_modification__)
-# define COMPILER_VERSION_TWEAK DEC(__open_xl_ptf_fix_level__)
-
-
-#elif defined(__ibmxl__) && defined(__clang__)
-# define COMPILER_ID "XLClang"
-# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
-# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
-# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
-# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
-
-
-#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
-# define COMPILER_ID "XL"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
-# define COMPILER_ID "VisualAge"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__NVCOMPILER)
-# define COMPILER_ID "NVHPC"
-# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
-# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
-# if defined(__NVCOMPILER_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
-# endif
-
-#elif defined(__PGI)
-# define COMPILER_ID "PGI"
-# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
-# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
-# if defined(__PGIC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
-# endif
-
-#elif defined(__clang__) && defined(__cray__)
-# define COMPILER_ID "CrayClang"
-# define COMPILER_VERSION_MAJOR DEC(__cray_major__)
-# define COMPILER_VERSION_MINOR DEC(__cray_minor__)
-# define COMPILER_VERSION_PATCH DEC(__cray_patchlevel__)
-# define COMPILER_VERSION_INTERNAL_STR __clang_version__
-
-
-#elif defined(_CRAYC)
-# define COMPILER_ID "Cray"
-# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
-# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
-
-#elif defined(__TI_COMPILER_VERSION__)
-# define COMPILER_ID "TI"
-  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
-# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
-# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
-# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
-
-#elif defined(__CLANG_FUJITSU)
-# define COMPILER_ID "FujitsuClang"
-# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
-# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
-# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
-# define COMPILER_VERSION_INTERNAL_STR __clang_version__
-
-
-#elif defined(__FUJITSU)
-# define COMPILER_ID "Fujitsu"
-# if defined(__FCC_version__)
-#   define COMPILER_VERSION __FCC_version__
-# elif defined(__FCC_major__)
-#   define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
-#   define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
-#   define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
-# endif
-# if defined(__fcc_version)
-#   define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
-# elif defined(__FCC_VERSION)
-#   define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
-# endif
-
-
-#elif defined(__ghs__)
-# define COMPILER_ID "GHS"
-/* __GHS_VERSION_NUMBER = VVVVRP */
-# ifdef __GHS_VERSION_NUMBER
-# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
-# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER      % 10)
-# endif
-
-#elif defined(__TASKING__)
-# define COMPILER_ID "Tasking"
-  # define COMPILER_VERSION_MAJOR DEC(__VERSION__/1000)
-  # define COMPILER_VERSION_MINOR DEC(__VERSION__ % 100)
-# define COMPILER_VERSION_INTERNAL DEC(__VERSION__)
-
-#elif defined(__ORANGEC__)
-# define COMPILER_ID "OrangeC"
-# define COMPILER_VERSION_MAJOR DEC(__ORANGEC_MAJOR__)
-# define COMPILER_VERSION_MINOR DEC(__ORANGEC_MINOR__)
-# define COMPILER_VERSION_PATCH DEC(__ORANGEC_PATCHLEVEL__)
-
-#elif defined(__SCO_VERSION__)
-# define COMPILER_ID "SCO"
-
-#elif defined(__ARMCC_VERSION) && !defined(__clang__)
-# define COMPILER_ID "ARMCC"
-#if __ARMCC_VERSION >= 1000000
-  /* __ARMCC_VERSION = VRRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
-#else
-  /* __ARMCC_VERSION = VRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
-#endif
-
-
-#elif defined(__clang__) && defined(__apple_build_version__)
-# define COMPILER_ID "AppleClang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
-
-#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
-# define COMPILER_ID "ARMClang"
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION/100   % 100)
-# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
-
-#elif defined(__clang__) && defined(__ti__)
-# define COMPILER_ID "TIClang"
-  # define COMPILER_VERSION_MAJOR DEC(__ti_major__)
-  # define COMPILER_VERSION_MINOR DEC(__ti_minor__)
-  # define COMPILER_VERSION_PATCH DEC(__ti_patchlevel__)
-# define COMPILER_VERSION_INTERNAL DEC(__ti_version__)
-
-#elif defined(__clang__)
-# define COMPILER_ID "Clang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-
-#elif defined(__LCC__) && (defined(__GNUC__) || defined(__GNUG__) || defined(__MCST__))
-# define COMPILER_ID "LCC"
-# define COMPILER_VERSION_MAJOR DEC(__LCC__ / 100)
-# define COMPILER_VERSION_MINOR DEC(__LCC__ % 100)
-# if defined(__LCC_MINOR__)
-#  define COMPILER_VERSION_PATCH DEC(__LCC_MINOR__)
-# endif
-# if defined(__GNUC__) && defined(__GNUC_MINOR__)
-#  define SIMULATE_ID "GNU"
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-#  if defined(__GNUC_PATCHLEVEL__)
-#   define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-#  endif
-# endif
-
-#elif defined(__GNUC__) || defined(__GNUG__)
-# define COMPILER_ID "GNU"
-# if defined(__GNUC__)
-#  define COMPILER_VERSION_MAJOR DEC(__GNUC__)
-# else
-#  define COMPILER_VERSION_MAJOR DEC(__GNUG__)
-# endif
-# if defined(__GNUC_MINOR__)
-#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif defined(_MSC_VER)
-# define COMPILER_ID "MSVC"
-  /* _MSC_VER = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
-# if defined(_MSC_FULL_VER)
-#  if _MSC_VER >= 1400
-    /* _MSC_FULL_VER = VVRRPPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
-#  else
-    /* _MSC_FULL_VER = VVRRPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
-#  endif
-# endif
-# if defined(_MSC_BUILD)
-#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
-# endif
-
-#elif defined(_ADI_COMPILER)
-# define COMPILER_ID "ADSP"
-#if defined(__VERSIONNUM__)
-  /* __VERSIONNUM__ = 0xVVRRPPTT */
-#  define COMPILER_VERSION_MAJOR DEC(__VERSIONNUM__ >> 24 & 0xFF)
-#  define COMPILER_VERSION_MINOR DEC(__VERSIONNUM__ >> 16 & 0xFF)
-#  define COMPILER_VERSION_PATCH DEC(__VERSIONNUM__ >> 8 & 0xFF)
-#  define COMPILER_VERSION_TWEAK DEC(__VERSIONNUM__ & 0xFF)
-#endif
-
-#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
-# define COMPILER_ID "IAR"
-# if defined(__VER__) && defined(__ICCARM__)
-#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
-#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
-#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
-#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
-# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
-#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
-#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
-#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
-#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
-# endif
-
-
-/* These compilers are either not known or too old to define an
-  identification macro.  Try to identify the platform and guess that
-  it is the native compiler.  */
-#elif defined(__hpux) || defined(__hpua)
-# define COMPILER_ID "HP"
-
-#else /* unknown compiler */
-# define COMPILER_ID ""
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
-#ifdef SIMULATE_ID
-char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
-#endif
-
-#ifdef __QNXNTO__
-char const* qnxnto = "INFO" ":" "qnxnto[]";
-#endif
-
-#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
-char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
-#endif
-
-#define STRINGIFY_HELPER(X) #X
-#define STRINGIFY(X) STRINGIFY_HELPER(X)
-
-/* Identify known platforms by name.  */
-#if defined(__linux) || defined(__linux__) || defined(linux)
-# define PLATFORM_ID "Linux"
-
-#elif defined(__MSYS__)
-# define PLATFORM_ID "MSYS"
-
-#elif defined(__CYGWIN__)
-# define PLATFORM_ID "Cygwin"
-
-#elif defined(__MINGW32__)
-# define PLATFORM_ID "MinGW"
-
-#elif defined(__APPLE__)
-# define PLATFORM_ID "Darwin"
-
-#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
-# define PLATFORM_ID "Windows"
-
-#elif defined(__FreeBSD__) || defined(__FreeBSD)
-# define PLATFORM_ID "FreeBSD"
-
-#elif defined(__NetBSD__) || defined(__NetBSD)
-# define PLATFORM_ID "NetBSD"
-
-#elif defined(__OpenBSD__) || defined(__OPENBSD)
-# define PLATFORM_ID "OpenBSD"
-
-#elif defined(__sun) || defined(sun)
-# define PLATFORM_ID "SunOS"
-
-#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
-# define PLATFORM_ID "AIX"
-
-#elif defined(__hpux) || defined(__hpux__)
-# define PLATFORM_ID "HP-UX"
-
-#elif defined(__HAIKU__)
-# define PLATFORM_ID "Haiku"
-
-#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
-# define PLATFORM_ID "BeOS"
-
-#elif defined(__QNX__) || defined(__QNXNTO__)
-# define PLATFORM_ID "QNX"
-
-#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
-# define PLATFORM_ID "Tru64"
-
-#elif defined(__riscos) || defined(__riscos__)
-# define PLATFORM_ID "RISCos"
-
-#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
-# define PLATFORM_ID "SINIX"
-
-#elif defined(__UNIX_SV__)
-# define PLATFORM_ID "UNIX_SV"
-
-#elif defined(__bsdos__)
-# define PLATFORM_ID "BSDOS"
-
-#elif defined(_MPRAS) || defined(MPRAS)
-# define PLATFORM_ID "MP-RAS"
-
-#elif defined(__osf) || defined(__osf__)
-# define PLATFORM_ID "OSF1"
-
-#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
-# define PLATFORM_ID "SCO_SV"
-
-#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
-# define PLATFORM_ID "ULTRIX"
-
-#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
-# define PLATFORM_ID "Xenix"
-
-#elif defined(__WATCOMC__)
-# if defined(__LINUX__)
-#  define PLATFORM_ID "Linux"
-
-# elif defined(__DOS__)
-#  define PLATFORM_ID "DOS"
-
-# elif defined(__OS2__)
-#  define PLATFORM_ID "OS2"
-
-# elif defined(__WINDOWS__)
-#  define PLATFORM_ID "Windows3x"
-
-# elif defined(__VXWORKS__)
-#  define PLATFORM_ID "VxWorks"
-
-# else /* unknown platform */
-#  define PLATFORM_ID
-# endif
-
-#elif defined(__INTEGRITY)
-# if defined(INT_178B)
-#  define PLATFORM_ID "Integrity178"
-
-# else /* regular Integrity */
-#  define PLATFORM_ID "Integrity"
-# endif
-
-# elif defined(_ADI_COMPILER)
-#  define PLATFORM_ID "ADSP"
-
-#else /* unknown platform */
-# define PLATFORM_ID
-
-#endif
-
-/* For windows compilers MSVC and Intel we can determine
-   the architecture of the compiler being used.  This is because
-   the compilers do not have flags that can change the architecture,
-   but rather depend on which compiler is being used
-*/
-#if defined(_WIN32) && defined(_MSC_VER)
-# if defined(_M_IA64)
-#  define ARCHITECTURE_ID "IA64"
-
-# elif defined(_M_ARM64EC)
-#  define ARCHITECTURE_ID "ARM64EC"
-
-# elif defined(_M_X64) || defined(_M_AMD64)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# elif defined(_M_ARM64)
-#  define ARCHITECTURE_ID "ARM64"
-
-# elif defined(_M_ARM)
-#  if _M_ARM == 4
-#   define ARCHITECTURE_ID "ARMV4I"
-#  elif _M_ARM == 5
-#   define ARCHITECTURE_ID "ARMV5I"
-#  else
-#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
-#  endif
-
-# elif defined(_M_MIPS)
-#  define ARCHITECTURE_ID "MIPS"
-
-# elif defined(_M_SH)
-#  define ARCHITECTURE_ID "SHx"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__WATCOMC__)
-# if defined(_M_I86)
-#  define ARCHITECTURE_ID "I86"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
-# if defined(__ICCARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__ICCRX__)
-#  define ARCHITECTURE_ID "RX"
-
-# elif defined(__ICCRH850__)
-#  define ARCHITECTURE_ID "RH850"
-
-# elif defined(__ICCRL78__)
-#  define ARCHITECTURE_ID "RL78"
-
-# elif defined(__ICCRISCV__)
-#  define ARCHITECTURE_ID "RISCV"
-
-# elif defined(__ICCAVR__)
-#  define ARCHITECTURE_ID "AVR"
-
-# elif defined(__ICC430__)
-#  define ARCHITECTURE_ID "MSP430"
-
-# elif defined(__ICCV850__)
-#  define ARCHITECTURE_ID "V850"
-
-# elif defined(__ICC8051__)
-#  define ARCHITECTURE_ID "8051"
-
-# elif defined(__ICCSTM8__)
-#  define ARCHITECTURE_ID "STM8"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__ghs__)
-# if defined(__PPC64__)
-#  define ARCHITECTURE_ID "PPC64"
-
-# elif defined(__ppc__)
-#  define ARCHITECTURE_ID "PPC"
-
-# elif defined(__ARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__x86_64__)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(__i386__)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__clang__) && defined(__ti__)
-# if defined(__ARM_ARCH)
-#  define ARCHITECTURE_ID "ARM"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__TI_COMPILER_VERSION__)
-# if defined(__TI_ARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__MSP430__)
-#  define ARCHITECTURE_ID "MSP430"
-
-# elif defined(__TMS320C28XX__)
-#  define ARCHITECTURE_ID "TMS320C28x"
-
-# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
-#  define ARCHITECTURE_ID "TMS320C6x"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-# elif defined(__ADSPSHARC__)
-#  define ARCHITECTURE_ID "SHARC"
-
-# elif defined(__ADSPBLACKFIN__)
-#  define ARCHITECTURE_ID "Blackfin"
-
-#elif defined(__TASKING__)
-
-# if defined(__CTC__) || defined(__CPTC__)
-#  define ARCHITECTURE_ID "TriCore"
-
-# elif defined(__CMCS__)
-#  define ARCHITECTURE_ID "MCS"
-
-# elif defined(__CARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__CARC__)
-#  define ARCHITECTURE_ID "ARC"
-
-# elif defined(__C51__)
-#  define ARCHITECTURE_ID "8051"
-
-# elif defined(__CPCP__)
-#  define ARCHITECTURE_ID "PCP"
-
-# else
-#  define ARCHITECTURE_ID ""
-# endif
-
-#else
-#  define ARCHITECTURE_ID
-#endif
-
-/* Convert integer to decimal digit literals.  */
-#define DEC(n)                   \
-  ('0' + (((n) / 10000000)%10)), \
-  ('0' + (((n) / 1000000)%10)),  \
-  ('0' + (((n) / 100000)%10)),   \
-  ('0' + (((n) / 10000)%10)),    \
-  ('0' + (((n) / 1000)%10)),     \
-  ('0' + (((n) / 100)%10)),      \
-  ('0' + (((n) / 10)%10)),       \
-  ('0' +  ((n) % 10))
-
-/* Convert integer to hex digit literals.  */
-#define HEX(n)             \
-  ('0' + ((n)>>28 & 0xF)), \
-  ('0' + ((n)>>24 & 0xF)), \
-  ('0' + ((n)>>20 & 0xF)), \
-  ('0' + ((n)>>16 & 0xF)), \
-  ('0' + ((n)>>12 & 0xF)), \
-  ('0' + ((n)>>8  & 0xF)), \
-  ('0' + ((n)>>4  & 0xF)), \
-  ('0' + ((n)     & 0xF))
-
-/* Construct a string literal encoding the version number. */
-#ifdef COMPILER_VERSION
-char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
-
-/* Construct a string literal encoding the version number components. */
-#elif defined(COMPILER_VERSION_MAJOR)
-char const info_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
-  COMPILER_VERSION_MAJOR,
-# ifdef COMPILER_VERSION_MINOR
-  '.', COMPILER_VERSION_MINOR,
-#  ifdef COMPILER_VERSION_PATCH
-   '.', COMPILER_VERSION_PATCH,
-#   ifdef COMPILER_VERSION_TWEAK
-    '.', COMPILER_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct a string literal encoding the internal version number. */
-#ifdef COMPILER_VERSION_INTERNAL
-char const info_version_internal[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
-  'i','n','t','e','r','n','a','l','[',
-  COMPILER_VERSION_INTERNAL,']','\0'};
-#elif defined(COMPILER_VERSION_INTERNAL_STR)
-char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
-#endif
-
-/* Construct a string literal encoding the version number components. */
-#ifdef SIMULATE_VERSION_MAJOR
-char const info_simulate_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
-  SIMULATE_VERSION_MAJOR,
-# ifdef SIMULATE_VERSION_MINOR
-  '.', SIMULATE_VERSION_MINOR,
-#  ifdef SIMULATE_VERSION_PATCH
-   '.', SIMULATE_VERSION_PATCH,
-#   ifdef SIMULATE_VERSION_TWEAK
-    '.', SIMULATE_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
-char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
-
-
-
-#define CXX_STD_98 199711L
-#define CXX_STD_11 201103L
-#define CXX_STD_14 201402L
-#define CXX_STD_17 201703L
-#define CXX_STD_20 202002L
-#define CXX_STD_23 202302L
-
-#if defined(__INTEL_COMPILER) && defined(_MSVC_LANG)
-#  if _MSVC_LANG > CXX_STD_17
-#    define CXX_STD _MSVC_LANG
-#  elif _MSVC_LANG == CXX_STD_17 && defined(__cpp_aggregate_paren_init)
-#    define CXX_STD CXX_STD_20
-#  elif _MSVC_LANG > CXX_STD_14 && __cplusplus > CXX_STD_17
-#    define CXX_STD CXX_STD_20
-#  elif _MSVC_LANG > CXX_STD_14
-#    define CXX_STD CXX_STD_17
-#  elif defined(__INTEL_CXX11_MODE__) && defined(__cpp_aggregate_nsdmi)
-#    define CXX_STD CXX_STD_14
-#  elif defined(__INTEL_CXX11_MODE__)
-#    define CXX_STD CXX_STD_11
-#  else
-#    define CXX_STD CXX_STD_98
-#  endif
-#elif defined(_MSC_VER) && defined(_MSVC_LANG)
-#  if _MSVC_LANG > __cplusplus
-#    define CXX_STD _MSVC_LANG
-#  else
-#    define CXX_STD __cplusplus
-#  endif
-#elif defined(__NVCOMPILER)
-#  if __cplusplus == CXX_STD_17 && defined(__cpp_aggregate_paren_init)
-#    define CXX_STD CXX_STD_20
-#  else
-#    define CXX_STD __cplusplus
-#  endif
-#elif defined(__INTEL_COMPILER) || defined(__PGI)
-#  if __cplusplus == CXX_STD_11 && defined(__cpp_namespace_attributes)
-#    define CXX_STD CXX_STD_17
-#  elif __cplusplus == CXX_STD_11 && defined(__cpp_aggregate_nsdmi)
-#    define CXX_STD CXX_STD_14
-#  else
-#    define CXX_STD __cplusplus
-#  endif
-#elif (defined(__IBMCPP__) || defined(__ibmxl__)) && defined(__linux__)
-#  if __cplusplus == CXX_STD_11 && defined(__cpp_aggregate_nsdmi)
-#    define CXX_STD CXX_STD_14
-#  else
-#    define CXX_STD __cplusplus
-#  endif
-#elif __cplusplus == 1 && defined(__GXX_EXPERIMENTAL_CXX0X__)
-#  define CXX_STD CXX_STD_11
-#else
-#  define CXX_STD __cplusplus
-#endif
-
-const char* info_language_standard_default = "INFO" ":" "standard_default["
-#if CXX_STD > CXX_STD_23
-  "26"
-#elif CXX_STD > CXX_STD_20
-  "23"
-#elif CXX_STD > CXX_STD_17
-  "20"
-#elif CXX_STD > CXX_STD_14
-  "17"
-#elif CXX_STD > CXX_STD_11
-  "14"
-#elif CXX_STD >= CXX_STD_11
-  "11"
-#else
-  "98"
-#endif
-"]";
-
-const char* info_language_extensions_default = "INFO" ":" "extensions_default["
-#if (defined(__clang__) || defined(__GNUC__) || defined(__xlC__) ||           \
-     defined(__TI_COMPILER_VERSION__)) &&                                     \
-  !defined(__STRICT_ANSI__)
-  "ON"
-#else
-  "OFF"
-#endif
-"]";
-
-/*--------------------------------------------------------------------------*/
-
-int main(int argc, char* argv[])
-{
-  int require = 0;
-  require += info_compiler[argc];
-  require += info_platform[argc];
-  require += info_arch[argc];
-#ifdef COMPILER_VERSION_MAJOR
-  require += info_version[argc];
-#endif
-#ifdef COMPILER_VERSION_INTERNAL
-  require += info_version_internal[argc];
-#endif
-#ifdef SIMULATE_ID
-  require += info_simulate[argc];
-#endif
-#ifdef SIMULATE_VERSION_MAJOR
-  require += info_simulate_version[argc];
-#endif
-#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
-  require += info_cray[argc];
-#endif
-  require += info_language_standard_default[argc];
-  require += info_language_extensions_default[argc];
-  (void)argv;
-  return require;
-}
diff --git a/build_test/CMakeFiles/3.31.6/CompilerIdCXX/a.out b/build_test/CMakeFiles/3.31.6/CompilerIdCXX/a.out
deleted file mode 100755
index c8ced32cf082708045baa23211fbf858c298928d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16096
zcmeHOeQX>@6`woj!=X-macg3d(k!8=99nPAj^nz8kaO&_*T^4f;*@}ER%_qdcj7+G
z-X66pNQ2TsjBC`;3i?Npq6&ckRRRf$sMO%Js8y?i5($YQ0Wu#EK}uUAK4e1V<Gq>p
z*6ZaQ1oRIi_F3LH@Ap1t_RZ|x?C#9N$-eGrBqErq#0LdRiI_qXq&Ryw6@Vo~yVwlJ
zcZ*xa29VcDOz9JffmYF_=xSa~colH;YrsMUeyf6^21VRL<mk5+rLjRk%mtkX`mIL=
z$wB^Ws(?A`z4|nC2GZow<ByRabH5)pWwA-wFCJLU4a&=5;_Qc_JOy3ZLw6`5K2P;A
z=X_#L@V}k%8RT&a!#wDhCchx>B0uI>2h!2YZt6d&?=bnjuE{VW$nR3HV9xd32Y%GG
zWN~B0-F$@VTdN;plz--wUa>cu8EtFbn@u%kGx^d~(^Pv~Q(LQEEa)w=Vr-WN|2U?4
z295~`GmjXhQAAHFnd71E7Sf~r3)WM^-*Yd|tslBNKJntNUw+`kwO7yv+l@YGgM{&T
zh@gyRtP^ciK0X5_8r#4x+CRxjV2uO%)m6}S0;W~K%{B1+8u-nC@2U_-m?mU&%q+T=
z<C-}ulLusM$}-0@c`KWF$QG!^{I-dnzTQKfW{cjU@Au04T7}s=)NiJ2$DYU(UE3Mz
z@5~nR_K-E2wIS9-u8^nbrZTN)h#8E?Kh;wakg>fyUP{|Dn=tD*{t)}_nJ+<_qj1Ml
z#Md!jKiXD>FVXeQ_yPs2PAEO&EXM-4rYXCI0PYa31@O-i-Wb52AUqzxpC$a#K_Lmp
z4vqz;1s{%MjOmIG=dq2tMIVmimTAd{%lj=WLLO!y%s`ldFau!*!VH8N2s7|Mk%2$e
z-geD6b+y`<UH|jFLKu(EyV3Fm<J6C;Uy|)B?|%m1^6sy~v36%dpnZAwIgrL{cXkOW
zH^0$4bMa%w%x{cSzgs*!lx&`Fe$|*e@EQat*B8O`&*OUS&PQZCz|R9>%&mVO**!~c
zJyd-^mZ9oR<%QavC(-aF;$VM9+VB57vOUYj%%XAr&4b4Ir79!xvT<?Qy#)g7rU2FD
z1=TM0$M&8)&<|=+y7QQE>Od5W#>{26#+W^@0fZ}i%H{Hv6dYcbVIm{o>(!6`e|Qj-
zSU3iLGoQX{%#;>hNnXch8ngAU!IS!I@~ZKa5xG$NoTxoFA4y&Z{P{KTZ&t!pfVui-
zw?LYoTNm@9JW|OTqPvyw+2r*R=r(Ms>{G87v8f@283;2FW+2Q!n1L_@VFtnsgc%4k
z5N06E!2fdw@cY+|sCS@y@ZPaPZZea#oniPYIkMV%mEQcM?G!VG{BT@S^FCb_;$9&>
zBBaM;)^f)SPHwmlzpfH!Ib-QzD#Lfee9CfC@WF4~DrMc_=DSH_Pq}s;YbkoV!2#K-
z$d0P_H$wC9d(_Zd<?;i-Q^4`fg9{v9SBR0ta`|cC_$?MG^3V|xnTkbr)NHJN96pF4
zj%yAY!Tt_3=-Md1<lPR%R`_3hvs{+ImRR?eh7Z-=^kDT#ad7)R@7s4fenyo3Snnma
zLl6jKy72!4i2Dr$l3QY*jdpI{5IqYuBM?%UfiMGM2Eq)483;2FW+2Q!n1L_@VFupb
z4DfnIUZ2Qo0Oi9AR8_;((fY;BB>$AwIlhZzUI)2@WPXI%PBO2D#OEF)*8gR>TtNBT
zw3v|B2&VC&4G7mIB3&Z=JCrC+6TgXg1Mzy|%*aj5(>lbBq=-{R+>UlSaaimriR0Zy
zGTZ&VtlA6a5?Ur%EhdK#+$(zN36GcZ{1)ka{zfv#qwsGZ<MrYHWkg<=s%a_^uRG;+
zro66{*OB&gcHXNs9vdy?-I4|m`tXF`)K-#W%ZZj&J>I&9;2Sp#yJ4O9V>xJr{SpDq
zW7MG<8Q}WjO7_@qQL#l#(zqpap%H#IfbS!muLHL4g+fF$i1vg+uzg6l8ao0{_dKp8
z2!~I>Ki13F72~I&5D_;EzD^kbIut6k|D3dsiG-#sTNHx`mF+J89)XqIr{6<{K2|CI
zucSR(ErId!d+E2;TZhkKu1WiMde;%-F-S-q3qIZixaO0&cwFM!gh()=crV~FvCYdf
zYYzin7p)b1zhV4-vJb`?lkwSVg*$+6jcyY>u37Ui;!v~D6hfD&_=3c@iQxL{rwI?P
zr+xwO7>tudf+H*b0N`~n9uhR(<U1r#y-0ClWY7153lxXP8%O&E#o0smUHQ%kl(;_y
z&nsyE2E}g-#IK2Zr^=xvzXR}Hs}Lo00A3e`yKLZk=>dEz^p}=UcHDk(bj)#^^#ZKG
zw?;FjYfT6Mif(CqTptrFtMyGcXO7`|{UTVV3g$$%FluGZlv{9$rd65}_>M7ayLL*C
zSGK^N0vXeC9BbON^R6>3#vLnXo2gPRHw`X6$plMxm1$?c^>MrN`0-A9li8cn$0jF*
z`O&`SmP~%Uz;7-gPWO?H{-l{4=rUm+LDxqHI{JG%0ftwfX3`+7(RD<aJ$-|RI{M7P
z?(U<>A#<qXP+t-}g4-Mtyqn=)?O?D|mTL)lmJkI6wVeTk)q5MvRIy;D;q@r)d*~em
zt5ha$mWp;t$W!5Wt4hjR`H7M>VVnQ_-c&#y$%o(YLS>`HB2`SgG+?6zr9+1I0tR2v
z-eA|o>a8ALN^paR>?_q&eE%ziUYyRk)+lh-Q9RA1Odj@qObR_;aBY1eU(zR?!ldoE
z(>`dllz~k<nG``ChkBcEP)hT(RZI&#HJyhl6n7n^p%>Sy1QT?Qowd+G=s2W=KABYq
zeWCyb7ji0e9G75Oko~9IX&Q;?6!^2G{MC?D9$bdtRxUFJ&B5;1A^Spy-pIiauW)((
z+Yrvr;MU;1<qz(+<M|l}Mq59<7X+L`!R0S$t$k&r_U3skw?V=0AKYJt@74Xp_hZKJ
z_t@{x^8w}>8xjxte;Dw;!W@j-&+|^^TtCk{z55!)vw-8All^&K%KUM%!!}~>*q`T<
z8NhG~!~Q(aWqulTehTLQ6QIO7Cj0Zek~z=Ux&3U%`~>*poRwvsw=$1Y<-zuIo93W^
zIc0yIM>FSnG}j+I|1X0to)hc6-xd0O;pYc1kreE|uK?=z*T|1KiR8WVv&Hx`0slBD
zn6n)RV43;10{#h7F#lqp!`P4GeJ9}0^BU&-e8u*`^Z!2ibN+=!mc(Brkr}}(iXTD=
zo5=pJlL7O)JWEvw*8gLG{r*ej&-}@NKleYwKZ63SY4!F+@_d;0V+QS6X8v37t@Ziy
z{ClYhKp?hL(u&OZTcE(PM~@LJ^Iup$i!@LDhvOfK{kR{$1{j*KKR;K_??r1N67slm
zV1MRIpz`~B4sqqvzTzrN?8opj6cFS3dEVDf{y}>>9d;L003b%@9?t%EdWb5pzn}Bi
z@tdY8Am0b^I>u)eZV%u8HUY+M_xmUCV=B;nf#6)P(&C)6vi}+UVF9WMI0QuT55M$T
ASpWb4

diff --git a/build_test/CMakeFiles/CMakeConfigureLog.yaml b/build_test/CMakeFiles/CMakeConfigureLog.yaml
deleted file mode 100644
index 5bbed262c..000000000
--- a/build_test/CMakeFiles/CMakeConfigureLog.yaml
+++ /dev/null
@@ -1,294 +0,0 @@
-
----
-events:
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineSystem.cmake:205 (message)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      The system is: Linux - 6.11.0-1018-azure - x86_64
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerId.cmake:17 (message)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerId.cmake:64 (__determine_compiler_id_test)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCXXCompiler.cmake:126 (CMAKE_DETERMINE_COMPILER_ID)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      Compiling the CXX compiler identification source file "CMakeCXXCompilerId.cpp" succeeded.
-      Compiler: /usr/bin/c++ 
-      Build flags: 
-      Id flags:  
-      
-      The output was:
-      0
-      
-      
-      Compilation of the CXX compiler identification source "CMakeCXXCompilerId.cpp" produced "a.out"
-      
-      The CXX compiler identification is GNU, found in:
-        /home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/3.31.6/CompilerIdCXX/a.out
-      
-  -
-    kind: "try_compile-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:74 (try_compile)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
-      - "CMakeLists.txt:5 (project)"
-    checks:
-      - "Detecting CXX compiler ABI info"
-    directories:
-      source: "/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3"
-      binary: "/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3"
-    cmakeVariables:
-      CMAKE_CXX_FLAGS: ""
-      CMAKE_CXX_FLAGS_DEBUG: "-g"
-      CMAKE_CXX_SCAN_FOR_MODULES: "OFF"
-      CMAKE_EXE_LINKER_FLAGS: ""
-    buildResult:
-      variable: "CMAKE_CXX_ABI_COMPILED"
-      cached: true
-      stdout: |
-        Change Dir: '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3'
-        
-        Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_ba2ae/fast
-        /usr/bin/gmake  -f CMakeFiles/cmTC_ba2ae.dir/build.make CMakeFiles/cmTC_ba2ae.dir/build
-        gmake[1]: Entering directory '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3'
-        Building CXX object CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o
-        /usr/bin/c++   -v -o CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -c /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp
-        Using built-in specs.
-        COLLECT_GCC=/usr/bin/c++
-        OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
-        OFFLOAD_TARGET_DEFAULT=1
-        Target: x86_64-linux-gnu
-        Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
-        Thread model: posix
-        Supported LTO compression algorithms: zlib zstd
-        gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) 
-        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/'
-         /usr/libexec/gcc/x86_64-linux-gnu/13/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -D_GNU_SOURCE /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp -quiet -dumpdir CMakeFiles/cmTC_ba2ae.dir/ -dumpbase CMakeCXXCompilerABI.cpp.cpp -dumpbase-ext .cpp -mtune=generic -march=x86-64 -version -fasynchronous-unwind-tables -fstack-protector-strong -Wformat -Wformat-security -fstack-clash-protection -fcf-protection -o /tmp/cckrLaf7.s
-        GNU C++17 (Ubuntu 13.3.0-6ubuntu2~24.04) version 13.3.0 (x86_64-linux-gnu)
-        	compiled by GNU C version 13.3.0, GMP version 6.3.0, MPFR version 4.2.1, MPC version 1.3.1, isl version isl-0.26-GMP
-        
-        GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
-        ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/13"
-        ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"
-        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed/x86_64-linux-gnu"
-        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed"
-        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/../../../../x86_64-linux-gnu/include"
-        #include "..." search starts here:
-        #include <...> search starts here:
-         /usr/include/c++/13
-         /usr/include/x86_64-linux-gnu/c++/13
-         /usr/include/c++/13/backward
-         /usr/lib/gcc/x86_64-linux-gnu/13/include
-         /usr/local/include
-         /usr/include/x86_64-linux-gnu
-         /usr/include
-        End of search list.
-        Compiler executable checksum: c81c05345ce537099dafd5580045814a
-        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/'
-         as -v --64 -o CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o /tmp/cckrLaf7.s
-        GNU assembler version 2.42 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.42
-        COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/
-        LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/
-        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.'
-        Linking CXX executable cmTC_ba2ae
-        /usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_ba2ae.dir/link.txt --verbose=1
-        Using built-in specs.
-        COLLECT_GCC=/usr/bin/c++
-        COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper
-        OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
-        OFFLOAD_TARGET_DEFAULT=1
-        Target: x86_64-linux-gnu
-        Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
-        Thread model: posix
-        Supported LTO compression algorithms: zlib zstd
-        gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) 
-        COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/
-        LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/
-        COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_ba2ae' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_ba2ae.'
-         /usr/libexec/gcc/x86_64-linux-gnu/13/collect2 -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/cczMQRrO.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_ba2ae /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o
-        collect2 version 13.3.0
-        /usr/bin/ld -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/cczMQRrO.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_ba2ae /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o
-        GNU ld (GNU Binutils for Ubuntu) 2.42
-        COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_ba2ae' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_ba2ae.'
-        /usr/bin/c++  -v -Wl,-v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -o cmTC_ba2ae
-        gmake[1]: Leaving directory '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3'
-        
-      exitCode: 0
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:182 (message)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      Parsed CXX implicit include dir info: rv=done
-        found start of include info
-        found start of implicit include info
-          add: [/usr/include/c++/13]
-          add: [/usr/include/x86_64-linux-gnu/c++/13]
-          add: [/usr/include/c++/13/backward]
-          add: [/usr/lib/gcc/x86_64-linux-gnu/13/include]
-          add: [/usr/local/include]
-          add: [/usr/include/x86_64-linux-gnu]
-          add: [/usr/include]
-        end of search list found
-        collapse include dir [/usr/include/c++/13] ==> [/usr/include/c++/13]
-        collapse include dir [/usr/include/x86_64-linux-gnu/c++/13] ==> [/usr/include/x86_64-linux-gnu/c++/13]
-        collapse include dir [/usr/include/c++/13/backward] ==> [/usr/include/c++/13/backward]
-        collapse include dir [/usr/lib/gcc/x86_64-linux-gnu/13/include] ==> [/usr/lib/gcc/x86_64-linux-gnu/13/include]
-        collapse include dir [/usr/local/include] ==> [/usr/local/include]
-        collapse include dir [/usr/include/x86_64-linux-gnu] ==> [/usr/include/x86_64-linux-gnu]
-        collapse include dir [/usr/include] ==> [/usr/include]
-        implicit include dirs: [/usr/include/c++/13;/usr/include/x86_64-linux-gnu/c++/13;/usr/include/c++/13/backward;/usr/lib/gcc/x86_64-linux-gnu/13/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include]
-      
-      
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:218 (message)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      Parsed CXX implicit link information:
-        link line regex: [^( *|.*[/\\])(ld[0-9]*(\\.[a-z]+)?|CMAKE_LINK_STARTFILE-NOTFOUND|([^/\\]+-)?ld|collect2)[^/\\]*( |$)]
-        linker tool regex: [^[ 	]*(->|")?[ 	]*(([^"]*[/\\])?(ld[0-9]*(\\.[a-z]+)?))("|,| |$)]
-        ignore line: [Change Dir: '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3']
-        ignore line: []
-        ignore line: [Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_ba2ae/fast]
-        ignore line: [/usr/bin/gmake  -f CMakeFiles/cmTC_ba2ae.dir/build.make CMakeFiles/cmTC_ba2ae.dir/build]
-        ignore line: [gmake[1]: Entering directory '/home/runner/work/mscclpp/mscclpp/build_test/CMakeFiles/CMakeScratch/TryCompile-8luNu3']
-        ignore line: [Building CXX object CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o]
-        ignore line: [/usr/bin/c++   -v -o CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -c /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp]
-        ignore line: [Using built-in specs.]
-        ignore line: [COLLECT_GCC=/usr/bin/c++]
-        ignore line: [OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa]
-        ignore line: [OFFLOAD_TARGET_DEFAULT=1]
-        ignore line: [Target: x86_64-linux-gnu]
-        ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c ada c++ go d fortran objc obj-c++ m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32 m64 mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2]
-        ignore line: [Thread model: posix]
-        ignore line: [Supported LTO compression algorithms: zlib zstd]
-        ignore line: [gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) ]
-        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/']
-        ignore line: [ /usr/libexec/gcc/x86_64-linux-gnu/13/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -D_GNU_SOURCE /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp -quiet -dumpdir CMakeFiles/cmTC_ba2ae.dir/ -dumpbase CMakeCXXCompilerABI.cpp.cpp -dumpbase-ext .cpp -mtune=generic -march=x86-64 -version -fasynchronous-unwind-tables -fstack-protector-strong -Wformat -Wformat-security -fstack-clash-protection -fcf-protection -o /tmp/cckrLaf7.s]
-        ignore line: [GNU C++17 (Ubuntu 13.3.0-6ubuntu2~24.04) version 13.3.0 (x86_64-linux-gnu)]
-        ignore line: [	compiled by GNU C version 13.3.0  GMP version 6.3.0  MPFR version 4.2.1  MPC version 1.3.1  isl version isl-0.26-GMP]
-        ignore line: []
-        ignore line: [GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072]
-        ignore line: [ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/13"]
-        ignore line: [ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"]
-        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed/x86_64-linux-gnu"]
-        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed"]
-        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/../../../../x86_64-linux-gnu/include"]
-        ignore line: [#include "..." search starts here:]
-        ignore line: [#include <...> search starts here:]
-        ignore line: [ /usr/include/c++/13]
-        ignore line: [ /usr/include/x86_64-linux-gnu/c++/13]
-        ignore line: [ /usr/include/c++/13/backward]
-        ignore line: [ /usr/lib/gcc/x86_64-linux-gnu/13/include]
-        ignore line: [ /usr/local/include]
-        ignore line: [ /usr/include/x86_64-linux-gnu]
-        ignore line: [ /usr/include]
-        ignore line: [End of search list.]
-        ignore line: [Compiler executable checksum: c81c05345ce537099dafd5580045814a]
-        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/']
-        ignore line: [ as -v --64 -o CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o /tmp/cckrLaf7.s]
-        ignore line: [GNU assembler version 2.42 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.42]
-        ignore line: [COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/]
-        ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/]
-        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.']
-        ignore line: [Linking CXX executable cmTC_ba2ae]
-        ignore line: [/usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_ba2ae.dir/link.txt --verbose=1]
-        ignore line: [Using built-in specs.]
-        ignore line: [COLLECT_GCC=/usr/bin/c++]
-        ignore line: [COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper]
-        ignore line: [OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa]
-        ignore line: [OFFLOAD_TARGET_DEFAULT=1]
-        ignore line: [Target: x86_64-linux-gnu]
-        ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c ada c++ go d fortran objc obj-c++ m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32 m64 mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2]
-        ignore line: [Thread model: posix]
-        ignore line: [Supported LTO compression algorithms: zlib zstd]
-        ignore line: [gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) ]
-        ignore line: [COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/]
-        ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/]
-        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_ba2ae' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_ba2ae.']
-        link line: [ /usr/libexec/gcc/x86_64-linux-gnu/13/collect2 -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/cczMQRrO.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_ba2ae /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
-          arg [/usr/libexec/gcc/x86_64-linux-gnu/13/collect2] ==> ignore
-          arg [-plugin] ==> ignore
-          arg [/usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so] ==> ignore
-          arg [-plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper] ==> ignore
-          arg [-plugin-opt=-fresolution=/tmp/cczMQRrO.res] ==> ignore
-          arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
-          arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
-          arg [-plugin-opt=-pass-through=-lc] ==> ignore
-          arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
-          arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
-          arg [--build-id] ==> ignore
-          arg [--eh-frame-hdr] ==> ignore
-          arg [-m] ==> ignore
-          arg [elf_x86_64] ==> ignore
-          arg [--hash-style=gnu] ==> ignore
-          arg [--as-needed] ==> ignore
-          arg [-dynamic-linker] ==> ignore
-          arg [/lib64/ld-linux-x86-64.so.2] ==> ignore
-          arg [-pie] ==> ignore
-          arg [-znow] ==> ignore
-          arg [-zrelro] ==> ignore
-          arg [-o] ==> ignore
-          arg [cmTC_ba2ae] ==> ignore
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o]
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o]
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o]
-          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13]
-          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu]
-          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib]
-          arg [-L/lib/x86_64-linux-gnu] ==> dir [/lib/x86_64-linux-gnu]
-          arg [-L/lib/../lib] ==> dir [/lib/../lib]
-          arg [-L/usr/lib/x86_64-linux-gnu] ==> dir [/usr/lib/x86_64-linux-gnu]
-          arg [-L/usr/lib/../lib] ==> dir [/usr/lib/../lib]
-          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../..] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../..]
-          arg [-v] ==> ignore
-          arg [CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o] ==> ignore
-          arg [-lstdc++] ==> lib [stdc++]
-          arg [-lm] ==> lib [m]
-          arg [-lgcc_s] ==> lib [gcc_s]
-          arg [-lgcc] ==> lib [gcc]
-          arg [-lc] ==> lib [c]
-          arg [-lgcc_s] ==> lib [gcc_s]
-          arg [-lgcc] ==> lib [gcc]
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o]
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
-        ignore line: [collect2 version 13.3.0]
-        ignore line: [/usr/bin/ld -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/cczMQRrO.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_ba2ae /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_ba2ae.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
-        linker tool for 'CXX': /usr/bin/ld
-        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o] ==> [/usr/lib/x86_64-linux-gnu/Scrt1.o]
-        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o] ==> [/usr/lib/x86_64-linux-gnu/crti.o]
-        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o] ==> [/usr/lib/x86_64-linux-gnu/crtn.o]
-        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13] ==> [/usr/lib/gcc/x86_64-linux-gnu/13]
-        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
-        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib] ==> [/usr/lib]
-        collapse library dir [/lib/x86_64-linux-gnu] ==> [/lib/x86_64-linux-gnu]
-        collapse library dir [/lib/../lib] ==> [/lib]
-        collapse library dir [/usr/lib/x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
-        collapse library dir [/usr/lib/../lib] ==> [/usr/lib]
-        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../..] ==> [/usr/lib]
-        implicit libs: [stdc++;m;gcc_s;gcc;c;gcc_s;gcc]
-        implicit objs: [/usr/lib/x86_64-linux-gnu/Scrt1.o;/usr/lib/x86_64-linux-gnu/crti.o;/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o;/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o;/usr/lib/x86_64-linux-gnu/crtn.o]
-        implicit dirs: [/usr/lib/gcc/x86_64-linux-gnu/13;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib]
-        implicit fwks: []
-      
-      
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/Internal/CMakeDetermineLinkerId.cmake:40 (message)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:255 (cmake_determine_linker_id)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      Running the CXX compiler's linker: "/usr/bin/ld" "-v"
-      GNU ld (GNU Binutils for Ubuntu) 2.42
-...
diff --git a/build_test/CMakeFiles/cmake.check_cache b/build_test/CMakeFiles/cmake.check_cache
deleted file mode 100644
index 3dccd7317..000000000
--- a/build_test/CMakeFiles/cmake.check_cache
+++ /dev/null
@@ -1 +0,0 @@
-# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/build_test/include/mscclpp/version.hpp b/build_test/include/mscclpp/version.hpp
deleted file mode 100644
index 4c1c9a14a..000000000
--- a/build_test/include/mscclpp/version.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef MSCCLPP_VERSION_HPP_
-#define MSCCLPP_VERSION_HPP_
-
-#define MSCCLPP_MAJOR 0
-#define MSCCLPP_MINOR 8
-#define MSCCLPP_PATCH 0
-#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
-#define MSCCLPP_GIT_COMMIT "1818709de0a5"
-
-#endif  // MSCCLPP_VERSION_HPP_

From 305d15717edcbfc1061eed5295b5a03fcc549912 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 02:21:10 +0000
Subject: [PATCH 013/132] Remove PerfTestResult and reuse TestResult directly

- Add nlohmann::ordered_json metrics field to TestResult struct
- Add nlohmann/json.hpp include to test/framework.hpp
- Link test_framework with nlohmann_json::nlohmann_json
- Replace PerfTestResult with TestResult in test/perf/framework.cc
- Move perf utility functions to utils namespace for consistency
- Remove duplicate PerfTestResult struct definition

This consolidates the two similar structs into one, reducing code
duplication while maintaining all necessary fields for both unit
tests (passed/failure_message) and performance tests (metrics).

Verified build succeeds with Docker:
docker run --rm -v $(pwd):/workspace -w /workspace \
  ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 bash -c \
  "cd /workspace/build && make -j4 fifo_test"

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/CMakeLists.txt     |  2 +-
 test/framework.hpp      |  2 ++
 test/perf/fifo_test.cu  | 18 +++++++++---------
 test/perf/framework.cc  | 24 ++++++++----------------
 test/perf/framework.hpp |  2 ++
 test/unit/core_tests.cc |  4 ++--
 6 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 20c469d70..6b6941487 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -41,7 +41,7 @@ include(CTest)
 # Build test framework library
 add_library(test_framework STATIC framework.cc)
 target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${TEST_INC_COMMON})
-target_link_libraries(test_framework PUBLIC MPI::MPI_CXX)
+target_link_libraries(test_framework PUBLIC MPI::MPI_CXX nlohmann_json::nlohmann_json)
 
 # Unit tests
 add_executable(unit_tests)
diff --git a/test/framework.hpp b/test/framework.hpp
index f93471e94..34ef40841 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -12,6 +12,7 @@
 #include <iostream>
 #include <map>
 #include <mscclpp/gpu.hpp>
+#include <nlohmann/json.hpp>
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -26,6 +27,7 @@ struct TestResult {
   std::string test_name;
   std::string test_category;
   std::map<std::string, std::string> test_params;
+  nlohmann::ordered_json metrics;
   int num_processes;
   int process_rank;
   std::string timestamp;
diff --git a/test/perf/fifo_test.cu b/test/perf/fifo_test.cu
index bb77a1067..3e6980eb9 100644
--- a/test/perf/fifo_test.cu
+++ b/test/perf/fifo_test.cu
@@ -48,7 +48,7 @@ __global__ void kernelFifoPushSync(size_t numTriggers) {
 }
 
 static void setupCuda(int& cudaDevice, int& numaNode) {
-  utils::CUDA_CHECK(cudaGetDevice(&cudaDevice));
+  CUDA_CHECK(cudaGetDevice(&cudaDevice));
   numaNode = mscclpp::getDeviceNumaNode(cudaDevice);
   mscclpp::numaBind(numaNode);
 }
@@ -88,26 +88,26 @@ std::tuple<double, double, int, int> runSingleKernelVariant(void (*kernel)(size_
 
   // Warmup
   kernel<<<numParallel, 1, 0, stream>>>(warmupTriggers);
-  utils::CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(cudaGetLastError());
 
   // Process warmup triggers (note: total triggers = warmupTriggers * numParallel)
   if (!consumeTriggers(hostFifo, warmupTriggers, numParallel)) {
     return {0.0, 0.0, 0, 0};  // Return error values
   }
-  utils::CUDA_CHECK(cudaStreamSynchronize(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 
   // Benchmark
   utils::Timer timer;
   timer.start();
 
   kernel<<<numParallel, 1, 0, stream>>>(numTriggers);
-  utils::CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(cudaGetLastError());
 
   // Process all triggers
   if (!consumeTriggers(hostFifo, numTriggers, numParallel)) {
     return {0.0, 0.0, 0, 0};
   }
-  utils::CUDA_CHECK(cudaStreamSynchronize(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 
   timer.stop();
 
@@ -115,7 +115,7 @@ std::tuple<double, double, int, int> runSingleKernelVariant(void (*kernel)(size_
   double throughput = totalTriggers / timer.elapsedSeconds();
   double duration_us = timer.elapsedMicroseconds();
 
-  utils::CUDA_CHECK(cudaDeviceSynchronize());
+  CUDA_CHECK(cudaDeviceSynchronize());
 
   return {throughput, duration_us, totalTriggers, warmupTriggers * numParallel};
 }
@@ -165,10 +165,10 @@ void runFifoTest(const FifoTestConfig& config, [[maybe_unused]] int rank, [[mayb
   auto hostFifo = std::make_unique<mscclpp::Fifo>(config.fifoSize);
 
   mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle();
-  utils::CUDA_CHECK(cudaMemcpyToSymbol(gFifoDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle)));
+  CUDA_CHECK(cudaMemcpyToSymbol(gFifoDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle)));
 
   cudaStream_t stream;
-  utils::CUDA_CHECK(cudaStreamCreate(&stream));
+  CUDA_CHECK(cudaStreamCreate(&stream));
 
   // Create test name with parallelism range
   std::string testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_Parallel";
@@ -218,7 +218,7 @@ void runFifoTest(const FifoTestConfig& config, [[maybe_unused]] int rank, [[mayb
 
   utils::recordResult(testName, "fifo", combinedMetrics, testParams);
 
-  utils::CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
 void runAllFifoTests([[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] int localRank) {
diff --git a/test/perf/framework.cc b/test/perf/framework.cc
index 45a0c28c8..be1d812e3 100644
--- a/test/perf/framework.cc
+++ b/test/perf/framework.cc
@@ -11,19 +11,8 @@
 namespace mscclpp {
 namespace test {
 
-// Performance test result structure
-struct PerfTestResult {
-  std::string test_name;
-  std::string test_category;
-  std::map<std::string, std::string> test_params;
-  nlohmann::ordered_json metrics;
-  int num_processes;
-  int process_rank;
-  std::string timestamp;
-};
-
 // Global state for performance test results
-static std::vector<PerfTestResult> g_perf_results;
+static std::vector<TestResult> g_perf_results;
 
 namespace {
 std::string getCurrentTimestamp() {
@@ -35,15 +24,17 @@ std::string getCurrentTimestamp() {
 }
 }  // namespace
 
+namespace utils {
+
 void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
                   const std::map<std::string, std::string>& test_params) {
-  PerfTestResult result;
+  TestResult result;
   result.test_name = test_name;
   result.test_category = test_category;
   result.test_params = test_params;
   result.metrics = metrics;
-  result.num_processes = utils::getMPISize();
-  result.process_rank = utils::getMPIRank();
+  result.num_processes = getMPISize();
+  result.process_rank = getMPIRank();
   result.timestamp = getCurrentTimestamp();
 
   g_perf_results.push_back(result);
@@ -70,7 +61,7 @@ void writeResultsToFile(const std::string& filename) {
 }
 
 void printResults(bool verbose) {
-  if (!utils::isMainRank()) return;
+  if (!isMainRank()) return;
 
   std::cout << "\n=== Test Results ===" << std::endl;
 
@@ -92,5 +83,6 @@ void printResults(bool verbose) {
   std::cout << std::endl;
 }
 
+}  // namespace utils
 }  // namespace test
 }  // namespace mscclpp
diff --git a/test/perf/framework.hpp b/test/perf/framework.hpp
index 094d5cb13..7f7401877 100644
--- a/test/perf/framework.hpp
+++ b/test/perf/framework.hpp
@@ -13,6 +13,7 @@
 
 namespace mscclpp {
 namespace test {
+namespace utils {
 
 // Additional performance test utilities not in the base framework
 
@@ -24,6 +25,7 @@ void recordResult(const std::string& test_name, const std::string& test_category
 void writeResultsToFile(const std::string& filename);
 void printResults(bool verbose = false);
 
+}  // namespace utils
 }  // namespace test
 }  // namespace mscclpp
 
diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc
index 13437872d..f026c05e6 100644
--- a/test/unit/core_tests.cc
+++ b/test/unit/core_tests.cc
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "../framework.hpp"
-
 #include <mscclpp/core.hpp>
 
+#include "../framework.hpp"
+
 class LocalCommunicatorTest : public ::mscclpp::test::TestCase {
  protected:
   void SetUp() override {

From b1f458eca35c5963e737c562d4d297debeb5bd97 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 02:25:28 +0000
Subject: [PATCH 014/132] Convert test framework identifiers from snake_case to
 camelCase

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 build2/CMakeCache.txt                         | 393 ++++++++
 .../CMakeFiles/3.31.6/CMakeCXXCompiler.cmake  | 101 ++
 .../3.31.6/CMakeDetermineCompilerABI_CXX.bin  | Bin 0 -> 15992 bytes
 build2/CMakeFiles/3.31.6/CMakeSystem.cmake    |  15 +
 .../CompilerIdCXX/CMakeCXXCompilerId.cpp      | 919 ++++++++++++++++++
 build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out  | Bin 0 -> 16096 bytes
 build2/CMakeFiles/CMakeConfigureLog.yaml      | 294 ++++++
 build2/CMakeFiles/cmake.check_cache           |   1 +
 build2/include/mscclpp/version.hpp            |  13 +
 test/framework.cc                             | 104 +-
 test/framework.hpp                            |  38 +-
 test/perf/framework.cc                        |  38 +-
 test/perf/framework.hpp                       |   4 +-
 13 files changed, 1828 insertions(+), 92 deletions(-)
 create mode 100644 build2/CMakeCache.txt
 create mode 100644 build2/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
 create mode 100755 build2/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin
 create mode 100644 build2/CMakeFiles/3.31.6/CMakeSystem.cmake
 create mode 100644 build2/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
 create mode 100755 build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out
 create mode 100644 build2/CMakeFiles/CMakeConfigureLog.yaml
 create mode 100644 build2/CMakeFiles/cmake.check_cache
 create mode 100644 build2/include/mscclpp/version.hpp

diff --git a/build2/CMakeCache.txt b/build2/CMakeCache.txt
new file mode 100644
index 000000000..c404aca8d
--- /dev/null
+++ b/build2/CMakeCache.txt
@@ -0,0 +1,393 @@
+# This is the CMakeCache file.
+# For build in directory: /home/runner/work/mscclpp/mscclpp/build2
+# It was generated by CMake: /usr/local/bin/cmake
+# You can edit this file to change values found and used by cmake.
+# If you do not want to change any of the values, simply exit the editor.
+# If you do want to change a value, simply edit, save, and exit the editor.
+# The syntax for the file is as follows:
+# KEY:TYPE=VALUE
+# KEY is the name of a variable in the cache.
+# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
+# VALUE is the current value for the KEY.
+
+########################
+# EXTERNAL cache entries
+########################
+
+//Path to a program.
+CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
+
+//Path to a program.
+CMAKE_AR:FILEPATH=/usr/bin/ar
+
+//Choose the type of build, options are: None Debug Release RelWithDebInfo
+// MinSizeRel ...
+CMAKE_BUILD_TYPE:STRING=Release
+
+//Enable/Disable color output during build.
+CMAKE_COLOR_MAKEFILE:BOOL=ON
+
+//CXX compiler
+CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++
+
+//A wrapper around 'ar' adding the appropriate '--plugin' option
+// for the GCC compiler
+CMAKE_CXX_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar-13
+
+//A wrapper around 'ranlib' adding the appropriate '--plugin' option
+// for the GCC compiler
+CMAKE_CXX_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib-13
+
+//Flags used by the CXX compiler during all build types.
+CMAKE_CXX_FLAGS:STRING=
+
+//Flags used by the CXX compiler during DEBUG builds.
+CMAKE_CXX_FLAGS_DEBUG:STRING=-g
+
+//Flags used by the CXX compiler during MINSIZEREL builds.
+CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
+
+//Flags used by the CXX compiler during RELEASE builds.
+CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
+
+//Flags used by the CXX compiler during RELWITHDEBINFO builds.
+CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
+
+//Path to a program.
+CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
+
+//Flags used by the linker during all build types.
+CMAKE_EXE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during DEBUG builds.
+CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during MINSIZEREL builds.
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during RELEASE builds.
+CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during RELWITHDEBINFO builds.
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Enable/Disable output of compile commands during generation.
+CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
+
+//Value Computed by CMake.
+CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/pkgRedirects
+
+//Install path prefix, prepended onto install directories.
+CMAKE_INSTALL_PREFIX:PATH=/usr/local
+
+//Path to a program.
+CMAKE_LINKER:FILEPATH=/usr/bin/ld
+
+//Path to a program.
+CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
+
+//Flags used by the linker during the creation of modules during
+// all build types.
+CMAKE_MODULE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of modules during
+// DEBUG builds.
+CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of modules during
+// MINSIZEREL builds.
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of modules during
+// RELEASE builds.
+CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of modules during
+// RELWITHDEBINFO builds.
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_NM:FILEPATH=/usr/bin/nm
+
+//Path to a program.
+CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
+
+//Path to a program.
+CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
+
+//Value Computed by CMake
+CMAKE_PROJECT_DESCRIPTION:STATIC=
+
+//Value Computed by CMake
+CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
+
+//Value Computed by CMake
+CMAKE_PROJECT_NAME:STATIC=mscclpp
+
+//Path to a program.
+CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
+
+//Path to a program.
+CMAKE_READELF:FILEPATH=/usr/bin/readelf
+
+//Flags used by the linker during the creation of shared libraries
+// during all build types.
+CMAKE_SHARED_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during DEBUG builds.
+CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during MINSIZEREL builds.
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during RELEASE builds.
+CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during RELWITHDEBINFO builds.
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//If set, runtime paths are not added when installing shared libraries,
+// but are added when building.
+CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
+
+//If set, runtime paths are not added when using shared libraries.
+CMAKE_SKIP_RPATH:BOOL=NO
+
+//Flags used by the linker during the creation of static libraries
+// during all build types.
+CMAKE_STATIC_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during DEBUG builds.
+CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during MINSIZEREL builds.
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during RELEASE builds.
+CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during RELWITHDEBINFO builds.
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_STRIP:FILEPATH=/usr/bin/strip
+
+//Path to a program.
+CMAKE_TAPI:FILEPATH=CMAKE_TAPI-NOTFOUND
+
+//If this value is on, makefiles will be generated without the
+// .SILENT directive, and all commands will be echoed to the console
+// during the make.  This is useful for debugging only. With Visual
+// Studio IDE projects all commands are done without /nologo.
+CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
+
+//Path to a program.
+CUDAToolkit_NVCC_EXECUTABLE:FILEPATH=CUDAToolkit_NVCC_EXECUTABLE-NOTFOUND
+
+//Path to a file.
+CUDAToolkit_SENTINEL_FILE:FILEPATH=CUDAToolkit_SENTINEL_FILE-NOTFOUND
+
+//Git command line client
+GIT_EXECUTABLE:FILEPATH=/usr/bin/git
+
+//Build collective algorithms
+MSCCLPP_BUILD_EXT_COLLECTIVES:BOOL=ON
+
+//Build NCCL interfaces
+MSCCLPP_BUILD_EXT_NCCL:BOOL=ON
+
+//Build Python bindings
+MSCCLPP_BUILD_PYTHON_BINDINGS:BOOL=ON
+
+//Build tests
+MSCCLPP_BUILD_TESTS:BOOL=ON
+
+//Bypass GPU check.
+MSCCLPP_BYPASS_GPU_CHECK:BOOL=OFF
+
+//Enable code coverage
+MSCCLPP_ENABLE_COVERAGE:BOOL=OFF
+
+//Enable tracing
+MSCCLPP_ENABLE_TRACE:BOOL=OFF
+
+//Specify GPU architectures with delimiters (comma, space, or semicolon).
+MSCCLPP_GPU_ARCHS:STRING=
+
+//Set NPKIT flags
+MSCCLPP_NPKIT_FLAGS:BOOL=OFF
+
+//Use NVIDIA/CUDA.
+MSCCLPP_USE_CUDA:BOOL=OFF
+
+//Use InfiniBand.
+MSCCLPP_USE_IB:BOOL=ON
+
+//Use AMD/ROCm.
+MSCCLPP_USE_ROCM:BOOL=OFF
+
+//The directory containing a CMake configuration file for hip.
+hip_DIR:PATH=hip_DIR-NOTFOUND
+
+//Value Computed by CMake
+mscclpp_BINARY_DIR:STATIC=/home/runner/work/mscclpp/mscclpp/build2
+
+//Value Computed by CMake
+mscclpp_IS_TOP_LEVEL:STATIC=ON
+
+//Value Computed by CMake
+mscclpp_SOURCE_DIR:STATIC=/home/runner/work/mscclpp/mscclpp
+
+
+########################
+# INTERNAL cache entries
+########################
+
+//ADVANCED property for variable: CMAKE_ADDR2LINE
+CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_AR
+CMAKE_AR-ADVANCED:INTERNAL=1
+//This is the directory where this CMakeCache.txt was created
+CMAKE_CACHEFILE_DIR:INTERNAL=/home/runner/work/mscclpp/mscclpp/build2
+//Major version of cmake used to create the current loaded cache
+CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
+//Minor version of cmake used to create the current loaded cache
+CMAKE_CACHE_MINOR_VERSION:INTERNAL=31
+//Patch version of cmake used to create the current loaded cache
+CMAKE_CACHE_PATCH_VERSION:INTERNAL=6
+//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
+CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
+//Path to CMake executable.
+CMAKE_COMMAND:INTERNAL=/usr/local/bin/cmake
+//Path to cpack program executable.
+CMAKE_CPACK_COMMAND:INTERNAL=/usr/local/bin/cpack
+//Path to ctest program executable.
+CMAKE_CTEST_COMMAND:INTERNAL=/usr/local/bin/ctest
+//ADVANCED property for variable: CMAKE_CXX_COMPILER
+CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_COMPILER_AR
+CMAKE_CXX_COMPILER_AR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_COMPILER_RANLIB
+CMAKE_CXX_COMPILER_RANLIB-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS
+CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
+CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
+CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
+CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
+CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_DLLTOOL
+CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
+//Path to cache edit program executable.
+CMAKE_EDIT_COMMAND:INTERNAL=/usr/local/bin/ccmake
+//Executable file format
+CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
+CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
+CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
+CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
+CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
+//Name of external makefile project generator.
+CMAKE_EXTRA_GENERATOR:INTERNAL=
+//Name of generator.
+CMAKE_GENERATOR:INTERNAL=Unix Makefiles
+//Generator instance identifier.
+CMAKE_GENERATOR_INSTANCE:INTERNAL=
+//Name of generator platform.
+CMAKE_GENERATOR_PLATFORM:INTERNAL=
+//Name of generator toolset.
+CMAKE_GENERATOR_TOOLSET:INTERNAL=
+//Source directory with the top level CMakeLists.txt file for this
+// project
+CMAKE_HOME_DIRECTORY:INTERNAL=/home/runner/work/mscclpp/mscclpp
+//Install .so files without execute permission.
+CMAKE_INSTALL_SO_NO_EXE:INTERNAL=1
+//ADVANCED property for variable: CMAKE_LINKER
+CMAKE_LINKER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
+CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
+CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
+CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
+CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_NM
+CMAKE_NM-ADVANCED:INTERNAL=1
+//number of local generators
+CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
+//ADVANCED property for variable: CMAKE_OBJCOPY
+CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_OBJDUMP
+CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
+//Platform information initialized
+CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_RANLIB
+CMAKE_RANLIB-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_READELF
+CMAKE_READELF-ADVANCED:INTERNAL=1
+//Path to CMake installation.
+CMAKE_ROOT:INTERNAL=/usr/local/share/cmake-3.31
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
+CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
+CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
+CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
+CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_RPATH
+CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
+CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
+CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
+CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STRIP
+CMAKE_STRIP-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_TAPI
+CMAKE_TAPI-ADVANCED:INTERNAL=1
+//uname command
+CMAKE_UNAME:INTERNAL=/usr/bin/uname
+//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
+CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
+//Details about finding Git
+FIND_PACKAGE_MESSAGE_DETAILS_Git:INTERNAL=[/usr/bin/git][v2.52.0()]
+//ADVANCED property for variable: GIT_EXECUTABLE
+GIT_EXECUTABLE-ADVANCED:INTERNAL=1
+//linker supports push/pop state
+_CMAKE_CXX_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
+//linker supports push/pop state
+_CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
+
diff --git a/build2/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake b/build2/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
new file mode 100644
index 000000000..14f6ae31d
--- /dev/null
+++ b/build2/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
@@ -0,0 +1,101 @@
+set(CMAKE_CXX_COMPILER "/usr/bin/c++")
+set(CMAKE_CXX_COMPILER_ARG1 "")
+set(CMAKE_CXX_COMPILER_ID "GNU")
+set(CMAKE_CXX_COMPILER_VERSION "13.3.0")
+set(CMAKE_CXX_COMPILER_VERSION_INTERNAL "")
+set(CMAKE_CXX_COMPILER_WRAPPER "")
+set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "17")
+set(CMAKE_CXX_EXTENSIONS_COMPUTED_DEFAULT "ON")
+set(CMAKE_CXX_STANDARD_LATEST "23")
+set(CMAKE_CXX_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters;cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates;cxx_std_17;cxx_std_20;cxx_std_23")
+set(CMAKE_CXX98_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters")
+set(CMAKE_CXX11_COMPILE_FEATURES "cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
+set(CMAKE_CXX14_COMPILE_FEATURES "cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
+set(CMAKE_CXX17_COMPILE_FEATURES "cxx_std_17")
+set(CMAKE_CXX20_COMPILE_FEATURES "cxx_std_20")
+set(CMAKE_CXX23_COMPILE_FEATURES "cxx_std_23")
+set(CMAKE_CXX26_COMPILE_FEATURES "")
+
+set(CMAKE_CXX_PLATFORM_ID "Linux")
+set(CMAKE_CXX_SIMULATE_ID "")
+set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "GNU")
+set(CMAKE_CXX_SIMULATE_VERSION "")
+
+
+
+
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_CXX_COMPILER_AR "/usr/bin/gcc-ar-13")
+set(CMAKE_RANLIB "/usr/bin/ranlib")
+set(CMAKE_CXX_COMPILER_RANLIB "/usr/bin/gcc-ranlib-13")
+set(CMAKE_LINKER "/usr/bin/ld")
+set(CMAKE_LINKER_LINK "")
+set(CMAKE_LINKER_LLD "")
+set(CMAKE_CXX_COMPILER_LINKER "/usr/bin/ld")
+set(CMAKE_CXX_COMPILER_LINKER_ID "GNU")
+set(CMAKE_CXX_COMPILER_LINKER_VERSION 2.42)
+set(CMAKE_CXX_COMPILER_LINKER_FRONTEND_VARIANT GNU)
+set(CMAKE_MT "")
+set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
+set(CMAKE_COMPILER_IS_GNUCXX 1)
+set(CMAKE_CXX_COMPILER_LOADED 1)
+set(CMAKE_CXX_COMPILER_WORKS TRUE)
+set(CMAKE_CXX_ABI_COMPILED TRUE)
+
+set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
+
+set(CMAKE_CXX_COMPILER_ID_RUN 1)
+set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;m;mm;mpp;CPP;ixx;cppm;ccm;cxxm;c++m)
+set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
+
+foreach (lang IN ITEMS C OBJC OBJCXX)
+  if (CMAKE_${lang}_COMPILER_ID_RUN)
+    foreach(extension IN LISTS CMAKE_${lang}_SOURCE_FILE_EXTENSIONS)
+      list(REMOVE_ITEM CMAKE_CXX_SOURCE_FILE_EXTENSIONS ${extension})
+    endforeach()
+  endif()
+endforeach()
+
+set(CMAKE_CXX_LINKER_PREFERENCE 30)
+set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
+set(CMAKE_CXX_LINKER_DEPFILE_SUPPORTED )
+
+# Save compiler ABI information.
+set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
+set(CMAKE_CXX_COMPILER_ABI "ELF")
+set(CMAKE_CXX_BYTE_ORDER "LITTLE_ENDIAN")
+set(CMAKE_CXX_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
+
+if(CMAKE_CXX_SIZEOF_DATA_PTR)
+  set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_CXX_COMPILER_ABI)
+  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
+endif()
+
+if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
+  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
+endif()
+
+set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
+if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
+  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
+endif()
+
+
+
+
+
+set(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "/usr/include/c++/13;/usr/include/x86_64-linux-gnu/c++/13;/usr/include/c++/13/backward;/usr/lib/gcc/x86_64-linux-gnu/13/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include")
+set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "stdc++;m;gcc_s;gcc;c;gcc_s;gcc")
+set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-linux-gnu/13;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib")
+set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
+set(CMAKE_CXX_COMPILER_CLANG_RESOURCE_DIR "")
+
+set(CMAKE_CXX_COMPILER_IMPORT_STD "")
+### Imported target for C++23 standard library
+set(CMAKE_CXX23_COMPILER_IMPORT_STD_NOT_FOUND_MESSAGE "Unsupported generator: Unix Makefiles")
+
+
+
diff --git a/build2/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin b/build2/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin
new file mode 100755
index 0000000000000000000000000000000000000000..e90f3f71d98d8b48fdca37fdc4f6d991fd1db519
GIT binary patch
literal 15992
zcmeHOYit}>6~4Q9xipD4Y0{XaG)rkv(&C9<F-{<K9ebTw<k%r`pb)}j);qRGtar`s
zEVT=iKzP&&K>;D4KR{7b9#VzWB18~B+O2|G6~rSFg`oZkluAK_))g&sA!Ipc?)lc^
z(YodJ1Btn-o$sFSoOAD;bMNflnYs7l>A`_`ET)i_sdp%rQVGqZMA7qB$q=Mek6J^=
zH>g|GN|KlRoYto_kXENl@x|CA{4zrJYvD`-yhYPggHC86Bl|6t=2mD8P|10)pRW=b
zJn#{z00_QbUs7re;fVMFgMJ*FxmN8rw|6lnB`(_q;m0ETDMQ;+cjzQomHL2)C&z@p
zJrd6_wn;I-u-}CEg|T1!fLsTs!_RrSf2Y2K;&&$L7o)=X7ELQ4>U$UY`Ee2bYXQ3X
zkkq$SKO`jnKnbtfnRm0@T|4u+*1TJ&Ot((=bhmbQ8ReqU;aAP=O466d)c&C(ii)W+
zCt+0a6Iw=jtlJ=Zw*TRV!E;T|eD<lsJAcyk*c}_b`>XiRpJy9xH~X*+CoT^|gk{ci
zoou7y@d?Vw*e1N_{A|)EmN>BA`Ubi_;*t$`YYD!v1b-9pw>2n7Sr$cf)GB*+$+ISH
zw?NG3v~7*K1v~HF>nK)pe7n{D!OXrstHbCpcGdHpUCPRg9I$du$r*Rco>Lk*(3dY3
zoDn;lcc`rK$znlDx3p<PLylm~|LC5Ik<9JIc&Ti5Z{Vo&_+##SU-&YGIZnTLI^jCT
z^^;tu`FXj%!C#gFn^Ia29`dETG|zp=eS&m3zz6&NN`S{0W1qPI&*KMaKETUQB2*DZ
z5r`rXMIeem6oDuLQ3Rq0{2xc)&&{{~)jWB%$vm~<H#?OwKV9|WwO^Pgf7Eork4kOV
zIihRZ9;9RQ)|6uV+O|hY8f)I#uY9@vPnp?^A24TsXP*51+`*A_d$s*3^Yq>yQvtP&
zWiowf%xK>FDZf18A0Wm&z2b`uyXU=)RQ0<#PgUPgyWG6>1RGuuBzxDl-<4(9aowDq
zGarBcF7xsEWoGON^Wt@H0~N4M3TUcb*6o5nxA(+eR;$XLN6eFZ<D4~TpYv9mr}nNS
z;mVF$t#&0xhbLD2o$k70$H=!{Kl}gT9#V4V2>H!^?5a6ix%_1M8aMM)`l|U=^Yq52
z*HU=CzdX_WXf>9;ChP`2&1YD1etEq4d|30_Mw*R(43%{4*afcI@1uIJaMe+YA`nF&
zia->BC<0Lgq6kD0h$0Y0Ac{Z~fhYq1d<6LY*Q=$>(7^DXGQFQGj#;@WuXMDn=UC8w
zC^I~e-Q&$zPO0eRj+Qd}to=jjO#e`?^6h;8?2PAF#S*={J35#d85vAl>7o8i?+{t|
zdOPbLrF97G5ZkisZT#+y-({V7p;kLic$V;f!iNb>!UyJRwX=kr_?;@J*u95TY&sF!
zvU*k18G50{Jg*%%PCjpDgZ@?i8@byl+eP2)#QVhB#K78?cQ)U6Ptyr?*XG@Kbl&d2
zzGVOR(>DP-%5&l}J^H>#{70BbuT6X=-nV9DyhJrK5v3>sQ3Rq0L=lK05Je!0Koo%}
z0#O8_2>fqE0P7X8J`rmV{hJ<Y;%YQg)-SFR`9WFd_<E7C4swggxb@jAGS)-#{SqhW
zU%p-|viz_tV#M0S3BKW@q}Q}6bxHKE)3mx@@J7KF!Ht3dtc|S7`o~qGXp@T2j;ipq
z*wara?^cmv_qUpEFU85Hu8XV}lhX_C1-<V{x2FF2&B^(^A~M<~#sBvJ>%;%U60t6I
ze_!98<n|-kO2Mln+dGX;qph{O;)@;kb#xhRT|0z+^$K}hEmtqr!d4vb7->Ey0ZEDh
zuN!V;&;1csYt@vDM=@7P;m?NnPT?`WVV|K)Otq*)N;4SuyvjO8PYW<!wN|N*Qikir
z^#Y#9VNBhmF#f@Ri!zPc|Cn!|P|2jW#CUyL_>}M%cP|TnTzCQ1LJf|oggPMvtrGCl
zQgPen+pkv#-zbIwXw=S5-=10*8c%O0Ua58Ub^0h~*tfq~;W`8F5Z`Eh`6r1_!YF{>
z@%c?kr2-^nzfOEYZL0SdwBI0peY{!W_Xzw$VjnK&2Y&gmTEHiXUl-q`Fz%uGCG%9X
zN@_+fWA!ZY2^v2wDOhUc{UYmWoTOwN`p=q3bw%tk-r)6;*zb_vQ~wzfDPJL;+Y`25
z5wAA|MfkXt_}dmSTG&JU`Z)bchOP^Bc(mlT8%0_vPfyz{&mLDql)cK>m@%prR@GbH
zq&3Rx>dR!AD_Z0EV%E-EIj>kMTXtnyjTR@T@{Z@^jJC!WyrSQ=>{7|5hk^yKG^55!
z_M~IwDwC5l<Pwl9vh)_2_8qW4==9xvcOTW_=ABaSzKk(CHKnZg4Yqf?g|VU)coxZQ
zhh`U^Fj`r6oa)WFHtjGV{chhYpwGLWmv;gtJ-!7+g&H?-sP};Xbkd?t1pV(F>OGL@
zBbs(&SZPzVX8$2&?H?T8*E?tp4-6bmk60tU`{<!28HV;aq_CCYwYD!fIoq?9A37?9
z1-+MngvA>htX#QhP1uDTZ+gfKlU2?wSe3GqQ+!HfpDmZgS9V#@MhSl2%4ftoC>m~y
zSiBdb-fZ51;dc`4M=H-udUlr3D`}iS&MnY(j45Rlik@SP7b?b7sW|17yqN%%t+=$8
z#?1*u{o2Z7&^Mp3%M;4T%@n8#jb2G>KJ1jrZn3aPut-;O@-{mtgGZ1urt<n=j29{6
zIn#9HVMvxmKeC21Ap>tBNB)qszaD|w19>Xko^(g4IovS@1yva|^e1UVH@NElb&BUr
zbjjDBzK8e0Vcvw2**2KoL;}xk=yLbdQv1C`U7vqJ?xsx8KfLdYpOXg@eh0zv|7p-4
z|L4FY3<bmf?;-v#G&e%~F&_k?e#{3kA49P=Wq2+Kf6NzwXT*@($gzVz=6No0JOzP2
z=AS_RpAV*R{69oWp8LTc^F1Ku(P%&HfcKF<&m|#aJ_&4-%ERqPn@&@PV+w!FZ-G@Y
zME&9O{|f2(oS?7&U&#Lk=JisHUl;O>U!!l(KPi4d5$i6Hf#*X0ZK43e4h294J{0m#
zi2|4lbr}3m-XkG@%qM`j?}2@I{GJzo#9t-FQt<O40)&RB^t^DP|IUa3kl%p?Q@H-0
zl9Epm^;eVH8u%qG){p3a5Wl7j&mnPNg83}=Nrvqq1D_?|=72xu&-1NBQi7e97G&@*
zkb=h^>aWi`4ee3olcU7rpA-DhkKZJYP2i7tXmuxBE0yw(3kUcE=SdaxuRFA9AJl^q
z;0O6SWtc<#n71XwKWs0j19!EI2<F7R&cpxCI-@i24<h<LXqu7&zby^p>-c8+qCNQi
l<NGkQJ?MXhZ=fipLWQGVt>rm#WB={^$3kg!$RQ-Ee*hEc8gl>u

literal 0
HcmV?d00001

diff --git a/build2/CMakeFiles/3.31.6/CMakeSystem.cmake b/build2/CMakeFiles/3.31.6/CMakeSystem.cmake
new file mode 100644
index 000000000..b2715a602
--- /dev/null
+++ b/build2/CMakeFiles/3.31.6/CMakeSystem.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_HOST_SYSTEM "Linux-6.11.0-1018-azure")
+set(CMAKE_HOST_SYSTEM_NAME "Linux")
+set(CMAKE_HOST_SYSTEM_VERSION "6.11.0-1018-azure")
+set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
+
+
+
+set(CMAKE_SYSTEM "Linux-6.11.0-1018-azure")
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_VERSION "6.11.0-1018-azure")
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+
+set(CMAKE_CROSSCOMPILING "FALSE")
+
+set(CMAKE_SYSTEM_LOADED 1)
diff --git a/build2/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp b/build2/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
new file mode 100644
index 000000000..3b6e114ca
--- /dev/null
+++ b/build2/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
@@ -0,0 +1,919 @@
+/* This source file must have a .cpp extension so that all C++ compilers
+   recognize the extension without flags.  Borland does not know .cxx for
+   example.  */
+#ifndef __cplusplus
+# error "A C compiler has been selected for C++."
+#endif
+
+#if !defined(__has_include)
+/* If the compiler does not have __has_include, pretend the answer is
+   always no.  */
+#  define __has_include(x) 0
+#endif
+
+
+/* Version number components: V=Version, R=Revision, P=Patch
+   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
+
+#if defined(__INTEL_COMPILER) || defined(__ICC)
+# define COMPILER_ID "Intel"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# if defined(__GNUC__)
+#  define SIMULATE_ID "GNU"
+# endif
+  /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
+     except that a few beta releases use the old format with V=2021.  */
+# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
+#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
+#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
+#  if defined(__INTEL_COMPILER_UPDATE)
+#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
+#  else
+#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
+#  endif
+# else
+#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
+#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
+   /* The third version component from --version is an update index,
+      but no macro is provided for it.  */
+#  define COMPILER_VERSION_PATCH DEC(0)
+# endif
+# if defined(__INTEL_COMPILER_BUILD_DATE)
+   /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
+#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
+# endif
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# if defined(__GNUC__)
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+# elif defined(__GNUG__)
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+# endif
+# if defined(__GNUC_MINOR__)
+#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
+# define COMPILER_ID "IntelLLVM"
+#if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_ID "GNU"
+#endif
+/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
+ * later.  Look for 6 digit vs. 8 digit version number to decide encoding.
+ * VVVV is no smaller than the current year when a version is released.
+ */
+#if __INTEL_LLVM_COMPILER < 1000000L
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER    % 10)
+#else
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER     % 100)
+#endif
+#if defined(_MSC_VER)
+  /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+#elif defined(__GNUG__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+#endif
+#if defined(__GNUC_MINOR__)
+# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+#endif
+#if defined(__GNUC_PATCHLEVEL__)
+# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+#endif
+
+#elif defined(__PATHCC__)
+# define COMPILER_ID "PathScale"
+# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
+# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
+# if defined(__PATHCC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
+# endif
+
+#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
+# define COMPILER_ID "Embarcadero"
+# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
+# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
+# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
+
+#elif defined(__BORLANDC__)
+# define COMPILER_ID "Borland"
+  /* __BORLANDC__ = 0xVRR */
+# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
+# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
+
+#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
+# define COMPILER_ID "Watcom"
+   /* __WATCOMC__ = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__WATCOMC__)
+# define COMPILER_ID "OpenWatcom"
+   /* __WATCOMC__ = VVRP + 1100 */
+# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__SUNPRO_CC)
+# define COMPILER_ID "SunPro"
+# if __SUNPRO_CC >= 0x5100
+   /* __SUNPRO_CC = 0xVRRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
+# else
+   /* __SUNPRO_CC = 0xVRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
+# endif
+
+#elif defined(__HP_aCC)
+# define COMPILER_ID "HP"
+  /* __HP_aCC = VVRRPP */
+# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
+# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__HP_aCC     % 100)
+
+#elif defined(__DECCXX)
+# define COMPILER_ID "Compaq"
+  /* __DECCXX_VER = VVRRTPPPP */
+# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
+# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000  % 100)
+# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER         % 10000)
+
+#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
+# define COMPILER_ID "zOS"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__open_xl__) && defined(__clang__)
+# define COMPILER_ID "IBMClang"
+# define COMPILER_VERSION_MAJOR DEC(__open_xl_version__)
+# define COMPILER_VERSION_MINOR DEC(__open_xl_release__)
+# define COMPILER_VERSION_PATCH DEC(__open_xl_modification__)
+# define COMPILER_VERSION_TWEAK DEC(__open_xl_ptf_fix_level__)
+
+
+#elif defined(__ibmxl__) && defined(__clang__)
+# define COMPILER_ID "XLClang"
+# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
+# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
+# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
+# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
+
+
+#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
+# define COMPILER_ID "XL"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
+# define COMPILER_ID "VisualAge"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__NVCOMPILER)
+# define COMPILER_ID "NVHPC"
+# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
+# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
+# if defined(__NVCOMPILER_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
+# endif
+
+#elif defined(__PGI)
+# define COMPILER_ID "PGI"
+# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
+# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
+# if defined(__PGIC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
+# endif
+
+#elif defined(__clang__) && defined(__cray__)
+# define COMPILER_ID "CrayClang"
+# define COMPILER_VERSION_MAJOR DEC(__cray_major__)
+# define COMPILER_VERSION_MINOR DEC(__cray_minor__)
+# define COMPILER_VERSION_PATCH DEC(__cray_patchlevel__)
+# define COMPILER_VERSION_INTERNAL_STR __clang_version__
+
+
+#elif defined(_CRAYC)
+# define COMPILER_ID "Cray"
+# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
+# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
+
+#elif defined(__TI_COMPILER_VERSION__)
+# define COMPILER_ID "TI"
+  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
+# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
+# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
+# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
+
+#elif defined(__CLANG_FUJITSU)
+# define COMPILER_ID "FujitsuClang"
+# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# define COMPILER_VERSION_INTERNAL_STR __clang_version__
+
+
+#elif defined(__FUJITSU)
+# define COMPILER_ID "Fujitsu"
+# if defined(__FCC_version__)
+#   define COMPILER_VERSION __FCC_version__
+# elif defined(__FCC_major__)
+#   define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+#   define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+#   define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# endif
+# if defined(__fcc_version)
+#   define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
+# elif defined(__FCC_VERSION)
+#   define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
+# endif
+
+
+#elif defined(__ghs__)
+# define COMPILER_ID "GHS"
+/* __GHS_VERSION_NUMBER = VVVVRP */
+# ifdef __GHS_VERSION_NUMBER
+# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
+# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER      % 10)
+# endif
+
+#elif defined(__TASKING__)
+# define COMPILER_ID "Tasking"
+  # define COMPILER_VERSION_MAJOR DEC(__VERSION__/1000)
+  # define COMPILER_VERSION_MINOR DEC(__VERSION__ % 100)
+# define COMPILER_VERSION_INTERNAL DEC(__VERSION__)
+
+#elif defined(__ORANGEC__)
+# define COMPILER_ID "OrangeC"
+# define COMPILER_VERSION_MAJOR DEC(__ORANGEC_MAJOR__)
+# define COMPILER_VERSION_MINOR DEC(__ORANGEC_MINOR__)
+# define COMPILER_VERSION_PATCH DEC(__ORANGEC_PATCHLEVEL__)
+
+#elif defined(__SCO_VERSION__)
+# define COMPILER_ID "SCO"
+
+#elif defined(__ARMCC_VERSION) && !defined(__clang__)
+# define COMPILER_ID "ARMCC"
+#if __ARMCC_VERSION >= 1000000
+  /* __ARMCC_VERSION = VRRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
+#else
+  /* __ARMCC_VERSION = VRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
+#endif
+
+
+#elif defined(__clang__) && defined(__apple_build_version__)
+# define COMPILER_ID "AppleClang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
+
+#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
+# define COMPILER_ID "ARMClang"
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION/100   % 100)
+# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
+
+#elif defined(__clang__) && defined(__ti__)
+# define COMPILER_ID "TIClang"
+  # define COMPILER_VERSION_MAJOR DEC(__ti_major__)
+  # define COMPILER_VERSION_MINOR DEC(__ti_minor__)
+  # define COMPILER_VERSION_PATCH DEC(__ti_patchlevel__)
+# define COMPILER_VERSION_INTERNAL DEC(__ti_version__)
+
+#elif defined(__clang__)
+# define COMPILER_ID "Clang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+
+#elif defined(__LCC__) && (defined(__GNUC__) || defined(__GNUG__) || defined(__MCST__))
+# define COMPILER_ID "LCC"
+# define COMPILER_VERSION_MAJOR DEC(__LCC__ / 100)
+# define COMPILER_VERSION_MINOR DEC(__LCC__ % 100)
+# if defined(__LCC_MINOR__)
+#  define COMPILER_VERSION_PATCH DEC(__LCC_MINOR__)
+# endif
+# if defined(__GNUC__) && defined(__GNUC_MINOR__)
+#  define SIMULATE_ID "GNU"
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+#  if defined(__GNUC_PATCHLEVEL__)
+#   define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+#  endif
+# endif
+
+#elif defined(__GNUC__) || defined(__GNUG__)
+# define COMPILER_ID "GNU"
+# if defined(__GNUC__)
+#  define COMPILER_VERSION_MAJOR DEC(__GNUC__)
+# else
+#  define COMPILER_VERSION_MAJOR DEC(__GNUG__)
+# endif
+# if defined(__GNUC_MINOR__)
+#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif defined(_MSC_VER)
+# define COMPILER_ID "MSVC"
+  /* _MSC_VER = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
+# if defined(_MSC_FULL_VER)
+#  if _MSC_VER >= 1400
+    /* _MSC_FULL_VER = VVRRPPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
+#  else
+    /* _MSC_FULL_VER = VVRRPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
+#  endif
+# endif
+# if defined(_MSC_BUILD)
+#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
+# endif
+
+#elif defined(_ADI_COMPILER)
+# define COMPILER_ID "ADSP"
+#if defined(__VERSIONNUM__)
+  /* __VERSIONNUM__ = 0xVVRRPPTT */
+#  define COMPILER_VERSION_MAJOR DEC(__VERSIONNUM__ >> 24 & 0xFF)
+#  define COMPILER_VERSION_MINOR DEC(__VERSIONNUM__ >> 16 & 0xFF)
+#  define COMPILER_VERSION_PATCH DEC(__VERSIONNUM__ >> 8 & 0xFF)
+#  define COMPILER_VERSION_TWEAK DEC(__VERSIONNUM__ & 0xFF)
+#endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# define COMPILER_ID "IAR"
+# if defined(__VER__) && defined(__ICCARM__)
+#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
+#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
+#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
+#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
+#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
+#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
+#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
+#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# endif
+
+
+/* These compilers are either not known or too old to define an
+  identification macro.  Try to identify the platform and guess that
+  it is the native compiler.  */
+#elif defined(__hpux) || defined(__hpua)
+# define COMPILER_ID "HP"
+
+#else /* unknown compiler */
+# define COMPILER_ID ""
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
+#ifdef SIMULATE_ID
+char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
+#endif
+
+#ifdef __QNXNTO__
+char const* qnxnto = "INFO" ":" "qnxnto[]";
+#endif
+
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
+#endif
+
+#define STRINGIFY_HELPER(X) #X
+#define STRINGIFY(X) STRINGIFY_HELPER(X)
+
+/* Identify known platforms by name.  */
+#if defined(__linux) || defined(__linux__) || defined(linux)
+# define PLATFORM_ID "Linux"
+
+#elif defined(__MSYS__)
+# define PLATFORM_ID "MSYS"
+
+#elif defined(__CYGWIN__)
+# define PLATFORM_ID "Cygwin"
+
+#elif defined(__MINGW32__)
+# define PLATFORM_ID "MinGW"
+
+#elif defined(__APPLE__)
+# define PLATFORM_ID "Darwin"
+
+#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
+# define PLATFORM_ID "Windows"
+
+#elif defined(__FreeBSD__) || defined(__FreeBSD)
+# define PLATFORM_ID "FreeBSD"
+
+#elif defined(__NetBSD__) || defined(__NetBSD)
+# define PLATFORM_ID "NetBSD"
+
+#elif defined(__OpenBSD__) || defined(__OPENBSD)
+# define PLATFORM_ID "OpenBSD"
+
+#elif defined(__sun) || defined(sun)
+# define PLATFORM_ID "SunOS"
+
+#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
+# define PLATFORM_ID "AIX"
+
+#elif defined(__hpux) || defined(__hpux__)
+# define PLATFORM_ID "HP-UX"
+
+#elif defined(__HAIKU__)
+# define PLATFORM_ID "Haiku"
+
+#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
+# define PLATFORM_ID "BeOS"
+
+#elif defined(__QNX__) || defined(__QNXNTO__)
+# define PLATFORM_ID "QNX"
+
+#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
+# define PLATFORM_ID "Tru64"
+
+#elif defined(__riscos) || defined(__riscos__)
+# define PLATFORM_ID "RISCos"
+
+#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
+# define PLATFORM_ID "SINIX"
+
+#elif defined(__UNIX_SV__)
+# define PLATFORM_ID "UNIX_SV"
+
+#elif defined(__bsdos__)
+# define PLATFORM_ID "BSDOS"
+
+#elif defined(_MPRAS) || defined(MPRAS)
+# define PLATFORM_ID "MP-RAS"
+
+#elif defined(__osf) || defined(__osf__)
+# define PLATFORM_ID "OSF1"
+
+#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
+# define PLATFORM_ID "SCO_SV"
+
+#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
+# define PLATFORM_ID "ULTRIX"
+
+#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
+# define PLATFORM_ID "Xenix"
+
+#elif defined(__WATCOMC__)
+# if defined(__LINUX__)
+#  define PLATFORM_ID "Linux"
+
+# elif defined(__DOS__)
+#  define PLATFORM_ID "DOS"
+
+# elif defined(__OS2__)
+#  define PLATFORM_ID "OS2"
+
+# elif defined(__WINDOWS__)
+#  define PLATFORM_ID "Windows3x"
+
+# elif defined(__VXWORKS__)
+#  define PLATFORM_ID "VxWorks"
+
+# else /* unknown platform */
+#  define PLATFORM_ID
+# endif
+
+#elif defined(__INTEGRITY)
+# if defined(INT_178B)
+#  define PLATFORM_ID "Integrity178"
+
+# else /* regular Integrity */
+#  define PLATFORM_ID "Integrity"
+# endif
+
+# elif defined(_ADI_COMPILER)
+#  define PLATFORM_ID "ADSP"
+
+#else /* unknown platform */
+# define PLATFORM_ID
+
+#endif
+
+/* For windows compilers MSVC and Intel we can determine
+   the architecture of the compiler being used.  This is because
+   the compilers do not have flags that can change the architecture,
+   but rather depend on which compiler is being used
+*/
+#if defined(_WIN32) && defined(_MSC_VER)
+# if defined(_M_IA64)
+#  define ARCHITECTURE_ID "IA64"
+
+# elif defined(_M_ARM64EC)
+#  define ARCHITECTURE_ID "ARM64EC"
+
+# elif defined(_M_X64) || defined(_M_AMD64)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# elif defined(_M_ARM64)
+#  define ARCHITECTURE_ID "ARM64"
+
+# elif defined(_M_ARM)
+#  if _M_ARM == 4
+#   define ARCHITECTURE_ID "ARMV4I"
+#  elif _M_ARM == 5
+#   define ARCHITECTURE_ID "ARMV5I"
+#  else
+#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
+#  endif
+
+# elif defined(_M_MIPS)
+#  define ARCHITECTURE_ID "MIPS"
+
+# elif defined(_M_SH)
+#  define ARCHITECTURE_ID "SHx"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__WATCOMC__)
+# if defined(_M_I86)
+#  define ARCHITECTURE_ID "I86"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# if defined(__ICCARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__ICCRX__)
+#  define ARCHITECTURE_ID "RX"
+
+# elif defined(__ICCRH850__)
+#  define ARCHITECTURE_ID "RH850"
+
+# elif defined(__ICCRL78__)
+#  define ARCHITECTURE_ID "RL78"
+
+# elif defined(__ICCRISCV__)
+#  define ARCHITECTURE_ID "RISCV"
+
+# elif defined(__ICCAVR__)
+#  define ARCHITECTURE_ID "AVR"
+
+# elif defined(__ICC430__)
+#  define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__ICCV850__)
+#  define ARCHITECTURE_ID "V850"
+
+# elif defined(__ICC8051__)
+#  define ARCHITECTURE_ID "8051"
+
+# elif defined(__ICCSTM8__)
+#  define ARCHITECTURE_ID "STM8"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__ghs__)
+# if defined(__PPC64__)
+#  define ARCHITECTURE_ID "PPC64"
+
+# elif defined(__ppc__)
+#  define ARCHITECTURE_ID "PPC"
+
+# elif defined(__ARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__x86_64__)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(__i386__)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__clang__) && defined(__ti__)
+# if defined(__ARM_ARCH)
+#  define ARCHITECTURE_ID "ARM"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__TI_COMPILER_VERSION__)
+# if defined(__TI_ARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__MSP430__)
+#  define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__TMS320C28XX__)
+#  define ARCHITECTURE_ID "TMS320C28x"
+
+# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
+#  define ARCHITECTURE_ID "TMS320C6x"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+# elif defined(__ADSPSHARC__)
+#  define ARCHITECTURE_ID "SHARC"
+
+# elif defined(__ADSPBLACKFIN__)
+#  define ARCHITECTURE_ID "Blackfin"
+
+#elif defined(__TASKING__)
+
+# if defined(__CTC__) || defined(__CPTC__)
+#  define ARCHITECTURE_ID "TriCore"
+
+# elif defined(__CMCS__)
+#  define ARCHITECTURE_ID "MCS"
+
+# elif defined(__CARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__CARC__)
+#  define ARCHITECTURE_ID "ARC"
+
+# elif defined(__C51__)
+#  define ARCHITECTURE_ID "8051"
+
+# elif defined(__CPCP__)
+#  define ARCHITECTURE_ID "PCP"
+
+# else
+#  define ARCHITECTURE_ID ""
+# endif
+
+#else
+#  define ARCHITECTURE_ID
+#endif
+
+/* Convert integer to decimal digit literals.  */
+#define DEC(n)                   \
+  ('0' + (((n) / 10000000)%10)), \
+  ('0' + (((n) / 1000000)%10)),  \
+  ('0' + (((n) / 100000)%10)),   \
+  ('0' + (((n) / 10000)%10)),    \
+  ('0' + (((n) / 1000)%10)),     \
+  ('0' + (((n) / 100)%10)),      \
+  ('0' + (((n) / 10)%10)),       \
+  ('0' +  ((n) % 10))
+
+/* Convert integer to hex digit literals.  */
+#define HEX(n)             \
+  ('0' + ((n)>>28 & 0xF)), \
+  ('0' + ((n)>>24 & 0xF)), \
+  ('0' + ((n)>>20 & 0xF)), \
+  ('0' + ((n)>>16 & 0xF)), \
+  ('0' + ((n)>>12 & 0xF)), \
+  ('0' + ((n)>>8  & 0xF)), \
+  ('0' + ((n)>>4  & 0xF)), \
+  ('0' + ((n)     & 0xF))
+
+/* Construct a string literal encoding the version number. */
+#ifdef COMPILER_VERSION
+char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
+
+/* Construct a string literal encoding the version number components. */
+#elif defined(COMPILER_VERSION_MAJOR)
+char const info_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
+  COMPILER_VERSION_MAJOR,
+# ifdef COMPILER_VERSION_MINOR
+  '.', COMPILER_VERSION_MINOR,
+#  ifdef COMPILER_VERSION_PATCH
+   '.', COMPILER_VERSION_PATCH,
+#   ifdef COMPILER_VERSION_TWEAK
+    '.', COMPILER_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct a string literal encoding the internal version number. */
+#ifdef COMPILER_VERSION_INTERNAL
+char const info_version_internal[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
+  'i','n','t','e','r','n','a','l','[',
+  COMPILER_VERSION_INTERNAL,']','\0'};
+#elif defined(COMPILER_VERSION_INTERNAL_STR)
+char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
+#endif
+
+/* Construct a string literal encoding the version number components. */
+#ifdef SIMULATE_VERSION_MAJOR
+char const info_simulate_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
+  SIMULATE_VERSION_MAJOR,
+# ifdef SIMULATE_VERSION_MINOR
+  '.', SIMULATE_VERSION_MINOR,
+#  ifdef SIMULATE_VERSION_PATCH
+   '.', SIMULATE_VERSION_PATCH,
+#   ifdef SIMULATE_VERSION_TWEAK
+    '.', SIMULATE_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
+char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
+
+
+
+#define CXX_STD_98 199711L
+#define CXX_STD_11 201103L
+#define CXX_STD_14 201402L
+#define CXX_STD_17 201703L
+#define CXX_STD_20 202002L
+#define CXX_STD_23 202302L
+
+#if defined(__INTEL_COMPILER) && defined(_MSVC_LANG)
+#  if _MSVC_LANG > CXX_STD_17
+#    define CXX_STD _MSVC_LANG
+#  elif _MSVC_LANG == CXX_STD_17 && defined(__cpp_aggregate_paren_init)
+#    define CXX_STD CXX_STD_20
+#  elif _MSVC_LANG > CXX_STD_14 && __cplusplus > CXX_STD_17
+#    define CXX_STD CXX_STD_20
+#  elif _MSVC_LANG > CXX_STD_14
+#    define CXX_STD CXX_STD_17
+#  elif defined(__INTEL_CXX11_MODE__) && defined(__cpp_aggregate_nsdmi)
+#    define CXX_STD CXX_STD_14
+#  elif defined(__INTEL_CXX11_MODE__)
+#    define CXX_STD CXX_STD_11
+#  else
+#    define CXX_STD CXX_STD_98
+#  endif
+#elif defined(_MSC_VER) && defined(_MSVC_LANG)
+#  if _MSVC_LANG > __cplusplus
+#    define CXX_STD _MSVC_LANG
+#  else
+#    define CXX_STD __cplusplus
+#  endif
+#elif defined(__NVCOMPILER)
+#  if __cplusplus == CXX_STD_17 && defined(__cpp_aggregate_paren_init)
+#    define CXX_STD CXX_STD_20
+#  else
+#    define CXX_STD __cplusplus
+#  endif
+#elif defined(__INTEL_COMPILER) || defined(__PGI)
+#  if __cplusplus == CXX_STD_11 && defined(__cpp_namespace_attributes)
+#    define CXX_STD CXX_STD_17
+#  elif __cplusplus == CXX_STD_11 && defined(__cpp_aggregate_nsdmi)
+#    define CXX_STD CXX_STD_14
+#  else
+#    define CXX_STD __cplusplus
+#  endif
+#elif (defined(__IBMCPP__) || defined(__ibmxl__)) && defined(__linux__)
+#  if __cplusplus == CXX_STD_11 && defined(__cpp_aggregate_nsdmi)
+#    define CXX_STD CXX_STD_14
+#  else
+#    define CXX_STD __cplusplus
+#  endif
+#elif __cplusplus == 1 && defined(__GXX_EXPERIMENTAL_CXX0X__)
+#  define CXX_STD CXX_STD_11
+#else
+#  define CXX_STD __cplusplus
+#endif
+
+const char* info_language_standard_default = "INFO" ":" "standard_default["
+#if CXX_STD > CXX_STD_23
+  "26"
+#elif CXX_STD > CXX_STD_20
+  "23"
+#elif CXX_STD > CXX_STD_17
+  "20"
+#elif CXX_STD > CXX_STD_14
+  "17"
+#elif CXX_STD > CXX_STD_11
+  "14"
+#elif CXX_STD >= CXX_STD_11
+  "11"
+#else
+  "98"
+#endif
+"]";
+
+const char* info_language_extensions_default = "INFO" ":" "extensions_default["
+#if (defined(__clang__) || defined(__GNUC__) || defined(__xlC__) ||           \
+     defined(__TI_COMPILER_VERSION__)) &&                                     \
+  !defined(__STRICT_ANSI__)
+  "ON"
+#else
+  "OFF"
+#endif
+"]";
+
+/*--------------------------------------------------------------------------*/
+
+int main(int argc, char* argv[])
+{
+  int require = 0;
+  require += info_compiler[argc];
+  require += info_platform[argc];
+  require += info_arch[argc];
+#ifdef COMPILER_VERSION_MAJOR
+  require += info_version[argc];
+#endif
+#ifdef COMPILER_VERSION_INTERNAL
+  require += info_version_internal[argc];
+#endif
+#ifdef SIMULATE_ID
+  require += info_simulate[argc];
+#endif
+#ifdef SIMULATE_VERSION_MAJOR
+  require += info_simulate_version[argc];
+#endif
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+  require += info_cray[argc];
+#endif
+  require += info_language_standard_default[argc];
+  require += info_language_extensions_default[argc];
+  (void)argv;
+  return require;
+}
diff --git a/build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out b/build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out
new file mode 100755
index 0000000000000000000000000000000000000000..c8ced32cf082708045baa23211fbf858c298928d
GIT binary patch
literal 16096
zcmeHOeQX>@6`woj!=X-macg3d(k!8=99nPAj^nz8kaO&_*T^4f;*@}ER%_qdcj7+G
z-X66pNQ2TsjBC`;3i?Npq6&ckRRRf$sMO%Js8y?i5($YQ0Wu#EK}uUAK4e1V<Gq>p
z*6ZaQ1oRIi_F3LH@Ap1t_RZ|x?C#9N$-eGrBqErq#0LdRiI_qXq&Ryw6@Vo~yVwlJ
zcZ*xa29VcDOz9JffmYF_=xSa~colH;YrsMUeyf6^21VRL<mk5+rLjRk%mtkX`mIL=
z$wB^Ws(?A`z4|nC2GZow<ByRabH5)pWwA-wFCJLU4a&=5;_Qc_JOy3ZLw6`5K2P;A
z=X_#L@V}k%8RT&a!#wDhCchx>B0uI>2h!2YZt6d&?=bnjuE{VW$nR3HV9xd32Y%GG
zWN~B0-F$@VTdN;plz--wUa>cu8EtFbn@u%kGx^d~(^Pv~Q(LQEEa)w=Vr-WN|2U?4
z295~`GmjXhQAAHFnd71E7Sf~r3)WM^-*Yd|tslBNKJntNUw+`kwO7yv+l@YGgM{&T
zh@gyRtP^ciK0X5_8r#4x+CRxjV2uO%)m6}S0;W~K%{B1+8u-nC@2U_-m?mU&%q+T=
z<C-}ulLusM$}-0@c`KWF$QG!^{I-dnzTQKfW{cjU@Au04T7}s=)NiJ2$DYU(UE3Mz
z@5~nR_K-E2wIS9-u8^nbrZTN)h#8E?Kh;wakg>fyUP{|Dn=tD*{t)}_nJ+<_qj1Ml
z#Md!jKiXD>FVXeQ_yPs2PAEO&EXM-4rYXCI0PYa31@O-i-Wb52AUqzxpC$a#K_Lmp
z4vqz;1s{%MjOmIG=dq2tMIVmimTAd{%lj=WLLO!y%s`ldFau!*!VH8N2s7|Mk%2$e
z-geD6b+y`<UH|jFLKu(EyV3Fm<J6C;Uy|)B?|%m1^6sy~v36%dpnZAwIgrL{cXkOW
zH^0$4bMa%w%x{cSzgs*!lx&`Fe$|*e@EQat*B8O`&*OUS&PQZCz|R9>%&mVO**!~c
zJyd-^mZ9oR<%QavC(-aF;$VM9+VB57vOUYj%%XAr&4b4Ir79!xvT<?Qy#)g7rU2FD
z1=TM0$M&8)&<|=+y7QQE>Od5W#>{26#+W^@0fZ}i%H{Hv6dYcbVIm{o>(!6`e|Qj-
zSU3iLGoQX{%#;>hNnXch8ngAU!IS!I@~ZKa5xG$NoTxoFA4y&Z{P{KTZ&t!pfVui-
zw?LYoTNm@9JW|OTqPvyw+2r*R=r(Ms>{G87v8f@283;2FW+2Q!n1L_@VFtnsgc%4k
z5N06E!2fdw@cY+|sCS@y@ZPaPZZea#oniPYIkMV%mEQcM?G!VG{BT@S^FCb_;$9&>
zBBaM;)^f)SPHwmlzpfH!Ib-QzD#Lfee9CfC@WF4~DrMc_=DSH_Pq}s;YbkoV!2#K-
z$d0P_H$wC9d(_Zd<?;i-Q^4`fg9{v9SBR0ta`|cC_$?MG^3V|xnTkbr)NHJN96pF4
zj%yAY!Tt_3=-Md1<lPR%R`_3hvs{+ImRR?eh7Z-=^kDT#ad7)R@7s4fenyo3Snnma
zLl6jKy72!4i2Dr$l3QY*jdpI{5IqYuBM?%UfiMGM2Eq)483;2FW+2Q!n1L_@VFupb
z4DfnIUZ2Qo0Oi9AR8_;((fY;BB>$AwIlhZzUI)2@WPXI%PBO2D#OEF)*8gR>TtNBT
zw3v|B2&VC&4G7mIB3&Z=JCrC+6TgXg1Mzy|%*aj5(>lbBq=-{R+>UlSaaimriR0Zy
zGTZ&VtlA6a5?Ur%EhdK#+$(zN36GcZ{1)ka{zfv#qwsGZ<MrYHWkg<=s%a_^uRG;+
zro66{*OB&gcHXNs9vdy?-I4|m`tXF`)K-#W%ZZj&J>I&9;2Sp#yJ4O9V>xJr{SpDq
zW7MG<8Q}WjO7_@qQL#l#(zqpap%H#IfbS!muLHL4g+fF$i1vg+uzg6l8ao0{_dKp8
z2!~I>Ki13F72~I&5D_;EzD^kbIut6k|D3dsiG-#sTNHx`mF+J89)XqIr{6<{K2|CI
zucSR(ErId!d+E2;TZhkKu1WiMde;%-F-S-q3qIZixaO0&cwFM!gh()=crV~FvCYdf
zYYzin7p)b1zhV4-vJb`?lkwSVg*$+6jcyY>u37Ui;!v~D6hfD&_=3c@iQxL{rwI?P
zr+xwO7>tudf+H*b0N`~n9uhR(<U1r#y-0ClWY7153lxXP8%O&E#o0smUHQ%kl(;_y
z&nsyE2E}g-#IK2Zr^=xvzXR}Hs}Lo00A3e`yKLZk=>dEz^p}=UcHDk(bj)#^^#ZKG
zw?;FjYfT6Mif(CqTptrFtMyGcXO7`|{UTVV3g$$%FluGZlv{9$rd65}_>M7ayLL*C
zSGK^N0vXeC9BbON^R6>3#vLnXo2gPRHw`X6$plMxm1$?c^>MrN`0-A9li8cn$0jF*
z`O&`SmP~%Uz;7-gPWO?H{-l{4=rUm+LDxqHI{JG%0ftwfX3`+7(RD<aJ$-|RI{M7P
z?(U<>A#<qXP+t-}g4-Mtyqn=)?O?D|mTL)lmJkI6wVeTk)q5MvRIy;D;q@r)d*~em
zt5ha$mWp;t$W!5Wt4hjR`H7M>VVnQ_-c&#y$%o(YLS>`HB2`SgG+?6zr9+1I0tR2v
z-eA|o>a8ALN^paR>?_q&eE%ziUYyRk)+lh-Q9RA1Odj@qObR_;aBY1eU(zR?!ldoE
z(>`dllz~k<nG``ChkBcEP)hT(RZI&#HJyhl6n7n^p%>Sy1QT?Qowd+G=s2W=KABYq
zeWCyb7ji0e9G75Oko~9IX&Q;?6!^2G{MC?D9$bdtRxUFJ&B5;1A^Spy-pIiauW)((
z+Yrvr;MU;1<qz(+<M|l}Mq59<7X+L`!R0S$t$k&r_U3skw?V=0AKYJt@74Xp_hZKJ
z_t@{x^8w}>8xjxte;Dw;!W@j-&+|^^TtCk{z55!)vw-8All^&K%KUM%!!}~>*q`T<
z8NhG~!~Q(aWqulTehTLQ6QIO7Cj0Zek~z=Ux&3U%`~>*poRwvsw=$1Y<-zuIo93W^
zIc0yIM>FSnG}j+I|1X0to)hc6-xd0O;pYc1kreE|uK?=z*T|1KiR8WVv&Hx`0slBD
zn6n)RV43;10{#h7F#lqp!`P4GeJ9}0^BU&-e8u*`^Z!2ibN+=!mc(Brkr}}(iXTD=
zo5=pJlL7O)JWEvw*8gLG{r*ej&-}@NKleYwKZ63SY4!F+@_d;0V+QS6X8v37t@Ziy
z{ClYhKp?hL(u&OZTcE(PM~@LJ^Iup$i!@LDhvOfK{kR{$1{j*KKR;K_??r1N67slm
zV1MRIpz`~B4sqqvzTzrN?8opj6cFS3dEVDf{y}>>9d;L003b%@9?t%EdWb5pzn}Bi
z@tdY8Am0b^I>u)eZV%u8HUY+M_xmUCV=B;nf#6)P(&C)6vi}+UVF9WMI0QuT55M$T
ASpWb4

literal 0
HcmV?d00001

diff --git a/build2/CMakeFiles/CMakeConfigureLog.yaml b/build2/CMakeFiles/CMakeConfigureLog.yaml
new file mode 100644
index 000000000..0c5487522
--- /dev/null
+++ b/build2/CMakeFiles/CMakeConfigureLog.yaml
@@ -0,0 +1,294 @@
+
+---
+events:
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineSystem.cmake:205 (message)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      The system is: Linux - 6.11.0-1018-azure - x86_64
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerId.cmake:17 (message)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerId.cmake:64 (__determine_compiler_id_test)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCXXCompiler.cmake:126 (CMAKE_DETERMINE_COMPILER_ID)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      Compiling the CXX compiler identification source file "CMakeCXXCompilerId.cpp" succeeded.
+      Compiler: /usr/bin/c++ 
+      Build flags: 
+      Id flags:  
+      
+      The output was:
+      0
+      
+      
+      Compilation of the CXX compiler identification source "CMakeCXXCompilerId.cpp" produced "a.out"
+      
+      The CXX compiler identification is GNU, found in:
+        /home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out
+      
+  -
+    kind: "try_compile-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:74 (try_compile)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
+      - "CMakeLists.txt:5 (project)"
+    checks:
+      - "Detecting CXX compiler ABI info"
+    directories:
+      source: "/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3"
+      binary: "/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3"
+    cmakeVariables:
+      CMAKE_CXX_FLAGS: ""
+      CMAKE_CXX_FLAGS_DEBUG: "-g"
+      CMAKE_CXX_SCAN_FOR_MODULES: "OFF"
+      CMAKE_EXE_LINKER_FLAGS: ""
+    buildResult:
+      variable: "CMAKE_CXX_ABI_COMPILED"
+      cached: true
+      stdout: |
+        Change Dir: '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3'
+        
+        Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_68918/fast
+        /usr/bin/gmake  -f CMakeFiles/cmTC_68918.dir/build.make CMakeFiles/cmTC_68918.dir/build
+        gmake[1]: Entering directory '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3'
+        Building CXX object CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o
+        /usr/bin/c++   -v -o CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -c /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp
+        Using built-in specs.
+        COLLECT_GCC=/usr/bin/c++
+        OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
+        OFFLOAD_TARGET_DEFAULT=1
+        Target: x86_64-linux-gnu
+        Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
+        Thread model: posix
+        Supported LTO compression algorithms: zlib zstd
+        gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) 
+        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/'
+         /usr/libexec/gcc/x86_64-linux-gnu/13/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -D_GNU_SOURCE /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp -quiet -dumpdir CMakeFiles/cmTC_68918.dir/ -dumpbase CMakeCXXCompilerABI.cpp.cpp -dumpbase-ext .cpp -mtune=generic -march=x86-64 -version -fasynchronous-unwind-tables -fstack-protector-strong -Wformat -Wformat-security -fstack-clash-protection -fcf-protection -o /tmp/ccqGcDxl.s
+        GNU C++17 (Ubuntu 13.3.0-6ubuntu2~24.04) version 13.3.0 (x86_64-linux-gnu)
+        	compiled by GNU C version 13.3.0, GMP version 6.3.0, MPFR version 4.2.1, MPC version 1.3.1, isl version isl-0.26-GMP
+        
+        GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
+        ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/13"
+        ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"
+        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed/x86_64-linux-gnu"
+        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed"
+        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/../../../../x86_64-linux-gnu/include"
+        #include "..." search starts here:
+        #include <...> search starts here:
+         /usr/include/c++/13
+         /usr/include/x86_64-linux-gnu/c++/13
+         /usr/include/c++/13/backward
+         /usr/lib/gcc/x86_64-linux-gnu/13/include
+         /usr/local/include
+         /usr/include/x86_64-linux-gnu
+         /usr/include
+        End of search list.
+        Compiler executable checksum: c81c05345ce537099dafd5580045814a
+        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/'
+         as -v --64 -o CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o /tmp/ccqGcDxl.s
+        GNU assembler version 2.42 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.42
+        COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/
+        LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/
+        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.'
+        Linking CXX executable cmTC_68918
+        /usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_68918.dir/link.txt --verbose=1
+        Using built-in specs.
+        COLLECT_GCC=/usr/bin/c++
+        COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper
+        OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
+        OFFLOAD_TARGET_DEFAULT=1
+        Target: x86_64-linux-gnu
+        Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
+        Thread model: posix
+        Supported LTO compression algorithms: zlib zstd
+        gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) 
+        COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/
+        LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/
+        COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_68918' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_68918.'
+         /usr/libexec/gcc/x86_64-linux-gnu/13/collect2 -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/ccE7OB0z.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_68918 /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o
+        collect2 version 13.3.0
+        /usr/bin/ld -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/ccE7OB0z.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_68918 /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o
+        GNU ld (GNU Binutils for Ubuntu) 2.42
+        COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_68918' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_68918.'
+        /usr/bin/c++  -v -Wl,-v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -o cmTC_68918
+        gmake[1]: Leaving directory '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3'
+        
+      exitCode: 0
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:182 (message)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      Parsed CXX implicit include dir info: rv=done
+        found start of include info
+        found start of implicit include info
+          add: [/usr/include/c++/13]
+          add: [/usr/include/x86_64-linux-gnu/c++/13]
+          add: [/usr/include/c++/13/backward]
+          add: [/usr/lib/gcc/x86_64-linux-gnu/13/include]
+          add: [/usr/local/include]
+          add: [/usr/include/x86_64-linux-gnu]
+          add: [/usr/include]
+        end of search list found
+        collapse include dir [/usr/include/c++/13] ==> [/usr/include/c++/13]
+        collapse include dir [/usr/include/x86_64-linux-gnu/c++/13] ==> [/usr/include/x86_64-linux-gnu/c++/13]
+        collapse include dir [/usr/include/c++/13/backward] ==> [/usr/include/c++/13/backward]
+        collapse include dir [/usr/lib/gcc/x86_64-linux-gnu/13/include] ==> [/usr/lib/gcc/x86_64-linux-gnu/13/include]
+        collapse include dir [/usr/local/include] ==> [/usr/local/include]
+        collapse include dir [/usr/include/x86_64-linux-gnu] ==> [/usr/include/x86_64-linux-gnu]
+        collapse include dir [/usr/include] ==> [/usr/include]
+        implicit include dirs: [/usr/include/c++/13;/usr/include/x86_64-linux-gnu/c++/13;/usr/include/c++/13/backward;/usr/lib/gcc/x86_64-linux-gnu/13/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include]
+      
+      
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:218 (message)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      Parsed CXX implicit link information:
+        link line regex: [^( *|.*[/\\])(ld[0-9]*(\\.[a-z]+)?|CMAKE_LINK_STARTFILE-NOTFOUND|([^/\\]+-)?ld|collect2)[^/\\]*( |$)]
+        linker tool regex: [^[ 	]*(->|")?[ 	]*(([^"]*[/\\])?(ld[0-9]*(\\.[a-z]+)?))("|,| |$)]
+        ignore line: [Change Dir: '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3']
+        ignore line: []
+        ignore line: [Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_68918/fast]
+        ignore line: [/usr/bin/gmake  -f CMakeFiles/cmTC_68918.dir/build.make CMakeFiles/cmTC_68918.dir/build]
+        ignore line: [gmake[1]: Entering directory '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3']
+        ignore line: [Building CXX object CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o]
+        ignore line: [/usr/bin/c++   -v -o CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -c /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp]
+        ignore line: [Using built-in specs.]
+        ignore line: [COLLECT_GCC=/usr/bin/c++]
+        ignore line: [OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa]
+        ignore line: [OFFLOAD_TARGET_DEFAULT=1]
+        ignore line: [Target: x86_64-linux-gnu]
+        ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c ada c++ go d fortran objc obj-c++ m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32 m64 mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2]
+        ignore line: [Thread model: posix]
+        ignore line: [Supported LTO compression algorithms: zlib zstd]
+        ignore line: [gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) ]
+        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/']
+        ignore line: [ /usr/libexec/gcc/x86_64-linux-gnu/13/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -D_GNU_SOURCE /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp -quiet -dumpdir CMakeFiles/cmTC_68918.dir/ -dumpbase CMakeCXXCompilerABI.cpp.cpp -dumpbase-ext .cpp -mtune=generic -march=x86-64 -version -fasynchronous-unwind-tables -fstack-protector-strong -Wformat -Wformat-security -fstack-clash-protection -fcf-protection -o /tmp/ccqGcDxl.s]
+        ignore line: [GNU C++17 (Ubuntu 13.3.0-6ubuntu2~24.04) version 13.3.0 (x86_64-linux-gnu)]
+        ignore line: [	compiled by GNU C version 13.3.0  GMP version 6.3.0  MPFR version 4.2.1  MPC version 1.3.1  isl version isl-0.26-GMP]
+        ignore line: []
+        ignore line: [GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072]
+        ignore line: [ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/13"]
+        ignore line: [ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"]
+        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed/x86_64-linux-gnu"]
+        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed"]
+        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/../../../../x86_64-linux-gnu/include"]
+        ignore line: [#include "..." search starts here:]
+        ignore line: [#include <...> search starts here:]
+        ignore line: [ /usr/include/c++/13]
+        ignore line: [ /usr/include/x86_64-linux-gnu/c++/13]
+        ignore line: [ /usr/include/c++/13/backward]
+        ignore line: [ /usr/lib/gcc/x86_64-linux-gnu/13/include]
+        ignore line: [ /usr/local/include]
+        ignore line: [ /usr/include/x86_64-linux-gnu]
+        ignore line: [ /usr/include]
+        ignore line: [End of search list.]
+        ignore line: [Compiler executable checksum: c81c05345ce537099dafd5580045814a]
+        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/']
+        ignore line: [ as -v --64 -o CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o /tmp/ccqGcDxl.s]
+        ignore line: [GNU assembler version 2.42 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.42]
+        ignore line: [COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/]
+        ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/]
+        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.']
+        ignore line: [Linking CXX executable cmTC_68918]
+        ignore line: [/usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_68918.dir/link.txt --verbose=1]
+        ignore line: [Using built-in specs.]
+        ignore line: [COLLECT_GCC=/usr/bin/c++]
+        ignore line: [COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper]
+        ignore line: [OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa]
+        ignore line: [OFFLOAD_TARGET_DEFAULT=1]
+        ignore line: [Target: x86_64-linux-gnu]
+        ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c ada c++ go d fortran objc obj-c++ m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32 m64 mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2]
+        ignore line: [Thread model: posix]
+        ignore line: [Supported LTO compression algorithms: zlib zstd]
+        ignore line: [gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) ]
+        ignore line: [COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/]
+        ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/]
+        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_68918' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_68918.']
+        link line: [ /usr/libexec/gcc/x86_64-linux-gnu/13/collect2 -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/ccE7OB0z.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_68918 /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
+          arg [/usr/libexec/gcc/x86_64-linux-gnu/13/collect2] ==> ignore
+          arg [-plugin] ==> ignore
+          arg [/usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so] ==> ignore
+          arg [-plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper] ==> ignore
+          arg [-plugin-opt=-fresolution=/tmp/ccE7OB0z.res] ==> ignore
+          arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
+          arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
+          arg [-plugin-opt=-pass-through=-lc] ==> ignore
+          arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
+          arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
+          arg [--build-id] ==> ignore
+          arg [--eh-frame-hdr] ==> ignore
+          arg [-m] ==> ignore
+          arg [elf_x86_64] ==> ignore
+          arg [--hash-style=gnu] ==> ignore
+          arg [--as-needed] ==> ignore
+          arg [-dynamic-linker] ==> ignore
+          arg [/lib64/ld-linux-x86-64.so.2] ==> ignore
+          arg [-pie] ==> ignore
+          arg [-znow] ==> ignore
+          arg [-zrelro] ==> ignore
+          arg [-o] ==> ignore
+          arg [cmTC_68918] ==> ignore
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o]
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o]
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o]
+          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13]
+          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu]
+          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib]
+          arg [-L/lib/x86_64-linux-gnu] ==> dir [/lib/x86_64-linux-gnu]
+          arg [-L/lib/../lib] ==> dir [/lib/../lib]
+          arg [-L/usr/lib/x86_64-linux-gnu] ==> dir [/usr/lib/x86_64-linux-gnu]
+          arg [-L/usr/lib/../lib] ==> dir [/usr/lib/../lib]
+          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../..] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../..]
+          arg [-v] ==> ignore
+          arg [CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o] ==> ignore
+          arg [-lstdc++] ==> lib [stdc++]
+          arg [-lm] ==> lib [m]
+          arg [-lgcc_s] ==> lib [gcc_s]
+          arg [-lgcc] ==> lib [gcc]
+          arg [-lc] ==> lib [c]
+          arg [-lgcc_s] ==> lib [gcc_s]
+          arg [-lgcc] ==> lib [gcc]
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o]
+          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
+        ignore line: [collect2 version 13.3.0]
+        ignore line: [/usr/bin/ld -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/ccE7OB0z.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_68918 /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
+        linker tool for 'CXX': /usr/bin/ld
+        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o] ==> [/usr/lib/x86_64-linux-gnu/Scrt1.o]
+        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o] ==> [/usr/lib/x86_64-linux-gnu/crti.o]
+        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o] ==> [/usr/lib/x86_64-linux-gnu/crtn.o]
+        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13] ==> [/usr/lib/gcc/x86_64-linux-gnu/13]
+        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
+        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib] ==> [/usr/lib]
+        collapse library dir [/lib/x86_64-linux-gnu] ==> [/lib/x86_64-linux-gnu]
+        collapse library dir [/lib/../lib] ==> [/lib]
+        collapse library dir [/usr/lib/x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
+        collapse library dir [/usr/lib/../lib] ==> [/usr/lib]
+        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../..] ==> [/usr/lib]
+        implicit libs: [stdc++;m;gcc_s;gcc;c;gcc_s;gcc]
+        implicit objs: [/usr/lib/x86_64-linux-gnu/Scrt1.o;/usr/lib/x86_64-linux-gnu/crti.o;/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o;/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o;/usr/lib/x86_64-linux-gnu/crtn.o]
+        implicit dirs: [/usr/lib/gcc/x86_64-linux-gnu/13;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib]
+        implicit fwks: []
+      
+      
+  -
+    kind: "message-v1"
+    backtrace:
+      - "/usr/local/share/cmake-3.31/Modules/Internal/CMakeDetermineLinkerId.cmake:40 (message)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:255 (cmake_determine_linker_id)"
+      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
+      - "CMakeLists.txt:5 (project)"
+    message: |
+      Running the CXX compiler's linker: "/usr/bin/ld" "-v"
+      GNU ld (GNU Binutils for Ubuntu) 2.42
+...
diff --git a/build2/CMakeFiles/cmake.check_cache b/build2/CMakeFiles/cmake.check_cache
new file mode 100644
index 000000000..3dccd7317
--- /dev/null
+++ b/build2/CMakeFiles/cmake.check_cache
@@ -0,0 +1 @@
+# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/build2/include/mscclpp/version.hpp b/build2/include/mscclpp/version.hpp
new file mode 100644
index 000000000..0ec54ad62
--- /dev/null
+++ b/build2/include/mscclpp/version.hpp
@@ -0,0 +1,13 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_VERSION_HPP_
+#define MSCCLPP_VERSION_HPP_
+
+#define MSCCLPP_MAJOR 0
+#define MSCCLPP_MINOR 8
+#define MSCCLPP_PATCH 0
+#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
+#define MSCCLPP_GIT_COMMIT "305d15717edc"
+
+#endif  // MSCCLPP_VERSION_HPP_
diff --git a/test/framework.cc b/test/framework.cc
index f072a075b..aff10d293 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -12,77 +12,77 @@ namespace mscclpp {
 namespace test {
 
 // Global state
-static int g_mpi_rank = 0;
-static int g_mpi_size = 1;
-static bool g_mpi_initialized = false;
-static bool g_current_test_passed = true;
-static std::string g_current_test_failure_message;
+static int gMpiRank = 0;
+static int gMpiSize = 1;
+static bool gMpiInitialized = false;
+static bool gCurrentTestPassed = true;
+static std::string gCurrentTestFailureMessage;
 
 namespace utils {
 
 // Internal MPI helper functions (not exposed in header)
 void initializeMPI(int argc, char* argv[]) {
-  if (g_mpi_initialized) return;
+  if (gMpiInitialized) return;
 
   MPI_Init(&argc, &argv);
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_mpi_rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &g_mpi_size);
-  g_mpi_initialized = true;
+  MPI_Comm_rank(MPI_COMM_WORLD, &gMpiRank);
+  MPI_Comm_size(MPI_COMM_WORLD, &gMpiSize);
+  gMpiInitialized = true;
 }
 
 static void finalizeMPI() {
-  if (!g_mpi_initialized) return;
+  if (!gMpiInitialized) return;
 
   MPI_Finalize();
-  g_mpi_initialized = false;
+  gMpiInitialized = false;
 }
 
-static bool isMainProcess() { return g_mpi_rank == 0; }
+static bool isMainProcess() { return gMpiRank == 0; }
 
 // Public utility functions for test output
-bool isMainRank() { return g_mpi_rank == 0; }
+bool isMainRank() { return gMpiRank == 0; }
 
-int getMPIRank() { return g_mpi_rank; }
+int getMPIRank() { return gMpiRank; }
 
-int getMPISize() { return g_mpi_size; }
+int getMPISize() { return gMpiSize; }
 
 void cleanupMPI() { finalizeMPI(); }
 
 void reportFailure(const char* file, int line, const std::string& message) {
-  g_current_test_passed = false;
+  gCurrentTestPassed = false;
   std::ostringstream oss;
   oss << file << ":" << line << ": " << message;
-  if (!g_current_test_failure_message.empty()) {
-    g_current_test_failure_message += "\n";
+  if (!gCurrentTestFailureMessage.empty()) {
+    gCurrentTestFailureMessage += "\n";
   }
-  g_current_test_failure_message += oss.str();
+  gCurrentTestFailureMessage += oss.str();
   std::cerr << oss.str() << std::endl;
 }
 
 void reportSuccess() {
-  g_current_test_passed = true;
-  g_current_test_failure_message.clear();
+  gCurrentTestPassed = true;
+  gCurrentTestFailureMessage.clear();
 }
 
 // Timer implementation
-Timer::Timer() : is_running_(false) {}
+Timer::Timer() : isRunning_(false) {}
 
 void Timer::start() {
-  start_time_ = std::chrono::high_resolution_clock::now();
-  is_running_ = true;
+  startTime_ = std::chrono::high_resolution_clock::now();
+  isRunning_ = true;
 }
 
 void Timer::stop() {
-  end_time_ = std::chrono::high_resolution_clock::now();
-  is_running_ = false;
+  endTime_ = std::chrono::high_resolution_clock::now();
+  isRunning_ = false;
 }
 
 double Timer::elapsedMicroseconds() const {
-  if (is_running_) {
+  if (isRunning_) {
     auto now = std::chrono::high_resolution_clock::now();
-    return std::chrono::duration_cast<std::chrono::microseconds>(now - start_time_).count();
+    return std::chrono::duration_cast<std::chrono::microseconds>(now - startTime_).count();
   }
-  return std::chrono::duration_cast<std::chrono::microseconds>(end_time_ - start_time_).count();
+  return std::chrono::duration_cast<std::chrono::microseconds>(endTime_ - startTime_).count();
 }
 
 double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; }
@@ -145,7 +145,7 @@ int runMultipleTests(
     // finalizeMPI();
 
   } catch (const std::exception& e) {
-    if (g_mpi_rank == 0) {
+    if (gMpiRank == 0) {
       std::cerr << "Error: " << e.what() << std::endl;
     }
     finalizeMPI();
@@ -171,8 +171,8 @@ TestRegistry& TestRegistry::instance() {
 
 void TestRegistry::registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory) {
   TestInfoInternal info;
-  info.suite_name = test_suite;
-  info.test_name = test_name;
+  info.suiteName = test_suite;
+  info.testName = test_name;
   info.factory = factory;
   tests_.push_back(info);
 }
@@ -186,7 +186,7 @@ void TestRegistry::initGoogleTest(int* argc, char** argv) {
 
 int TestRegistry::runAllTests(int argc, char* argv[]) {
   // Initialize MPI if not already initialized
-  if (!g_mpi_initialized) {
+  if (!gMpiInitialized) {
     utils::initializeMPI(argc, argv);
   }
 
@@ -207,7 +207,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     try {
       env->SetUp();
     } catch (const std::exception& e) {
-      if (g_mpi_rank == 0) {
+      if (gMpiRank == 0) {
         std::cerr << "Failed to set up test environment: " << e.what() << std::endl;
       }
       return 1;
@@ -221,7 +221,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   // Count tests to run
   int total_to_run = 0;
   for (const auto& test_info : tests_) {
-    std::string full_name = test_info.suite_name + "." + test_info.test_name;
+    std::string full_name = test_info.suiteName + "." + test_info.testName;
     if (!filter.empty() && full_name.find(filter) == std::string::npos) {
       skipped++;
       continue;
@@ -229,7 +229,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     total_to_run++;
   }
 
-  if (g_mpi_rank == 0) {
+  if (gMpiRank == 0) {
     std::cout << "[==========] Running " << total_to_run << " tests";
     if (skipped > 0) {
       std::cout << " (" << skipped << " skipped by filter)";
@@ -238,22 +238,22 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   }
 
   for (const auto& test_info : tests_) {
-    std::string full_name = test_info.suite_name + "." + test_info.test_name;
+    std::string full_name = test_info.suiteName + "." + test_info.testName;
 
     // Apply filter
     if (!filter.empty() && full_name.find(filter) == std::string::npos) {
       continue;
     }
 
-    g_current_test_passed = true;
-    g_current_test_failure_message.clear();
+    gCurrentTestPassed = true;
+    gCurrentTestFailureMessage.clear();
 
-    if (g_mpi_rank == 0) {
+    if (gMpiRank == 0) {
       std::cout << "[ RUN      ] " << full_name << std::endl;
     }
 
     // Set current test info for UnitTest::GetInstance()->current_test_info()
-    TestInfo current_info(test_info.suite_name, test_info.test_name);
+    TestInfo current_info(test_info.suiteName, test_info.testName);
     UnitTest::GetInstance()->set_current_test_info(&current_info);
 
     TestCase* test_case = nullptr;
@@ -263,14 +263,14 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       test_case->TestBody();
       test_case->TearDown();
     } catch (const std::exception& e) {
-      g_current_test_passed = false;
-      if (g_current_test_failure_message.empty()) {
-        g_current_test_failure_message = e.what();
+      gCurrentTestPassed = false;
+      if (gCurrentTestFailureMessage.empty()) {
+        gCurrentTestFailureMessage = e.what();
       }
     } catch (...) {
-      g_current_test_passed = false;
-      if (g_current_test_failure_message.empty()) {
-        g_current_test_failure_message = "Unknown exception";
+      gCurrentTestPassed = false;
+      if (gCurrentTestFailureMessage.empty()) {
+        gCurrentTestFailureMessage = "Unknown exception";
       }
     }
 
@@ -280,15 +280,15 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     UnitTest::GetInstance()->set_current_test_info(nullptr);
 
     // Synchronize test status across all MPI processes
-    int local_passed = g_current_test_passed ? 1 : 0;
+    int local_passed = gCurrentTestPassed ? 1 : 0;
     int global_passed = 1;
-    if (g_mpi_initialized) {
+    if (gMpiInitialized) {
       MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
     } else {
       global_passed = local_passed;
     }
 
-    if (g_mpi_rank == 0) {
+    if (gMpiRank == 0) {
       if (global_passed) {
         std::cout << "[       OK ] " << full_name << std::endl;
         passed++;
@@ -299,7 +299,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     }
   }
 
-  if (g_mpi_rank == 0) {
+  if (gMpiRank == 0) {
     std::cout << "[==========] " << total_to_run << " tests ran.\n";
     if (passed > 0) {
       std::cout << "[  PASSED  ] " << passed << " tests.\n";
@@ -314,7 +314,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     try {
       (*it)->TearDown();
     } catch (const std::exception& e) {
-      if (g_mpi_rank == 0) {
+      if (gMpiRank == 0) {
         std::cerr << "Failed to tear down test environment: " << e.what() << std::endl;
       }
     }
diff --git a/test/framework.hpp b/test/framework.hpp
index 34ef40841..6322a350d 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -24,15 +24,15 @@ namespace test {
 
 // Test result structure
 struct TestResult {
-  std::string test_name;
-  std::string test_category;
-  std::map<std::string, std::string> test_params;
+  std::string testName;
+  std::string testCategory;
+  std::map<std::string, std::string> testParams;
   nlohmann::ordered_json metrics;
-  int num_processes;
-  int process_rank;
+  int numProcesses;
+  int processRank;
   std::string timestamp;
   bool passed;
-  std::string failure_message;
+  std::string failureMessage;
 };
 
 // Forward declarations
@@ -61,14 +61,14 @@ class Environment {
 // Test info class (for getting current test information)
 class TestInfo {
  public:
-  TestInfo(const std::string& suite, const std::string& name) : test_suite_name_(suite), test_name_(name) {}
+  TestInfo(const std::string& suite, const std::string& name) : testSuiteName_(suite), testName_(name) {}
 
-  const char* test_suite_name() const { return test_suite_name_.c_str(); }
-  const char* name() const { return test_name_.c_str(); }
+  const char* test_suite_name() const { return testSuiteName_.c_str(); }
+  const char* name() const { return testName_.c_str(); }
 
  private:
-  std::string test_suite_name_;
-  std::string test_name_;
+  std::string testSuiteName_;
+  std::string testName_;
 };
 
 // UnitTest singleton (for getting test information)
@@ -76,12 +76,12 @@ class UnitTest {
  public:
   static UnitTest* GetInstance();
 
-  const TestInfo* current_test_info() const { return current_test_info_; }
-  void set_current_test_info(const TestInfo* info) { current_test_info_ = info; }
+  const TestInfo* current_test_info() const { return currentTestInfo_; }
+  void set_current_test_info(const TestInfo* info) { currentTestInfo_ = info; }
 
  private:
   UnitTest() = default;
-  const TestInfo* current_test_info_ = nullptr;
+  const TestInfo* currentTestInfo_ = nullptr;
 };
 
 // Test registry and runner
@@ -99,8 +99,8 @@ class TestRegistry {
  private:
   TestRegistry() = default;
   struct TestInfoInternal {
-    std::string suite_name;
-    std::string test_name;
+    std::string suiteName;
+    std::string testName;
     TestFactory factory;
   };
   std::vector<TestInfoInternal> tests_;
@@ -133,9 +133,9 @@ class Timer {
   double elapsedSeconds() const;
 
  private:
-  std::chrono::high_resolution_clock::time_point start_time_;
-  std::chrono::high_resolution_clock::time_point end_time_;
-  bool is_running_;
+  std::chrono::high_resolution_clock::time_point startTime_;
+  std::chrono::high_resolution_clock::time_point endTime_;
+  bool isRunning_;
 };
 
 // CUDA utilities
diff --git a/test/perf/framework.cc b/test/perf/framework.cc
index be1d812e3..680444604 100644
--- a/test/perf/framework.cc
+++ b/test/perf/framework.cc
@@ -12,7 +12,7 @@ namespace mscclpp {
 namespace test {
 
 // Global state for performance test results
-static std::vector<TestResult> g_perf_results;
+static std::vector<TestResult> gPerfResults;
 
 namespace {
 std::string getCurrentTimestamp() {
@@ -26,18 +26,18 @@ std::string getCurrentTimestamp() {
 
 namespace utils {
 
-void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
-                  const std::map<std::string, std::string>& test_params) {
+void recordResult(const std::string& testName, const std::string& testCategory, const nlohmann::ordered_json& metrics,
+                  const std::map<std::string, std::string>& testParams) {
   TestResult result;
-  result.test_name = test_name;
-  result.test_category = test_category;
-  result.test_params = test_params;
+  result.testName = testName;
+  result.testCategory = testCategory;
+  result.testParams = testParams;
   result.metrics = metrics;
-  result.num_processes = getMPISize();
-  result.process_rank = getMPIRank();
+  result.numProcesses = getMPISize();
+  result.processRank = getMPIRank();
   result.timestamp = getCurrentTimestamp();
 
-  g_perf_results.push_back(result);
+  gPerfResults.push_back(result);
 }
 
 void writeResultsToFile(const std::string& filename) {
@@ -46,14 +46,14 @@ void writeResultsToFile(const std::string& filename) {
     throw std::runtime_error("Cannot open output file: " + filename);
   }
 
-  for (const auto& result : g_perf_results) {
+  for (const auto& result : gPerfResults) {
     nlohmann::ordered_json j;
-    j["test_name"] = result.test_name;
-    j["test_category"] = result.test_category;
-    j["test_config"] = result.test_params;
+    j["test_name"] = result.testName;
+    j["test_category"] = result.testCategory;
+    j["test_config"] = result.testParams;
     j["metrics"] = result.metrics;
-    j["num_processes"] = result.num_processes;
-    j["process_rank"] = result.process_rank;
+    j["num_processes"] = result.numProcesses;
+    j["process_rank"] = result.processRank;
     j["timestamp"] = result.timestamp;
 
     file << j.dump() << std::endl;
@@ -65,12 +65,12 @@ void printResults(bool verbose) {
 
   std::cout << "\n=== Test Results ===" << std::endl;
 
-  for (const auto& result : g_perf_results) {
-    std::cout << "\nTest: " << result.test_name << " (" << result.test_category << ")" << std::endl;
+  for (const auto& result : gPerfResults) {
+    std::cout << "\nTest: " << result.testName << " (" << result.testCategory << ")" << std::endl;
 
-    if (verbose && !result.test_params.empty()) {
+    if (verbose && !result.testParams.empty()) {
       std::cout << "  Parameters:" << std::endl;
-      for (const auto& param : result.test_params) {
+      for (const auto& param : result.testParams) {
         std::cout << "    " << param.first << ": " << param.second << std::endl;
       }
     }
diff --git a/test/perf/framework.hpp b/test/perf/framework.hpp
index 7f7401877..ae1122b3d 100644
--- a/test/perf/framework.hpp
+++ b/test/perf/framework.hpp
@@ -18,8 +18,8 @@ namespace utils {
 // Additional performance test utilities not in the base framework
 
 // Result recording for performance tests
-void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
-                  const std::map<std::string, std::string>& test_params = {});
+void recordResult(const std::string& testName, const std::string& testCategory, const nlohmann::ordered_json& metrics,
+                  const std::map<std::string, std::string>& testParams = {});
 
 // Output utilities for performance tests
 void writeResultsToFile(const std::string& filename);

From 6da12fade1a9a091fc57082a7decec05a7f34b87 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 02:27:17 +0000
Subject: [PATCH 015/132] Comprehensive plan for refactoring

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 CMakeLists.txt | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9bfef1ef7..b12ea8e43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,17 +115,12 @@ if(MSCCLPP_ENABLE_COVERAGE)
         
         # Find lcov
         find_program(LCOV_PATH lcov)
-        find_program(GENHTML_PATH genhtml)
         
         if(NOT LCOV_PATH)
             message(WARNING "lcov not found. Install lcov to generate coverage reports.")
         endif()
         
-        if(NOT GENHTML_PATH)
-            message(WARNING "genhtml not found. Install lcov to generate HTML coverage reports.")
-        endif()
-        
-        if(LCOV_PATH AND GENHTML_PATH)
+        if(LCOV_PATH)
             # Add coverage target
             add_custom_target(coverage
                 COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
@@ -140,10 +135,7 @@ if(MSCCLPP_ENABLE_COVERAGE)
                 COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
                 COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
                 
-                COMMAND ${CMAKE_COMMAND} -E echo "Generating HTML report..."
-                COMMAND ${GENHTML_PATH} coverage.info --output-directory coverage_html
-                
-                COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage_html/index.html"
+                COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
                 
                 WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
                 COMMENT "Generating code coverage report"
@@ -151,7 +143,6 @@ if(MSCCLPP_ENABLE_COVERAGE)
             
             # Add coverage clean target
             add_custom_target(coverage-clean
-                COMMAND ${CMAKE_COMMAND} -E remove_directory coverage_html
                 COMMAND ${CMAKE_COMMAND} -E remove coverage.info
                 COMMAND ${LCOV_PATH} --directory . --zerocounters
                 WORKING_DIRECTORY ${CMAKE_BINARY_DIR}

From 7e4365f014bc3adedca9ace16ef7bf4747e68d42 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 02:29:50 +0000
Subject: [PATCH 016/132] Add performance test filtering and remove HTML
 coverage

- Add isPerfTest field to TestInfoInternal struct
- Add --exclude-perf-tests command line argument
- Add PERF_TEST and PERF_TEST_F macros for marking performance tests
- Update runAllTests to filter performance tests when requested
- Remove genhtml dependency and HTML report generation
- Keep only coverage.info file generation with lcov

Performance tests can now be excluded with:
  ./build/bin/unit_tests --exclude-perf-tests
  ./build/bin/mp_unit_tests --exclude-perf-tests

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/framework.cc  | 24 +++++++++++++++++++++---
 test/framework.hpp | 40 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/test/framework.cc b/test/framework.cc
index aff10d293..cbfc2ffc6 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -169,11 +169,13 @@ TestRegistry& TestRegistry::instance() {
   return registry;
 }
 
-void TestRegistry::registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory) {
+void TestRegistry::registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory,
+                                bool isPerfTest) {
   TestInfoInternal info;
   info.suiteName = test_suite;
   info.testName = test_name;
   info.factory = factory;
+  info.isPerfTest = isPerfTest;
   tests_.push_back(info);
 }
 
@@ -190,8 +192,10 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     utils::initializeMPI(argc, argv);
   }
 
-  // Parse command line arguments for test filter
+  // Parse command line arguments
   std::string filter = "";
+  bool excludePerfTests = false;
+  
   for (int i = 1; i < argc; ++i) {
     std::string arg = argv[i];
     if (arg.find("--gtest_filter=") == 0) {
@@ -199,6 +203,8 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     } else if (arg == "--gtest_filter" && i + 1 < argc) {
       filter = argv[i + 1];
       ++i;
+    } else if (arg == "--exclude-perf-tests") {
+      excludePerfTests = true;
     }
   }
 
@@ -222,6 +228,13 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   int total_to_run = 0;
   for (const auto& test_info : tests_) {
     std::string full_name = test_info.suiteName + "." + test_info.testName;
+    
+    // Skip performance tests if requested
+    if (excludePerfTests && test_info.isPerfTest) {
+      skipped++;
+      continue;
+    }
+    
     if (!filter.empty() && full_name.find(filter) == std::string::npos) {
       skipped++;
       continue;
@@ -232,7 +245,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   if (gMpiRank == 0) {
     std::cout << "[==========] Running " << total_to_run << " tests";
     if (skipped > 0) {
-      std::cout << " (" << skipped << " skipped by filter)";
+      std::cout << " (" << skipped << " skipped)";
     }
     std::cout << ".\n";
   }
@@ -240,6 +253,11 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   for (const auto& test_info : tests_) {
     std::string full_name = test_info.suiteName + "." + test_info.testName;
 
+    // Skip performance tests if requested
+    if (excludePerfTests && test_info.isPerfTest) {
+      continue;
+    }
+    
     // Apply filter
     if (!filter.empty() && full_name.find(filter) == std::string::npos) {
       continue;
diff --git a/test/framework.hpp b/test/framework.hpp
index 6322a350d..c5e0dc8ba 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -91,7 +91,7 @@ class TestRegistry {
 
   static TestRegistry& instance();
 
-  void registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory);
+  void registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory, bool isPerfTest = false);
   void addGlobalTestEnvironment(Environment* env);
   int runAllTests(int argc, char* argv[]);
   void initGoogleTest(int* argc, char** argv);
@@ -102,6 +102,7 @@ class TestRegistry {
     std::string suiteName;
     std::string testName;
     TestFactory factory;
+    bool isPerfTest;
   };
   std::vector<TestInfoInternal> tests_;
   std::vector<Environment*> environments_;
@@ -216,7 +217,8 @@ class SkipHelper {
   static bool test_suite##_##test_name##_registered = []() {                                   \
     ::mscclpp::test::TestRegistry::instance().registerTest(                                    \
         #test_suite, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }); \
+        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }, \
+        false);                                                                                \
     return true;                                                                               \
   }();                                                                                         \
   void test_suite##_##test_name##_Test::TestBody()
@@ -230,11 +232,43 @@ class SkipHelper {
   static bool test_fixture##_##test_name##_registered = []() {                                   \
     ::mscclpp::test::TestRegistry::instance().registerTest(                                      \
         #test_fixture, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }); \
+        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, \
+        false);                                                                                  \
     return true;                                                                                 \
   }();                                                                                           \
   void test_fixture##_##test_name##_Test::TestBody()
 
+// Performance test registration macros
+#define PERF_TEST(test_suite, test_name)                                                       \
+  class test_suite##_##test_name##_Test : public ::mscclpp::test::TestCase {                   \
+   public:                                                                                     \
+    test_suite##_##test_name##_Test() {}                                                       \
+    void TestBody() override;                                                                  \
+  };                                                                                           \
+  static bool test_suite##_##test_name##_registered = []() {                                   \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                                    \
+        #test_suite, #test_name,                                                               \
+        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }, \
+        true);                                                                                 \
+    return true;                                                                               \
+  }();                                                                                         \
+  void test_suite##_##test_name##_Test::TestBody()
+
+#define PERF_TEST_F(test_fixture, test_name)                                           \
+  class test_fixture##_##test_name##_Test : public test_fixture {                     \
+   public:                                                                             \
+    test_fixture##_##test_name##_Test() {}                                            \
+    void TestBody() override;                                                          \
+  };                                                                                   \
+  static bool test_fixture##_##test_name##_registered = []() {                        \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                           \
+        #test_fixture, #test_name,                                                     \
+        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, \
+        true);                                                                         \
+    return true;                                                                       \
+  }();                                                                                 \
+  void test_fixture##_##test_name##_Test::TestBody()
+
 // Test runner macro
 #define RUN_ALL_TESTS() ::mscclpp::test::TestRegistry::instance().runAllTests(argc, argv)
 

From b59196b8a5bcf306855dd82732a37687ba07d1fc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 02:35:46 +0000
Subject: [PATCH 017/132] Integrate perf tests into unit_tests and add CI
 coverage step

- Add unit_tests_main.cc with main() function for unit_tests executable
- Create fifo_perf_tests.cu as PERF_TEST for unit_tests
- Add fifo_perf_tests.cu to unit_tests sources
- Fix errors_tests.cc to use ASSERT_TRUE for ErrorCode comparisons
- Fix core_tests.cc to use ASSERT_TRUE for TransportFlags comparisons
- Add Azure pipeline step for Debug build with coverage
- Add step to run mp_unit_tests --exclude-perf-tests with coverage

The perf tests are now part of unit_tests and can be filtered out
for coverage reporting. CI now includes Debug build with coverage
collection for non-performance tests.

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 .azure-pipelines/templates/ut.yaml | 42 ++++++++++++++++
 test/unit/CMakeLists.txt           |  2 +
 test/unit/core_tests.cc            |  7 ++-
 test/unit/errors_tests.cc          | 11 ++--
 test/unit/fifo_perf_tests.cu       | 81 ++++++++++++++++++++++++++++++
 test/unit/unit_tests_main.cc       |  6 +++
 6 files changed, 143 insertions(+), 6 deletions(-)
 create mode 100644 test/unit/fifo_perf_tests.cu
 create mode 100644 test/unit/unit_tests_main.cc

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index 82ff4aac5..ae5bedbd7 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -108,6 +108,48 @@ steps:
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
+- task: Bash@3
+  name: DebugBuildWithCoverage
+  displayName: Build Debug with Coverage
+  inputs:
+    targetType: 'inline'
+    script: |
+      mkdir build_coverage && cd build_coverage
+      if [ "${{ parameters.platform }}" == "rocm" ]; then
+        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
+      else
+        cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
+      fi
+      make -j
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: Bash@3
+  name: MpUnitTestsCoverageNonPerf
+  displayName: Run mp_unit_tests (non-perf) with coverage
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
+      SSH_OPTION="StrictHostKeyChecking=no"
+      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      : > azureuser@10.0.0.4
+      tail -f azureuser@10.0.0.4 &
+      CHILD_PID=$!
+      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
+        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
+        export PATH=/usr/local/mpi/bin:\$PATH;                        \
+        cd /root/mscclpp;                                             \
+        export LD_LIBRARY_PATH=/root/mscclpp/build_coverage/lib:\$LD_LIBRARY_PATH; \
+        mpirun --allow-run-as-root -tag-output -np 2 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
+        mpirun --allow-run-as-root -tag-output -np 4 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
+        cd build_coverage;                                            \
+        lcov --directory . --capture --output-file coverage.info;     \
+        lcov --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info;  \
+        lcov --list coverage.info"'
+      kill $CHILD_PID
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
 - task: Bash@3
   name: PyTests
   displayName: Run pytests
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 312d31ef5..655f77788 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -2,10 +2,12 @@
 # Licensed under the MIT license.
 
 target_sources(unit_tests PRIVATE
+    unit_tests_main.cc
     core_tests.cc
     gpu_utils_tests.cc
     errors_tests.cc
     fifo_tests.cu
+    fifo_perf_tests.cu
     numa_tests.cc
     socket_tests.cc
     utils_tests.cc
diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc
index f026c05e6..45fce6e2b 100644
--- a/test/unit/core_tests.cc
+++ b/test/unit/core_tests.cc
@@ -5,6 +5,9 @@
 
 #include "../framework.hpp"
 
+// TODO: TransportFlags needs operator<< for EXPECT_EQ to work
+// Using ASSERT_TRUE with manual comparisons as workaround
+
 class LocalCommunicatorTest : public ::mscclpp::test::TestCase {
  protected:
   void SetUp() override {
@@ -22,7 +25,7 @@ TEST_F(LocalCommunicatorTest, RegisterMemory) {
   auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
   EXPECT_EQ(memory.data(), &dummy);
   EXPECT_EQ(memory.size(), sizeof(dummy));
-  EXPECT_EQ(memory.transports(), mscclpp::NoTransports);
+  ASSERT_TRUE(memory.transports() == mscclpp::NoTransports);
 }
 
 TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
@@ -33,5 +36,5 @@ TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
   auto sameMemory = memoryFuture.get();
   EXPECT_EQ(sameMemory.data(), memory.data());
   EXPECT_EQ(sameMemory.size(), memory.size());
-  EXPECT_EQ(sameMemory.transports(), memory.transports());
+  ASSERT_TRUE(sameMemory.transports() == memory.transports());
 }
diff --git a/test/unit/errors_tests.cc b/test/unit/errors_tests.cc
index 4cd68ee63..13c8d542a 100644
--- a/test/unit/errors_tests.cc
+++ b/test/unit/errors_tests.cc
@@ -5,26 +5,29 @@
 
 #include "../framework.hpp"
 
+// TODO: ErrorCode needs operator<< for EXPECT_EQ to work
+// Using ASSERT_TRUE with manual comparisons as workaround
+
 TEST(ErrorsTest, SystemError) {
   mscclpp::Error error("test", mscclpp::ErrorCode::SystemError);
-  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::SystemError);
+  ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::SystemError);
   EXPECT_EQ(error.what(), std::string("test (mscclpp failure: SystemError)"));
 }
 
 TEST(ErrorsTest, InternalError) {
   mscclpp::Error error("test", mscclpp::ErrorCode::InternalError);
-  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InternalError);
+  ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InternalError);
   EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InternalError)"));
 }
 
 TEST(ErrorsTest, InvalidUsage) {
   mscclpp::Error error("test", mscclpp::ErrorCode::InvalidUsage);
-  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InvalidUsage);
+  ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InvalidUsage);
   EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InvalidUsage)"));
 }
 
 TEST(ErrorsTest, Timeout) {
   mscclpp::Error error("test", mscclpp::ErrorCode::Timeout);
-  EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::Timeout);
+  ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::Timeout);
   EXPECT_EQ(error.what(), std::string("test (mscclpp failure: Timeout)"));
 }
diff --git a/test/unit/fifo_perf_tests.cu b/test/unit/fifo_perf_tests.cu
new file mode 100644
index 000000000..76aed8355
--- /dev/null
+++ b/test/unit/fifo_perf_tests.cu
@@ -0,0 +1,81 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "../framework.hpp"
+
+#include <memory>
+#include <mscclpp/fifo.hpp>
+#include <mscclpp/gpu_utils.hpp>
+#include <mscclpp/numa.hpp>
+
+// Simple FIFO performance test to be run as part of unit_tests
+// This is a simplified version of test/perf/fifo_test.cu that can be
+// integrated into the unit test suite and marked as a performance test.
+
+constexpr uint64_t TIMEOUT_SPINS = 1000000;
+constexpr int MIN_TRIGGERS = 100;  // Reduced for faster unit test execution
+
+__constant__ mscclpp::FifoDeviceHandle gFifoPerfDeviceHandle;
+
+__global__ void kernelFifoPerfPush(size_t numTriggers) {
+  mscclpp::FifoDeviceHandle& fifo = gFifoPerfDeviceHandle;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  mscclpp::ProxyTrigger trigger;
+  for (size_t i = 1; i <= numTriggers; ++i) {
+    trigger.fst = i;
+    trigger.snd = tid ^ i;
+    fifo.push(trigger);
+  }
+}
+
+static bool consumePerfTriggers(std::unique_ptr<mscclpp::Fifo>& hostFifo, int numTriggers, int parallel) {
+  int totalTriggers = numTriggers * parallel;
+  std::unordered_map<int, int> triggerCounts;
+  for (int i = 0; i < totalTriggers; ++i) {
+    mscclpp::ProxyTrigger trigger;
+    uint64_t spin = 0;
+    do {
+      trigger = hostFifo->poll();
+      if (spin++ > TIMEOUT_SPINS) {
+        return false;
+      }
+    } while (trigger.fst == 0 || trigger.snd == 0);
+
+    trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
+    trigger.snd = trigger.snd ^ trigger.fst;
+    assert(triggerCounts[trigger.snd] + 1 == trigger.fst);
+    triggerCounts[trigger.snd]++;
+    hostFifo->pop();
+  }
+  return true;
+}
+
+PERF_TEST(FifoPerfTest, BasicPerformance) {
+  int cudaDevice, numaNode;
+  CUDA_CHECK(cudaGetDevice(&cudaDevice));
+  numaNode = mscclpp::getDeviceNumaNode(cudaDevice);
+  mscclpp::numaBind(numaNode);
+
+  const int fifoSize = 128;
+  const int numTriggers = MIN_TRIGGERS;
+  const int numParallel = 1;
+
+  auto hostFifo = std::make_unique<mscclpp::Fifo>(fifoSize);
+  mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle();
+  CUDA_CHECK(cudaMemcpyToSymbol(gFifoPerfDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle)));
+
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+
+  // Run kernel
+  kernelFifoPerfPush<<<numParallel, 1, 0, stream>>>(numTriggers);
+  CUDA_CHECK(cudaGetLastError());
+
+  // Process triggers
+  bool success = consumePerfTriggers(hostFifo, numTriggers, numParallel);
+  ASSERT_TRUE(success);
+  
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaDeviceSynchronize());
+}
diff --git a/test/unit/unit_tests_main.cc b/test/unit/unit_tests_main.cc
new file mode 100644
index 000000000..743243d18
--- /dev/null
+++ b/test/unit/unit_tests_main.cc
@@ -0,0 +1,6 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "../framework.hpp"
+
+int main(int argc, char** argv) { return RUN_ALL_TESTS(); }

From ba0451ae68e1afc2076d708788dd226e39d600fb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 02:37:12 +0000
Subject: [PATCH 018/132] Remove build2 CMake artifacts from repository

Another set of accidentally committed build artifacts in build2/ directory.
The .gitignore pattern build_*/ should prevent these in the future.

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 build2/CMakeCache.txt                         | 393 --------
 .../CMakeFiles/3.31.6/CMakeCXXCompiler.cmake  | 101 --
 .../3.31.6/CMakeDetermineCompilerABI_CXX.bin  | Bin 15992 -> 0 bytes
 build2/CMakeFiles/3.31.6/CMakeSystem.cmake    |  15 -
 .../CompilerIdCXX/CMakeCXXCompilerId.cpp      | 919 ------------------
 build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out  | Bin 16096 -> 0 bytes
 build2/CMakeFiles/CMakeConfigureLog.yaml      | 294 ------
 build2/CMakeFiles/cmake.check_cache           |   1 -
 build2/include/mscclpp/version.hpp            |  13 -
 9 files changed, 1736 deletions(-)
 delete mode 100644 build2/CMakeCache.txt
 delete mode 100644 build2/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
 delete mode 100755 build2/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin
 delete mode 100644 build2/CMakeFiles/3.31.6/CMakeSystem.cmake
 delete mode 100644 build2/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
 delete mode 100755 build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out
 delete mode 100644 build2/CMakeFiles/CMakeConfigureLog.yaml
 delete mode 100644 build2/CMakeFiles/cmake.check_cache
 delete mode 100644 build2/include/mscclpp/version.hpp

diff --git a/build2/CMakeCache.txt b/build2/CMakeCache.txt
deleted file mode 100644
index c404aca8d..000000000
--- a/build2/CMakeCache.txt
+++ /dev/null
@@ -1,393 +0,0 @@
-# This is the CMakeCache file.
-# For build in directory: /home/runner/work/mscclpp/mscclpp/build2
-# It was generated by CMake: /usr/local/bin/cmake
-# You can edit this file to change values found and used by cmake.
-# If you do not want to change any of the values, simply exit the editor.
-# If you do want to change a value, simply edit, save, and exit the editor.
-# The syntax for the file is as follows:
-# KEY:TYPE=VALUE
-# KEY is the name of a variable in the cache.
-# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
-# VALUE is the current value for the KEY.
-
-########################
-# EXTERNAL cache entries
-########################
-
-//Path to a program.
-CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
-
-//Path to a program.
-CMAKE_AR:FILEPATH=/usr/bin/ar
-
-//Choose the type of build, options are: None Debug Release RelWithDebInfo
-// MinSizeRel ...
-CMAKE_BUILD_TYPE:STRING=Release
-
-//Enable/Disable color output during build.
-CMAKE_COLOR_MAKEFILE:BOOL=ON
-
-//CXX compiler
-CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++
-
-//A wrapper around 'ar' adding the appropriate '--plugin' option
-// for the GCC compiler
-CMAKE_CXX_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar-13
-
-//A wrapper around 'ranlib' adding the appropriate '--plugin' option
-// for the GCC compiler
-CMAKE_CXX_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib-13
-
-//Flags used by the CXX compiler during all build types.
-CMAKE_CXX_FLAGS:STRING=
-
-//Flags used by the CXX compiler during DEBUG builds.
-CMAKE_CXX_FLAGS_DEBUG:STRING=-g
-
-//Flags used by the CXX compiler during MINSIZEREL builds.
-CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
-
-//Flags used by the CXX compiler during RELEASE builds.
-CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
-
-//Flags used by the CXX compiler during RELWITHDEBINFO builds.
-CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
-
-//Path to a program.
-CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
-
-//Flags used by the linker during all build types.
-CMAKE_EXE_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during DEBUG builds.
-CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during MINSIZEREL builds.
-CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during RELEASE builds.
-CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during RELWITHDEBINFO builds.
-CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Enable/Disable output of compile commands during generation.
-CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
-
-//Value Computed by CMake.
-CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/pkgRedirects
-
-//Install path prefix, prepended onto install directories.
-CMAKE_INSTALL_PREFIX:PATH=/usr/local
-
-//Path to a program.
-CMAKE_LINKER:FILEPATH=/usr/bin/ld
-
-//Path to a program.
-CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
-
-//Flags used by the linker during the creation of modules during
-// all build types.
-CMAKE_MODULE_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of modules during
-// DEBUG builds.
-CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of modules during
-// MINSIZEREL builds.
-CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of modules during
-// RELEASE builds.
-CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of modules during
-// RELWITHDEBINFO builds.
-CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Path to a program.
-CMAKE_NM:FILEPATH=/usr/bin/nm
-
-//Path to a program.
-CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
-
-//Path to a program.
-CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
-
-//Value Computed by CMake
-CMAKE_PROJECT_DESCRIPTION:STATIC=
-
-//Value Computed by CMake
-CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
-
-//Value Computed by CMake
-CMAKE_PROJECT_NAME:STATIC=mscclpp
-
-//Path to a program.
-CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
-
-//Path to a program.
-CMAKE_READELF:FILEPATH=/usr/bin/readelf
-
-//Flags used by the linker during the creation of shared libraries
-// during all build types.
-CMAKE_SHARED_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during DEBUG builds.
-CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during MINSIZEREL builds.
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during RELEASE builds.
-CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during RELWITHDEBINFO builds.
-CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//If set, runtime paths are not added when installing shared libraries,
-// but are added when building.
-CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
-
-//If set, runtime paths are not added when using shared libraries.
-CMAKE_SKIP_RPATH:BOOL=NO
-
-//Flags used by the linker during the creation of static libraries
-// during all build types.
-CMAKE_STATIC_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during DEBUG builds.
-CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during MINSIZEREL builds.
-CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during RELEASE builds.
-CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during RELWITHDEBINFO builds.
-CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Path to a program.
-CMAKE_STRIP:FILEPATH=/usr/bin/strip
-
-//Path to a program.
-CMAKE_TAPI:FILEPATH=CMAKE_TAPI-NOTFOUND
-
-//If this value is on, makefiles will be generated without the
-// .SILENT directive, and all commands will be echoed to the console
-// during the make.  This is useful for debugging only. With Visual
-// Studio IDE projects all commands are done without /nologo.
-CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
-
-//Path to a program.
-CUDAToolkit_NVCC_EXECUTABLE:FILEPATH=CUDAToolkit_NVCC_EXECUTABLE-NOTFOUND
-
-//Path to a file.
-CUDAToolkit_SENTINEL_FILE:FILEPATH=CUDAToolkit_SENTINEL_FILE-NOTFOUND
-
-//Git command line client
-GIT_EXECUTABLE:FILEPATH=/usr/bin/git
-
-//Build collective algorithms
-MSCCLPP_BUILD_EXT_COLLECTIVES:BOOL=ON
-
-//Build NCCL interfaces
-MSCCLPP_BUILD_EXT_NCCL:BOOL=ON
-
-//Build Python bindings
-MSCCLPP_BUILD_PYTHON_BINDINGS:BOOL=ON
-
-//Build tests
-MSCCLPP_BUILD_TESTS:BOOL=ON
-
-//Bypass GPU check.
-MSCCLPP_BYPASS_GPU_CHECK:BOOL=OFF
-
-//Enable code coverage
-MSCCLPP_ENABLE_COVERAGE:BOOL=OFF
-
-//Enable tracing
-MSCCLPP_ENABLE_TRACE:BOOL=OFF
-
-//Specify GPU architectures with delimiters (comma, space, or semicolon).
-MSCCLPP_GPU_ARCHS:STRING=
-
-//Set NPKIT flags
-MSCCLPP_NPKIT_FLAGS:BOOL=OFF
-
-//Use NVIDIA/CUDA.
-MSCCLPP_USE_CUDA:BOOL=OFF
-
-//Use InfiniBand.
-MSCCLPP_USE_IB:BOOL=ON
-
-//Use AMD/ROCm.
-MSCCLPP_USE_ROCM:BOOL=OFF
-
-//The directory containing a CMake configuration file for hip.
-hip_DIR:PATH=hip_DIR-NOTFOUND
-
-//Value Computed by CMake
-mscclpp_BINARY_DIR:STATIC=/home/runner/work/mscclpp/mscclpp/build2
-
-//Value Computed by CMake
-mscclpp_IS_TOP_LEVEL:STATIC=ON
-
-//Value Computed by CMake
-mscclpp_SOURCE_DIR:STATIC=/home/runner/work/mscclpp/mscclpp
-
-
-########################
-# INTERNAL cache entries
-########################
-
-//ADVANCED property for variable: CMAKE_ADDR2LINE
-CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_AR
-CMAKE_AR-ADVANCED:INTERNAL=1
-//This is the directory where this CMakeCache.txt was created
-CMAKE_CACHEFILE_DIR:INTERNAL=/home/runner/work/mscclpp/mscclpp/build2
-//Major version of cmake used to create the current loaded cache
-CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
-//Minor version of cmake used to create the current loaded cache
-CMAKE_CACHE_MINOR_VERSION:INTERNAL=31
-//Patch version of cmake used to create the current loaded cache
-CMAKE_CACHE_PATCH_VERSION:INTERNAL=6
-//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
-CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
-//Path to CMake executable.
-CMAKE_COMMAND:INTERNAL=/usr/local/bin/cmake
-//Path to cpack program executable.
-CMAKE_CPACK_COMMAND:INTERNAL=/usr/local/bin/cpack
-//Path to ctest program executable.
-CMAKE_CTEST_COMMAND:INTERNAL=/usr/local/bin/ctest
-//ADVANCED property for variable: CMAKE_CXX_COMPILER
-CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_COMPILER_AR
-CMAKE_CXX_COMPILER_AR-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_COMPILER_RANLIB
-CMAKE_CXX_COMPILER_RANLIB-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS
-CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
-CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
-CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
-CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
-CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_DLLTOOL
-CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
-//Path to cache edit program executable.
-CMAKE_EDIT_COMMAND:INTERNAL=/usr/local/bin/ccmake
-//Executable file format
-CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
-CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
-CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
-CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
-CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
-CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
-//Name of external makefile project generator.
-CMAKE_EXTRA_GENERATOR:INTERNAL=
-//Name of generator.
-CMAKE_GENERATOR:INTERNAL=Unix Makefiles
-//Generator instance identifier.
-CMAKE_GENERATOR_INSTANCE:INTERNAL=
-//Name of generator platform.
-CMAKE_GENERATOR_PLATFORM:INTERNAL=
-//Name of generator toolset.
-CMAKE_GENERATOR_TOOLSET:INTERNAL=
-//Source directory with the top level CMakeLists.txt file for this
-// project
-CMAKE_HOME_DIRECTORY:INTERNAL=/home/runner/work/mscclpp/mscclpp
-//Install .so files without execute permission.
-CMAKE_INSTALL_SO_NO_EXE:INTERNAL=1
-//ADVANCED property for variable: CMAKE_LINKER
-CMAKE_LINKER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
-CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
-CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
-CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
-CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
-CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_NM
-CMAKE_NM-ADVANCED:INTERNAL=1
-//number of local generators
-CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
-//ADVANCED property for variable: CMAKE_OBJCOPY
-CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_OBJDUMP
-CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
-//Platform information initialized
-CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_RANLIB
-CMAKE_RANLIB-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_READELF
-CMAKE_READELF-ADVANCED:INTERNAL=1
-//Path to CMake installation.
-CMAKE_ROOT:INTERNAL=/usr/local/share/cmake-3.31
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
-CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
-CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
-CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
-CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SKIP_RPATH
-CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
-CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
-CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
-CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
-CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STRIP
-CMAKE_STRIP-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_TAPI
-CMAKE_TAPI-ADVANCED:INTERNAL=1
-//uname command
-CMAKE_UNAME:INTERNAL=/usr/bin/uname
-//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
-CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
-//Details about finding Git
-FIND_PACKAGE_MESSAGE_DETAILS_Git:INTERNAL=[/usr/bin/git][v2.52.0()]
-//ADVANCED property for variable: GIT_EXECUTABLE
-GIT_EXECUTABLE-ADVANCED:INTERNAL=1
-//linker supports push/pop state
-_CMAKE_CXX_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
-//linker supports push/pop state
-_CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
-
diff --git a/build2/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake b/build2/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
deleted file mode 100644
index 14f6ae31d..000000000
--- a/build2/CMakeFiles/3.31.6/CMakeCXXCompiler.cmake
+++ /dev/null
@@ -1,101 +0,0 @@
-set(CMAKE_CXX_COMPILER "/usr/bin/c++")
-set(CMAKE_CXX_COMPILER_ARG1 "")
-set(CMAKE_CXX_COMPILER_ID "GNU")
-set(CMAKE_CXX_COMPILER_VERSION "13.3.0")
-set(CMAKE_CXX_COMPILER_VERSION_INTERNAL "")
-set(CMAKE_CXX_COMPILER_WRAPPER "")
-set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "17")
-set(CMAKE_CXX_EXTENSIONS_COMPUTED_DEFAULT "ON")
-set(CMAKE_CXX_STANDARD_LATEST "23")
-set(CMAKE_CXX_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters;cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates;cxx_std_17;cxx_std_20;cxx_std_23")
-set(CMAKE_CXX98_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters")
-set(CMAKE_CXX11_COMPILE_FEATURES "cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
-set(CMAKE_CXX14_COMPILE_FEATURES "cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
-set(CMAKE_CXX17_COMPILE_FEATURES "cxx_std_17")
-set(CMAKE_CXX20_COMPILE_FEATURES "cxx_std_20")
-set(CMAKE_CXX23_COMPILE_FEATURES "cxx_std_23")
-set(CMAKE_CXX26_COMPILE_FEATURES "")
-
-set(CMAKE_CXX_PLATFORM_ID "Linux")
-set(CMAKE_CXX_SIMULATE_ID "")
-set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "GNU")
-set(CMAKE_CXX_SIMULATE_VERSION "")
-
-
-
-
-set(CMAKE_AR "/usr/bin/ar")
-set(CMAKE_CXX_COMPILER_AR "/usr/bin/gcc-ar-13")
-set(CMAKE_RANLIB "/usr/bin/ranlib")
-set(CMAKE_CXX_COMPILER_RANLIB "/usr/bin/gcc-ranlib-13")
-set(CMAKE_LINKER "/usr/bin/ld")
-set(CMAKE_LINKER_LINK "")
-set(CMAKE_LINKER_LLD "")
-set(CMAKE_CXX_COMPILER_LINKER "/usr/bin/ld")
-set(CMAKE_CXX_COMPILER_LINKER_ID "GNU")
-set(CMAKE_CXX_COMPILER_LINKER_VERSION 2.42)
-set(CMAKE_CXX_COMPILER_LINKER_FRONTEND_VARIANT GNU)
-set(CMAKE_MT "")
-set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
-set(CMAKE_COMPILER_IS_GNUCXX 1)
-set(CMAKE_CXX_COMPILER_LOADED 1)
-set(CMAKE_CXX_COMPILER_WORKS TRUE)
-set(CMAKE_CXX_ABI_COMPILED TRUE)
-
-set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
-
-set(CMAKE_CXX_COMPILER_ID_RUN 1)
-set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;m;mm;mpp;CPP;ixx;cppm;ccm;cxxm;c++m)
-set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
-
-foreach (lang IN ITEMS C OBJC OBJCXX)
-  if (CMAKE_${lang}_COMPILER_ID_RUN)
-    foreach(extension IN LISTS CMAKE_${lang}_SOURCE_FILE_EXTENSIONS)
-      list(REMOVE_ITEM CMAKE_CXX_SOURCE_FILE_EXTENSIONS ${extension})
-    endforeach()
-  endif()
-endforeach()
-
-set(CMAKE_CXX_LINKER_PREFERENCE 30)
-set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
-set(CMAKE_CXX_LINKER_DEPFILE_SUPPORTED )
-
-# Save compiler ABI information.
-set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
-set(CMAKE_CXX_COMPILER_ABI "ELF")
-set(CMAKE_CXX_BYTE_ORDER "LITTLE_ENDIAN")
-set(CMAKE_CXX_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-
-if(CMAKE_CXX_SIZEOF_DATA_PTR)
-  set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
-endif()
-
-if(CMAKE_CXX_COMPILER_ABI)
-  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
-endif()
-
-if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
-  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-endif()
-
-set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
-if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
-  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
-endif()
-
-
-
-
-
-set(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "/usr/include/c++/13;/usr/include/x86_64-linux-gnu/c++/13;/usr/include/c++/13/backward;/usr/lib/gcc/x86_64-linux-gnu/13/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include")
-set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "stdc++;m;gcc_s;gcc;c;gcc_s;gcc")
-set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-linux-gnu/13;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib")
-set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
-set(CMAKE_CXX_COMPILER_CLANG_RESOURCE_DIR "")
-
-set(CMAKE_CXX_COMPILER_IMPORT_STD "")
-### Imported target for C++23 standard library
-set(CMAKE_CXX23_COMPILER_IMPORT_STD_NOT_FOUND_MESSAGE "Unsupported generator: Unix Makefiles")
-
-
-
diff --git a/build2/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin b/build2/CMakeFiles/3.31.6/CMakeDetermineCompilerABI_CXX.bin
deleted file mode 100755
index e90f3f71d98d8b48fdca37fdc4f6d991fd1db519..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 15992
zcmeHOYit}>6~4Q9xipD4Y0{XaG)rkv(&C9<F-{<K9ebTw<k%r`pb)}j);qRGtar`s
zEVT=iKzP&&K>;D4KR{7b9#VzWB18~B+O2|G6~rSFg`oZkluAK_))g&sA!Ipc?)lc^
z(YodJ1Btn-o$sFSoOAD;bMNflnYs7l>A`_`ET)i_sdp%rQVGqZMA7qB$q=Mek6J^=
zH>g|GN|KlRoYto_kXENl@x|CA{4zrJYvD`-yhYPggHC86Bl|6t=2mD8P|10)pRW=b
zJn#{z00_QbUs7re;fVMFgMJ*FxmN8rw|6lnB`(_q;m0ETDMQ;+cjzQomHL2)C&z@p
zJrd6_wn;I-u-}CEg|T1!fLsTs!_RrSf2Y2K;&&$L7o)=X7ELQ4>U$UY`Ee2bYXQ3X
zkkq$SKO`jnKnbtfnRm0@T|4u+*1TJ&Ot((=bhmbQ8ReqU;aAP=O466d)c&C(ii)W+
zCt+0a6Iw=jtlJ=Zw*TRV!E;T|eD<lsJAcyk*c}_b`>XiRpJy9xH~X*+CoT^|gk{ci
zoou7y@d?Vw*e1N_{A|)EmN>BA`Ubi_;*t$`YYD!v1b-9pw>2n7Sr$cf)GB*+$+ISH
zw?NG3v~7*K1v~HF>nK)pe7n{D!OXrstHbCpcGdHpUCPRg9I$du$r*Rco>Lk*(3dY3
zoDn;lcc`rK$znlDx3p<PLylm~|LC5Ik<9JIc&Ti5Z{Vo&_+##SU-&YGIZnTLI^jCT
z^^;tu`FXj%!C#gFn^Ia29`dETG|zp=eS&m3zz6&NN`S{0W1qPI&*KMaKETUQB2*DZ
z5r`rXMIeem6oDuLQ3Rq0{2xc)&&{{~)jWB%$vm~<H#?OwKV9|WwO^Pgf7Eork4kOV
zIihRZ9;9RQ)|6uV+O|hY8f)I#uY9@vPnp?^A24TsXP*51+`*A_d$s*3^Yq>yQvtP&
zWiowf%xK>FDZf18A0Wm&z2b`uyXU=)RQ0<#PgUPgyWG6>1RGuuBzxDl-<4(9aowDq
zGarBcF7xsEWoGON^Wt@H0~N4M3TUcb*6o5nxA(+eR;$XLN6eFZ<D4~TpYv9mr}nNS
z;mVF$t#&0xhbLD2o$k70$H=!{Kl}gT9#V4V2>H!^?5a6ix%_1M8aMM)`l|U=^Yq52
z*HU=CzdX_WXf>9;ChP`2&1YD1etEq4d|30_Mw*R(43%{4*afcI@1uIJaMe+YA`nF&
zia->BC<0Lgq6kD0h$0Y0Ac{Z~fhYq1d<6LY*Q=$>(7^DXGQFQGj#;@WuXMDn=UC8w
zC^I~e-Q&$zPO0eRj+Qd}to=jjO#e`?^6h;8?2PAF#S*={J35#d85vAl>7o8i?+{t|
zdOPbLrF97G5ZkisZT#+y-({V7p;kLic$V;f!iNb>!UyJRwX=kr_?;@J*u95TY&sF!
zvU*k18G50{Jg*%%PCjpDgZ@?i8@byl+eP2)#QVhB#K78?cQ)U6Ptyr?*XG@Kbl&d2
zzGVOR(>DP-%5&l}J^H>#{70BbuT6X=-nV9DyhJrK5v3>sQ3Rq0L=lK05Je!0Koo%}
z0#O8_2>fqE0P7X8J`rmV{hJ<Y;%YQg)-SFR`9WFd_<E7C4swggxb@jAGS)-#{SqhW
zU%p-|viz_tV#M0S3BKW@q}Q}6bxHKE)3mx@@J7KF!Ht3dtc|S7`o~qGXp@T2j;ipq
z*wara?^cmv_qUpEFU85Hu8XV}lhX_C1-<V{x2FF2&B^(^A~M<~#sBvJ>%;%U60t6I
ze_!98<n|-kO2Mln+dGX;qph{O;)@;kb#xhRT|0z+^$K}hEmtqr!d4vb7->Ey0ZEDh
zuN!V;&;1csYt@vDM=@7P;m?NnPT?`WVV|K)Otq*)N;4SuyvjO8PYW<!wN|N*Qikir
z^#Y#9VNBhmF#f@Ri!zPc|Cn!|P|2jW#CUyL_>}M%cP|TnTzCQ1LJf|oggPMvtrGCl
zQgPen+pkv#-zbIwXw=S5-=10*8c%O0Ua58Ub^0h~*tfq~;W`8F5Z`Eh`6r1_!YF{>
z@%c?kr2-^nzfOEYZL0SdwBI0peY{!W_Xzw$VjnK&2Y&gmTEHiXUl-q`Fz%uGCG%9X
zN@_+fWA!ZY2^v2wDOhUc{UYmWoTOwN`p=q3bw%tk-r)6;*zb_vQ~wzfDPJL;+Y`25
z5wAA|MfkXt_}dmSTG&JU`Z)bchOP^Bc(mlT8%0_vPfyz{&mLDql)cK>m@%prR@GbH
zq&3Rx>dR!AD_Z0EV%E-EIj>kMTXtnyjTR@T@{Z@^jJC!WyrSQ=>{7|5hk^yKG^55!
z_M~IwDwC5l<Pwl9vh)_2_8qW4==9xvcOTW_=ABaSzKk(CHKnZg4Yqf?g|VU)coxZQ
zhh`U^Fj`r6oa)WFHtjGV{chhYpwGLWmv;gtJ-!7+g&H?-sP};Xbkd?t1pV(F>OGL@
zBbs(&SZPzVX8$2&?H?T8*E?tp4-6bmk60tU`{<!28HV;aq_CCYwYD!fIoq?9A37?9
z1-+MngvA>htX#QhP1uDTZ+gfKlU2?wSe3GqQ+!HfpDmZgS9V#@MhSl2%4ftoC>m~y
zSiBdb-fZ51;dc`4M=H-udUlr3D`}iS&MnY(j45Rlik@SP7b?b7sW|17yqN%%t+=$8
z#?1*u{o2Z7&^Mp3%M;4T%@n8#jb2G>KJ1jrZn3aPut-;O@-{mtgGZ1urt<n=j29{6
zIn#9HVMvxmKeC21Ap>tBNB)qszaD|w19>Xko^(g4IovS@1yva|^e1UVH@NElb&BUr
zbjjDBzK8e0Vcvw2**2KoL;}xk=yLbdQv1C`U7vqJ?xsx8KfLdYpOXg@eh0zv|7p-4
z|L4FY3<bmf?;-v#G&e%~F&_k?e#{3kA49P=Wq2+Kf6NzwXT*@($gzVz=6No0JOzP2
z=AS_RpAV*R{69oWp8LTc^F1Ku(P%&HfcKF<&m|#aJ_&4-%ERqPn@&@PV+w!FZ-G@Y
zME&9O{|f2(oS?7&U&#Lk=JisHUl;O>U!!l(KPi4d5$i6Hf#*X0ZK43e4h294J{0m#
zi2|4lbr}3m-XkG@%qM`j?}2@I{GJzo#9t-FQt<O40)&RB^t^DP|IUa3kl%p?Q@H-0
zl9Epm^;eVH8u%qG){p3a5Wl7j&mnPNg83}=Nrvqq1D_?|=72xu&-1NBQi7e97G&@*
zkb=h^>aWi`4ee3olcU7rpA-DhkKZJYP2i7tXmuxBE0yw(3kUcE=SdaxuRFA9AJl^q
z;0O6SWtc<#n71XwKWs0j19!EI2<F7R&cpxCI-@i24<h<LXqu7&zby^p>-c8+qCNQi
l<NGkQJ?MXhZ=fipLWQGVt>rm#WB={^$3kg!$RQ-Ee*hEc8gl>u

diff --git a/build2/CMakeFiles/3.31.6/CMakeSystem.cmake b/build2/CMakeFiles/3.31.6/CMakeSystem.cmake
deleted file mode 100644
index b2715a602..000000000
--- a/build2/CMakeFiles/3.31.6/CMakeSystem.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-set(CMAKE_HOST_SYSTEM "Linux-6.11.0-1018-azure")
-set(CMAKE_HOST_SYSTEM_NAME "Linux")
-set(CMAKE_HOST_SYSTEM_VERSION "6.11.0-1018-azure")
-set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
-
-
-
-set(CMAKE_SYSTEM "Linux-6.11.0-1018-azure")
-set(CMAKE_SYSTEM_NAME "Linux")
-set(CMAKE_SYSTEM_VERSION "6.11.0-1018-azure")
-set(CMAKE_SYSTEM_PROCESSOR "x86_64")
-
-set(CMAKE_CROSSCOMPILING "FALSE")
-
-set(CMAKE_SYSTEM_LOADED 1)
diff --git a/build2/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp b/build2/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
deleted file mode 100644
index 3b6e114ca..000000000
--- a/build2/CMakeFiles/3.31.6/CompilerIdCXX/CMakeCXXCompilerId.cpp
+++ /dev/null
@@ -1,919 +0,0 @@
-/* This source file must have a .cpp extension so that all C++ compilers
-   recognize the extension without flags.  Borland does not know .cxx for
-   example.  */
-#ifndef __cplusplus
-# error "A C compiler has been selected for C++."
-#endif
-
-#if !defined(__has_include)
-/* If the compiler does not have __has_include, pretend the answer is
-   always no.  */
-#  define __has_include(x) 0
-#endif
-
-
-/* Version number components: V=Version, R=Revision, P=Patch
-   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
-
-#if defined(__INTEL_COMPILER) || defined(__ICC)
-# define COMPILER_ID "Intel"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# if defined(__GNUC__)
-#  define SIMULATE_ID "GNU"
-# endif
-  /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
-     except that a few beta releases use the old format with V=2021.  */
-# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
-#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
-#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
-#  if defined(__INTEL_COMPILER_UPDATE)
-#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
-#  else
-#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
-#  endif
-# else
-#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
-#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
-   /* The third version component from --version is an update index,
-      but no macro is provided for it.  */
-#  define COMPILER_VERSION_PATCH DEC(0)
-# endif
-# if defined(__INTEL_COMPILER_BUILD_DATE)
-   /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
-#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
-# endif
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# if defined(__GNUC__)
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-# elif defined(__GNUG__)
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
-# endif
-# if defined(__GNUC_MINOR__)
-#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
-# define COMPILER_ID "IntelLLVM"
-#if defined(_MSC_VER)
-# define SIMULATE_ID "MSVC"
-#endif
-#if defined(__GNUC__)
-# define SIMULATE_ID "GNU"
-#endif
-/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
- * later.  Look for 6 digit vs. 8 digit version number to decide encoding.
- * VVVV is no smaller than the current year when a version is released.
- */
-#if __INTEL_LLVM_COMPILER < 1000000L
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER    % 10)
-#else
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER     % 100)
-#endif
-#if defined(_MSC_VER)
-  /* _MSC_VER = VVRR */
-# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-#endif
-#if defined(__GNUC__)
-# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-#elif defined(__GNUG__)
-# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
-#endif
-#if defined(__GNUC_MINOR__)
-# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-#endif
-#if defined(__GNUC_PATCHLEVEL__)
-# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-#endif
-
-#elif defined(__PATHCC__)
-# define COMPILER_ID "PathScale"
-# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
-# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
-# if defined(__PATHCC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
-# endif
-
-#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
-# define COMPILER_ID "Embarcadero"
-# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
-# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
-# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
-
-#elif defined(__BORLANDC__)
-# define COMPILER_ID "Borland"
-  /* __BORLANDC__ = 0xVRR */
-# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
-# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
-
-#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
-# define COMPILER_ID "Watcom"
-   /* __WATCOMC__ = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__WATCOMC__)
-# define COMPILER_ID "OpenWatcom"
-   /* __WATCOMC__ = VVRP + 1100 */
-# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__SUNPRO_CC)
-# define COMPILER_ID "SunPro"
-# if __SUNPRO_CC >= 0x5100
-   /* __SUNPRO_CC = 0xVRRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
-# else
-   /* __SUNPRO_CC = 0xVRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
-# endif
-
-#elif defined(__HP_aCC)
-# define COMPILER_ID "HP"
-  /* __HP_aCC = VVRRPP */
-# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
-# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__HP_aCC     % 100)
-
-#elif defined(__DECCXX)
-# define COMPILER_ID "Compaq"
-  /* __DECCXX_VER = VVRRTPPPP */
-# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
-# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000  % 100)
-# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER         % 10000)
-
-#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
-# define COMPILER_ID "zOS"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__open_xl__) && defined(__clang__)
-# define COMPILER_ID "IBMClang"
-# define COMPILER_VERSION_MAJOR DEC(__open_xl_version__)
-# define COMPILER_VERSION_MINOR DEC(__open_xl_release__)
-# define COMPILER_VERSION_PATCH DEC(__open_xl_modification__)
-# define COMPILER_VERSION_TWEAK DEC(__open_xl_ptf_fix_level__)
-
-
-#elif defined(__ibmxl__) && defined(__clang__)
-# define COMPILER_ID "XLClang"
-# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
-# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
-# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
-# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
-
-
-#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
-# define COMPILER_ID "XL"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
-# define COMPILER_ID "VisualAge"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__NVCOMPILER)
-# define COMPILER_ID "NVHPC"
-# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
-# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
-# if defined(__NVCOMPILER_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
-# endif
-
-#elif defined(__PGI)
-# define COMPILER_ID "PGI"
-# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
-# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
-# if defined(__PGIC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
-# endif
-
-#elif defined(__clang__) && defined(__cray__)
-# define COMPILER_ID "CrayClang"
-# define COMPILER_VERSION_MAJOR DEC(__cray_major__)
-# define COMPILER_VERSION_MINOR DEC(__cray_minor__)
-# define COMPILER_VERSION_PATCH DEC(__cray_patchlevel__)
-# define COMPILER_VERSION_INTERNAL_STR __clang_version__
-
-
-#elif defined(_CRAYC)
-# define COMPILER_ID "Cray"
-# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
-# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
-
-#elif defined(__TI_COMPILER_VERSION__)
-# define COMPILER_ID "TI"
-  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
-# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
-# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
-# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
-
-#elif defined(__CLANG_FUJITSU)
-# define COMPILER_ID "FujitsuClang"
-# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
-# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
-# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
-# define COMPILER_VERSION_INTERNAL_STR __clang_version__
-
-
-#elif defined(__FUJITSU)
-# define COMPILER_ID "Fujitsu"
-# if defined(__FCC_version__)
-#   define COMPILER_VERSION __FCC_version__
-# elif defined(__FCC_major__)
-#   define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
-#   define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
-#   define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
-# endif
-# if defined(__fcc_version)
-#   define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
-# elif defined(__FCC_VERSION)
-#   define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
-# endif
-
-
-#elif defined(__ghs__)
-# define COMPILER_ID "GHS"
-/* __GHS_VERSION_NUMBER = VVVVRP */
-# ifdef __GHS_VERSION_NUMBER
-# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
-# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER      % 10)
-# endif
-
-#elif defined(__TASKING__)
-# define COMPILER_ID "Tasking"
-  # define COMPILER_VERSION_MAJOR DEC(__VERSION__/1000)
-  # define COMPILER_VERSION_MINOR DEC(__VERSION__ % 100)
-# define COMPILER_VERSION_INTERNAL DEC(__VERSION__)
-
-#elif defined(__ORANGEC__)
-# define COMPILER_ID "OrangeC"
-# define COMPILER_VERSION_MAJOR DEC(__ORANGEC_MAJOR__)
-# define COMPILER_VERSION_MINOR DEC(__ORANGEC_MINOR__)
-# define COMPILER_VERSION_PATCH DEC(__ORANGEC_PATCHLEVEL__)
-
-#elif defined(__SCO_VERSION__)
-# define COMPILER_ID "SCO"
-
-#elif defined(__ARMCC_VERSION) && !defined(__clang__)
-# define COMPILER_ID "ARMCC"
-#if __ARMCC_VERSION >= 1000000
-  /* __ARMCC_VERSION = VRRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
-#else
-  /* __ARMCC_VERSION = VRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
-#endif
-
-
-#elif defined(__clang__) && defined(__apple_build_version__)
-# define COMPILER_ID "AppleClang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
-
-#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
-# define COMPILER_ID "ARMClang"
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION/100   % 100)
-# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
-
-#elif defined(__clang__) && defined(__ti__)
-# define COMPILER_ID "TIClang"
-  # define COMPILER_VERSION_MAJOR DEC(__ti_major__)
-  # define COMPILER_VERSION_MINOR DEC(__ti_minor__)
-  # define COMPILER_VERSION_PATCH DEC(__ti_patchlevel__)
-# define COMPILER_VERSION_INTERNAL DEC(__ti_version__)
-
-#elif defined(__clang__)
-# define COMPILER_ID "Clang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-
-#elif defined(__LCC__) && (defined(__GNUC__) || defined(__GNUG__) || defined(__MCST__))
-# define COMPILER_ID "LCC"
-# define COMPILER_VERSION_MAJOR DEC(__LCC__ / 100)
-# define COMPILER_VERSION_MINOR DEC(__LCC__ % 100)
-# if defined(__LCC_MINOR__)
-#  define COMPILER_VERSION_PATCH DEC(__LCC_MINOR__)
-# endif
-# if defined(__GNUC__) && defined(__GNUC_MINOR__)
-#  define SIMULATE_ID "GNU"
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-#  if defined(__GNUC_PATCHLEVEL__)
-#   define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-#  endif
-# endif
-
-#elif defined(__GNUC__) || defined(__GNUG__)
-# define COMPILER_ID "GNU"
-# if defined(__GNUC__)
-#  define COMPILER_VERSION_MAJOR DEC(__GNUC__)
-# else
-#  define COMPILER_VERSION_MAJOR DEC(__GNUG__)
-# endif
-# if defined(__GNUC_MINOR__)
-#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif defined(_MSC_VER)
-# define COMPILER_ID "MSVC"
-  /* _MSC_VER = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
-# if defined(_MSC_FULL_VER)
-#  if _MSC_VER >= 1400
-    /* _MSC_FULL_VER = VVRRPPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
-#  else
-    /* _MSC_FULL_VER = VVRRPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
-#  endif
-# endif
-# if defined(_MSC_BUILD)
-#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
-# endif
-
-#elif defined(_ADI_COMPILER)
-# define COMPILER_ID "ADSP"
-#if defined(__VERSIONNUM__)
-  /* __VERSIONNUM__ = 0xVVRRPPTT */
-#  define COMPILER_VERSION_MAJOR DEC(__VERSIONNUM__ >> 24 & 0xFF)
-#  define COMPILER_VERSION_MINOR DEC(__VERSIONNUM__ >> 16 & 0xFF)
-#  define COMPILER_VERSION_PATCH DEC(__VERSIONNUM__ >> 8 & 0xFF)
-#  define COMPILER_VERSION_TWEAK DEC(__VERSIONNUM__ & 0xFF)
-#endif
-
-#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
-# define COMPILER_ID "IAR"
-# if defined(__VER__) && defined(__ICCARM__)
-#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
-#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
-#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
-#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
-# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
-#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
-#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
-#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
-#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
-# endif
-
-
-/* These compilers are either not known or too old to define an
-  identification macro.  Try to identify the platform and guess that
-  it is the native compiler.  */
-#elif defined(__hpux) || defined(__hpua)
-# define COMPILER_ID "HP"
-
-#else /* unknown compiler */
-# define COMPILER_ID ""
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
-#ifdef SIMULATE_ID
-char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
-#endif
-
-#ifdef __QNXNTO__
-char const* qnxnto = "INFO" ":" "qnxnto[]";
-#endif
-
-#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
-char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
-#endif
-
-#define STRINGIFY_HELPER(X) #X
-#define STRINGIFY(X) STRINGIFY_HELPER(X)
-
-/* Identify known platforms by name.  */
-#if defined(__linux) || defined(__linux__) || defined(linux)
-# define PLATFORM_ID "Linux"
-
-#elif defined(__MSYS__)
-# define PLATFORM_ID "MSYS"
-
-#elif defined(__CYGWIN__)
-# define PLATFORM_ID "Cygwin"
-
-#elif defined(__MINGW32__)
-# define PLATFORM_ID "MinGW"
-
-#elif defined(__APPLE__)
-# define PLATFORM_ID "Darwin"
-
-#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
-# define PLATFORM_ID "Windows"
-
-#elif defined(__FreeBSD__) || defined(__FreeBSD)
-# define PLATFORM_ID "FreeBSD"
-
-#elif defined(__NetBSD__) || defined(__NetBSD)
-# define PLATFORM_ID "NetBSD"
-
-#elif defined(__OpenBSD__) || defined(__OPENBSD)
-# define PLATFORM_ID "OpenBSD"
-
-#elif defined(__sun) || defined(sun)
-# define PLATFORM_ID "SunOS"
-
-#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
-# define PLATFORM_ID "AIX"
-
-#elif defined(__hpux) || defined(__hpux__)
-# define PLATFORM_ID "HP-UX"
-
-#elif defined(__HAIKU__)
-# define PLATFORM_ID "Haiku"
-
-#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
-# define PLATFORM_ID "BeOS"
-
-#elif defined(__QNX__) || defined(__QNXNTO__)
-# define PLATFORM_ID "QNX"
-
-#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
-# define PLATFORM_ID "Tru64"
-
-#elif defined(__riscos) || defined(__riscos__)
-# define PLATFORM_ID "RISCos"
-
-#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
-# define PLATFORM_ID "SINIX"
-
-#elif defined(__UNIX_SV__)
-# define PLATFORM_ID "UNIX_SV"
-
-#elif defined(__bsdos__)
-# define PLATFORM_ID "BSDOS"
-
-#elif defined(_MPRAS) || defined(MPRAS)
-# define PLATFORM_ID "MP-RAS"
-
-#elif defined(__osf) || defined(__osf__)
-# define PLATFORM_ID "OSF1"
-
-#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
-# define PLATFORM_ID "SCO_SV"
-
-#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
-# define PLATFORM_ID "ULTRIX"
-
-#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
-# define PLATFORM_ID "Xenix"
-
-#elif defined(__WATCOMC__)
-# if defined(__LINUX__)
-#  define PLATFORM_ID "Linux"
-
-# elif defined(__DOS__)
-#  define PLATFORM_ID "DOS"
-
-# elif defined(__OS2__)
-#  define PLATFORM_ID "OS2"
-
-# elif defined(__WINDOWS__)
-#  define PLATFORM_ID "Windows3x"
-
-# elif defined(__VXWORKS__)
-#  define PLATFORM_ID "VxWorks"
-
-# else /* unknown platform */
-#  define PLATFORM_ID
-# endif
-
-#elif defined(__INTEGRITY)
-# if defined(INT_178B)
-#  define PLATFORM_ID "Integrity178"
-
-# else /* regular Integrity */
-#  define PLATFORM_ID "Integrity"
-# endif
-
-# elif defined(_ADI_COMPILER)
-#  define PLATFORM_ID "ADSP"
-
-#else /* unknown platform */
-# define PLATFORM_ID
-
-#endif
-
-/* For windows compilers MSVC and Intel we can determine
-   the architecture of the compiler being used.  This is because
-   the compilers do not have flags that can change the architecture,
-   but rather depend on which compiler is being used
-*/
-#if defined(_WIN32) && defined(_MSC_VER)
-# if defined(_M_IA64)
-#  define ARCHITECTURE_ID "IA64"
-
-# elif defined(_M_ARM64EC)
-#  define ARCHITECTURE_ID "ARM64EC"
-
-# elif defined(_M_X64) || defined(_M_AMD64)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# elif defined(_M_ARM64)
-#  define ARCHITECTURE_ID "ARM64"
-
-# elif defined(_M_ARM)
-#  if _M_ARM == 4
-#   define ARCHITECTURE_ID "ARMV4I"
-#  elif _M_ARM == 5
-#   define ARCHITECTURE_ID "ARMV5I"
-#  else
-#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
-#  endif
-
-# elif defined(_M_MIPS)
-#  define ARCHITECTURE_ID "MIPS"
-
-# elif defined(_M_SH)
-#  define ARCHITECTURE_ID "SHx"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__WATCOMC__)
-# if defined(_M_I86)
-#  define ARCHITECTURE_ID "I86"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
-# if defined(__ICCARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__ICCRX__)
-#  define ARCHITECTURE_ID "RX"
-
-# elif defined(__ICCRH850__)
-#  define ARCHITECTURE_ID "RH850"
-
-# elif defined(__ICCRL78__)
-#  define ARCHITECTURE_ID "RL78"
-
-# elif defined(__ICCRISCV__)
-#  define ARCHITECTURE_ID "RISCV"
-
-# elif defined(__ICCAVR__)
-#  define ARCHITECTURE_ID "AVR"
-
-# elif defined(__ICC430__)
-#  define ARCHITECTURE_ID "MSP430"
-
-# elif defined(__ICCV850__)
-#  define ARCHITECTURE_ID "V850"
-
-# elif defined(__ICC8051__)
-#  define ARCHITECTURE_ID "8051"
-
-# elif defined(__ICCSTM8__)
-#  define ARCHITECTURE_ID "STM8"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__ghs__)
-# if defined(__PPC64__)
-#  define ARCHITECTURE_ID "PPC64"
-
-# elif defined(__ppc__)
-#  define ARCHITECTURE_ID "PPC"
-
-# elif defined(__ARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__x86_64__)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(__i386__)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__clang__) && defined(__ti__)
-# if defined(__ARM_ARCH)
-#  define ARCHITECTURE_ID "ARM"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__TI_COMPILER_VERSION__)
-# if defined(__TI_ARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__MSP430__)
-#  define ARCHITECTURE_ID "MSP430"
-
-# elif defined(__TMS320C28XX__)
-#  define ARCHITECTURE_ID "TMS320C28x"
-
-# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
-#  define ARCHITECTURE_ID "TMS320C6x"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-# elif defined(__ADSPSHARC__)
-#  define ARCHITECTURE_ID "SHARC"
-
-# elif defined(__ADSPBLACKFIN__)
-#  define ARCHITECTURE_ID "Blackfin"
-
-#elif defined(__TASKING__)
-
-# if defined(__CTC__) || defined(__CPTC__)
-#  define ARCHITECTURE_ID "TriCore"
-
-# elif defined(__CMCS__)
-#  define ARCHITECTURE_ID "MCS"
-
-# elif defined(__CARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__CARC__)
-#  define ARCHITECTURE_ID "ARC"
-
-# elif defined(__C51__)
-#  define ARCHITECTURE_ID "8051"
-
-# elif defined(__CPCP__)
-#  define ARCHITECTURE_ID "PCP"
-
-# else
-#  define ARCHITECTURE_ID ""
-# endif
-
-#else
-#  define ARCHITECTURE_ID
-#endif
-
-/* Convert integer to decimal digit literals.  */
-#define DEC(n)                   \
-  ('0' + (((n) / 10000000)%10)), \
-  ('0' + (((n) / 1000000)%10)),  \
-  ('0' + (((n) / 100000)%10)),   \
-  ('0' + (((n) / 10000)%10)),    \
-  ('0' + (((n) / 1000)%10)),     \
-  ('0' + (((n) / 100)%10)),      \
-  ('0' + (((n) / 10)%10)),       \
-  ('0' +  ((n) % 10))
-
-/* Convert integer to hex digit literals.  */
-#define HEX(n)             \
-  ('0' + ((n)>>28 & 0xF)), \
-  ('0' + ((n)>>24 & 0xF)), \
-  ('0' + ((n)>>20 & 0xF)), \
-  ('0' + ((n)>>16 & 0xF)), \
-  ('0' + ((n)>>12 & 0xF)), \
-  ('0' + ((n)>>8  & 0xF)), \
-  ('0' + ((n)>>4  & 0xF)), \
-  ('0' + ((n)     & 0xF))
-
-/* Construct a string literal encoding the version number. */
-#ifdef COMPILER_VERSION
-char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
-
-/* Construct a string literal encoding the version number components. */
-#elif defined(COMPILER_VERSION_MAJOR)
-char const info_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
-  COMPILER_VERSION_MAJOR,
-# ifdef COMPILER_VERSION_MINOR
-  '.', COMPILER_VERSION_MINOR,
-#  ifdef COMPILER_VERSION_PATCH
-   '.', COMPILER_VERSION_PATCH,
-#   ifdef COMPILER_VERSION_TWEAK
-    '.', COMPILER_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct a string literal encoding the internal version number. */
-#ifdef COMPILER_VERSION_INTERNAL
-char const info_version_internal[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
-  'i','n','t','e','r','n','a','l','[',
-  COMPILER_VERSION_INTERNAL,']','\0'};
-#elif defined(COMPILER_VERSION_INTERNAL_STR)
-char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
-#endif
-
-/* Construct a string literal encoding the version number components. */
-#ifdef SIMULATE_VERSION_MAJOR
-char const info_simulate_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
-  SIMULATE_VERSION_MAJOR,
-# ifdef SIMULATE_VERSION_MINOR
-  '.', SIMULATE_VERSION_MINOR,
-#  ifdef SIMULATE_VERSION_PATCH
-   '.', SIMULATE_VERSION_PATCH,
-#   ifdef SIMULATE_VERSION_TWEAK
-    '.', SIMULATE_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
-char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
-
-
-
-#define CXX_STD_98 199711L
-#define CXX_STD_11 201103L
-#define CXX_STD_14 201402L
-#define CXX_STD_17 201703L
-#define CXX_STD_20 202002L
-#define CXX_STD_23 202302L
-
-#if defined(__INTEL_COMPILER) && defined(_MSVC_LANG)
-#  if _MSVC_LANG > CXX_STD_17
-#    define CXX_STD _MSVC_LANG
-#  elif _MSVC_LANG == CXX_STD_17 && defined(__cpp_aggregate_paren_init)
-#    define CXX_STD CXX_STD_20
-#  elif _MSVC_LANG > CXX_STD_14 && __cplusplus > CXX_STD_17
-#    define CXX_STD CXX_STD_20
-#  elif _MSVC_LANG > CXX_STD_14
-#    define CXX_STD CXX_STD_17
-#  elif defined(__INTEL_CXX11_MODE__) && defined(__cpp_aggregate_nsdmi)
-#    define CXX_STD CXX_STD_14
-#  elif defined(__INTEL_CXX11_MODE__)
-#    define CXX_STD CXX_STD_11
-#  else
-#    define CXX_STD CXX_STD_98
-#  endif
-#elif defined(_MSC_VER) && defined(_MSVC_LANG)
-#  if _MSVC_LANG > __cplusplus
-#    define CXX_STD _MSVC_LANG
-#  else
-#    define CXX_STD __cplusplus
-#  endif
-#elif defined(__NVCOMPILER)
-#  if __cplusplus == CXX_STD_17 && defined(__cpp_aggregate_paren_init)
-#    define CXX_STD CXX_STD_20
-#  else
-#    define CXX_STD __cplusplus
-#  endif
-#elif defined(__INTEL_COMPILER) || defined(__PGI)
-#  if __cplusplus == CXX_STD_11 && defined(__cpp_namespace_attributes)
-#    define CXX_STD CXX_STD_17
-#  elif __cplusplus == CXX_STD_11 && defined(__cpp_aggregate_nsdmi)
-#    define CXX_STD CXX_STD_14
-#  else
-#    define CXX_STD __cplusplus
-#  endif
-#elif (defined(__IBMCPP__) || defined(__ibmxl__)) && defined(__linux__)
-#  if __cplusplus == CXX_STD_11 && defined(__cpp_aggregate_nsdmi)
-#    define CXX_STD CXX_STD_14
-#  else
-#    define CXX_STD __cplusplus
-#  endif
-#elif __cplusplus == 1 && defined(__GXX_EXPERIMENTAL_CXX0X__)
-#  define CXX_STD CXX_STD_11
-#else
-#  define CXX_STD __cplusplus
-#endif
-
-const char* info_language_standard_default = "INFO" ":" "standard_default["
-#if CXX_STD > CXX_STD_23
-  "26"
-#elif CXX_STD > CXX_STD_20
-  "23"
-#elif CXX_STD > CXX_STD_17
-  "20"
-#elif CXX_STD > CXX_STD_14
-  "17"
-#elif CXX_STD > CXX_STD_11
-  "14"
-#elif CXX_STD >= CXX_STD_11
-  "11"
-#else
-  "98"
-#endif
-"]";
-
-const char* info_language_extensions_default = "INFO" ":" "extensions_default["
-#if (defined(__clang__) || defined(__GNUC__) || defined(__xlC__) ||           \
-     defined(__TI_COMPILER_VERSION__)) &&                                     \
-  !defined(__STRICT_ANSI__)
-  "ON"
-#else
-  "OFF"
-#endif
-"]";
-
-/*--------------------------------------------------------------------------*/
-
-int main(int argc, char* argv[])
-{
-  int require = 0;
-  require += info_compiler[argc];
-  require += info_platform[argc];
-  require += info_arch[argc];
-#ifdef COMPILER_VERSION_MAJOR
-  require += info_version[argc];
-#endif
-#ifdef COMPILER_VERSION_INTERNAL
-  require += info_version_internal[argc];
-#endif
-#ifdef SIMULATE_ID
-  require += info_simulate[argc];
-#endif
-#ifdef SIMULATE_VERSION_MAJOR
-  require += info_simulate_version[argc];
-#endif
-#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
-  require += info_cray[argc];
-#endif
-  require += info_language_standard_default[argc];
-  require += info_language_extensions_default[argc];
-  (void)argv;
-  return require;
-}
diff --git a/build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out b/build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out
deleted file mode 100755
index c8ced32cf082708045baa23211fbf858c298928d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16096
zcmeHOeQX>@6`woj!=X-macg3d(k!8=99nPAj^nz8kaO&_*T^4f;*@}ER%_qdcj7+G
z-X66pNQ2TsjBC`;3i?Npq6&ckRRRf$sMO%Js8y?i5($YQ0Wu#EK}uUAK4e1V<Gq>p
z*6ZaQ1oRIi_F3LH@Ap1t_RZ|x?C#9N$-eGrBqErq#0LdRiI_qXq&Ryw6@Vo~yVwlJ
zcZ*xa29VcDOz9JffmYF_=xSa~colH;YrsMUeyf6^21VRL<mk5+rLjRk%mtkX`mIL=
z$wB^Ws(?A`z4|nC2GZow<ByRabH5)pWwA-wFCJLU4a&=5;_Qc_JOy3ZLw6`5K2P;A
z=X_#L@V}k%8RT&a!#wDhCchx>B0uI>2h!2YZt6d&?=bnjuE{VW$nR3HV9xd32Y%GG
zWN~B0-F$@VTdN;plz--wUa>cu8EtFbn@u%kGx^d~(^Pv~Q(LQEEa)w=Vr-WN|2U?4
z295~`GmjXhQAAHFnd71E7Sf~r3)WM^-*Yd|tslBNKJntNUw+`kwO7yv+l@YGgM{&T
zh@gyRtP^ciK0X5_8r#4x+CRxjV2uO%)m6}S0;W~K%{B1+8u-nC@2U_-m?mU&%q+T=
z<C-}ulLusM$}-0@c`KWF$QG!^{I-dnzTQKfW{cjU@Au04T7}s=)NiJ2$DYU(UE3Mz
z@5~nR_K-E2wIS9-u8^nbrZTN)h#8E?Kh;wakg>fyUP{|Dn=tD*{t)}_nJ+<_qj1Ml
z#Md!jKiXD>FVXeQ_yPs2PAEO&EXM-4rYXCI0PYa31@O-i-Wb52AUqzxpC$a#K_Lmp
z4vqz;1s{%MjOmIG=dq2tMIVmimTAd{%lj=WLLO!y%s`ldFau!*!VH8N2s7|Mk%2$e
z-geD6b+y`<UH|jFLKu(EyV3Fm<J6C;Uy|)B?|%m1^6sy~v36%dpnZAwIgrL{cXkOW
zH^0$4bMa%w%x{cSzgs*!lx&`Fe$|*e@EQat*B8O`&*OUS&PQZCz|R9>%&mVO**!~c
zJyd-^mZ9oR<%QavC(-aF;$VM9+VB57vOUYj%%XAr&4b4Ir79!xvT<?Qy#)g7rU2FD
z1=TM0$M&8)&<|=+y7QQE>Od5W#>{26#+W^@0fZ}i%H{Hv6dYcbVIm{o>(!6`e|Qj-
zSU3iLGoQX{%#;>hNnXch8ngAU!IS!I@~ZKa5xG$NoTxoFA4y&Z{P{KTZ&t!pfVui-
zw?LYoTNm@9JW|OTqPvyw+2r*R=r(Ms>{G87v8f@283;2FW+2Q!n1L_@VFtnsgc%4k
z5N06E!2fdw@cY+|sCS@y@ZPaPZZea#oniPYIkMV%mEQcM?G!VG{BT@S^FCb_;$9&>
zBBaM;)^f)SPHwmlzpfH!Ib-QzD#Lfee9CfC@WF4~DrMc_=DSH_Pq}s;YbkoV!2#K-
z$d0P_H$wC9d(_Zd<?;i-Q^4`fg9{v9SBR0ta`|cC_$?MG^3V|xnTkbr)NHJN96pF4
zj%yAY!Tt_3=-Md1<lPR%R`_3hvs{+ImRR?eh7Z-=^kDT#ad7)R@7s4fenyo3Snnma
zLl6jKy72!4i2Dr$l3QY*jdpI{5IqYuBM?%UfiMGM2Eq)483;2FW+2Q!n1L_@VFupb
z4DfnIUZ2Qo0Oi9AR8_;((fY;BB>$AwIlhZzUI)2@WPXI%PBO2D#OEF)*8gR>TtNBT
zw3v|B2&VC&4G7mIB3&Z=JCrC+6TgXg1Mzy|%*aj5(>lbBq=-{R+>UlSaaimriR0Zy
zGTZ&VtlA6a5?Ur%EhdK#+$(zN36GcZ{1)ka{zfv#qwsGZ<MrYHWkg<=s%a_^uRG;+
zro66{*OB&gcHXNs9vdy?-I4|m`tXF`)K-#W%ZZj&J>I&9;2Sp#yJ4O9V>xJr{SpDq
zW7MG<8Q}WjO7_@qQL#l#(zqpap%H#IfbS!muLHL4g+fF$i1vg+uzg6l8ao0{_dKp8
z2!~I>Ki13F72~I&5D_;EzD^kbIut6k|D3dsiG-#sTNHx`mF+J89)XqIr{6<{K2|CI
zucSR(ErId!d+E2;TZhkKu1WiMde;%-F-S-q3qIZixaO0&cwFM!gh()=crV~FvCYdf
zYYzin7p)b1zhV4-vJb`?lkwSVg*$+6jcyY>u37Ui;!v~D6hfD&_=3c@iQxL{rwI?P
zr+xwO7>tudf+H*b0N`~n9uhR(<U1r#y-0ClWY7153lxXP8%O&E#o0smUHQ%kl(;_y
z&nsyE2E}g-#IK2Zr^=xvzXR}Hs}Lo00A3e`yKLZk=>dEz^p}=UcHDk(bj)#^^#ZKG
zw?;FjYfT6Mif(CqTptrFtMyGcXO7`|{UTVV3g$$%FluGZlv{9$rd65}_>M7ayLL*C
zSGK^N0vXeC9BbON^R6>3#vLnXo2gPRHw`X6$plMxm1$?c^>MrN`0-A9li8cn$0jF*
z`O&`SmP~%Uz;7-gPWO?H{-l{4=rUm+LDxqHI{JG%0ftwfX3`+7(RD<aJ$-|RI{M7P
z?(U<>A#<qXP+t-}g4-Mtyqn=)?O?D|mTL)lmJkI6wVeTk)q5MvRIy;D;q@r)d*~em
zt5ha$mWp;t$W!5Wt4hjR`H7M>VVnQ_-c&#y$%o(YLS>`HB2`SgG+?6zr9+1I0tR2v
z-eA|o>a8ALN^paR>?_q&eE%ziUYyRk)+lh-Q9RA1Odj@qObR_;aBY1eU(zR?!ldoE
z(>`dllz~k<nG``ChkBcEP)hT(RZI&#HJyhl6n7n^p%>Sy1QT?Qowd+G=s2W=KABYq
zeWCyb7ji0e9G75Oko~9IX&Q;?6!^2G{MC?D9$bdtRxUFJ&B5;1A^Spy-pIiauW)((
z+Yrvr;MU;1<qz(+<M|l}Mq59<7X+L`!R0S$t$k&r_U3skw?V=0AKYJt@74Xp_hZKJ
z_t@{x^8w}>8xjxte;Dw;!W@j-&+|^^TtCk{z55!)vw-8All^&K%KUM%!!}~>*q`T<
z8NhG~!~Q(aWqulTehTLQ6QIO7Cj0Zek~z=Ux&3U%`~>*poRwvsw=$1Y<-zuIo93W^
zIc0yIM>FSnG}j+I|1X0to)hc6-xd0O;pYc1kreE|uK?=z*T|1KiR8WVv&Hx`0slBD
zn6n)RV43;10{#h7F#lqp!`P4GeJ9}0^BU&-e8u*`^Z!2ibN+=!mc(Brkr}}(iXTD=
zo5=pJlL7O)JWEvw*8gLG{r*ej&-}@NKleYwKZ63SY4!F+@_d;0V+QS6X8v37t@Ziy
z{ClYhKp?hL(u&OZTcE(PM~@LJ^Iup$i!@LDhvOfK{kR{$1{j*KKR;K_??r1N67slm
zV1MRIpz`~B4sqqvzTzrN?8opj6cFS3dEVDf{y}>>9d;L003b%@9?t%EdWb5pzn}Bi
z@tdY8Am0b^I>u)eZV%u8HUY+M_xmUCV=B;nf#6)P(&C)6vi}+UVF9WMI0QuT55M$T
ASpWb4

diff --git a/build2/CMakeFiles/CMakeConfigureLog.yaml b/build2/CMakeFiles/CMakeConfigureLog.yaml
deleted file mode 100644
index 0c5487522..000000000
--- a/build2/CMakeFiles/CMakeConfigureLog.yaml
+++ /dev/null
@@ -1,294 +0,0 @@
-
----
-events:
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineSystem.cmake:205 (message)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      The system is: Linux - 6.11.0-1018-azure - x86_64
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerId.cmake:17 (message)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerId.cmake:64 (__determine_compiler_id_test)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCXXCompiler.cmake:126 (CMAKE_DETERMINE_COMPILER_ID)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      Compiling the CXX compiler identification source file "CMakeCXXCompilerId.cpp" succeeded.
-      Compiler: /usr/bin/c++ 
-      Build flags: 
-      Id flags:  
-      
-      The output was:
-      0
-      
-      
-      Compilation of the CXX compiler identification source "CMakeCXXCompilerId.cpp" produced "a.out"
-      
-      The CXX compiler identification is GNU, found in:
-        /home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/3.31.6/CompilerIdCXX/a.out
-      
-  -
-    kind: "try_compile-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:74 (try_compile)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
-      - "CMakeLists.txt:5 (project)"
-    checks:
-      - "Detecting CXX compiler ABI info"
-    directories:
-      source: "/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3"
-      binary: "/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3"
-    cmakeVariables:
-      CMAKE_CXX_FLAGS: ""
-      CMAKE_CXX_FLAGS_DEBUG: "-g"
-      CMAKE_CXX_SCAN_FOR_MODULES: "OFF"
-      CMAKE_EXE_LINKER_FLAGS: ""
-    buildResult:
-      variable: "CMAKE_CXX_ABI_COMPILED"
-      cached: true
-      stdout: |
-        Change Dir: '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3'
-        
-        Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_68918/fast
-        /usr/bin/gmake  -f CMakeFiles/cmTC_68918.dir/build.make CMakeFiles/cmTC_68918.dir/build
-        gmake[1]: Entering directory '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3'
-        Building CXX object CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o
-        /usr/bin/c++   -v -o CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -c /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp
-        Using built-in specs.
-        COLLECT_GCC=/usr/bin/c++
-        OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
-        OFFLOAD_TARGET_DEFAULT=1
-        Target: x86_64-linux-gnu
-        Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
-        Thread model: posix
-        Supported LTO compression algorithms: zlib zstd
-        gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) 
-        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/'
-         /usr/libexec/gcc/x86_64-linux-gnu/13/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -D_GNU_SOURCE /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp -quiet -dumpdir CMakeFiles/cmTC_68918.dir/ -dumpbase CMakeCXXCompilerABI.cpp.cpp -dumpbase-ext .cpp -mtune=generic -march=x86-64 -version -fasynchronous-unwind-tables -fstack-protector-strong -Wformat -Wformat-security -fstack-clash-protection -fcf-protection -o /tmp/ccqGcDxl.s
-        GNU C++17 (Ubuntu 13.3.0-6ubuntu2~24.04) version 13.3.0 (x86_64-linux-gnu)
-        	compiled by GNU C version 13.3.0, GMP version 6.3.0, MPFR version 4.2.1, MPC version 1.3.1, isl version isl-0.26-GMP
-        
-        GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
-        ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/13"
-        ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"
-        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed/x86_64-linux-gnu"
-        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed"
-        ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/../../../../x86_64-linux-gnu/include"
-        #include "..." search starts here:
-        #include <...> search starts here:
-         /usr/include/c++/13
-         /usr/include/x86_64-linux-gnu/c++/13
-         /usr/include/c++/13/backward
-         /usr/lib/gcc/x86_64-linux-gnu/13/include
-         /usr/local/include
-         /usr/include/x86_64-linux-gnu
-         /usr/include
-        End of search list.
-        Compiler executable checksum: c81c05345ce537099dafd5580045814a
-        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/'
-         as -v --64 -o CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o /tmp/ccqGcDxl.s
-        GNU assembler version 2.42 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.42
-        COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/
-        LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/
-        COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.'
-        Linking CXX executable cmTC_68918
-        /usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_68918.dir/link.txt --verbose=1
-        Using built-in specs.
-        COLLECT_GCC=/usr/bin/c++
-        COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper
-        OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
-        OFFLOAD_TARGET_DEFAULT=1
-        Target: x86_64-linux-gnu
-        Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
-        Thread model: posix
-        Supported LTO compression algorithms: zlib zstd
-        gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) 
-        COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/
-        LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/
-        COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_68918' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_68918.'
-         /usr/libexec/gcc/x86_64-linux-gnu/13/collect2 -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/ccE7OB0z.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_68918 /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o
-        collect2 version 13.3.0
-        /usr/bin/ld -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/ccE7OB0z.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_68918 /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o
-        GNU ld (GNU Binutils for Ubuntu) 2.42
-        COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_68918' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_68918.'
-        /usr/bin/c++  -v -Wl,-v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -o cmTC_68918
-        gmake[1]: Leaving directory '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3'
-        
-      exitCode: 0
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:182 (message)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      Parsed CXX implicit include dir info: rv=done
-        found start of include info
-        found start of implicit include info
-          add: [/usr/include/c++/13]
-          add: [/usr/include/x86_64-linux-gnu/c++/13]
-          add: [/usr/include/c++/13/backward]
-          add: [/usr/lib/gcc/x86_64-linux-gnu/13/include]
-          add: [/usr/local/include]
-          add: [/usr/include/x86_64-linux-gnu]
-          add: [/usr/include]
-        end of search list found
-        collapse include dir [/usr/include/c++/13] ==> [/usr/include/c++/13]
-        collapse include dir [/usr/include/x86_64-linux-gnu/c++/13] ==> [/usr/include/x86_64-linux-gnu/c++/13]
-        collapse include dir [/usr/include/c++/13/backward] ==> [/usr/include/c++/13/backward]
-        collapse include dir [/usr/lib/gcc/x86_64-linux-gnu/13/include] ==> [/usr/lib/gcc/x86_64-linux-gnu/13/include]
-        collapse include dir [/usr/local/include] ==> [/usr/local/include]
-        collapse include dir [/usr/include/x86_64-linux-gnu] ==> [/usr/include/x86_64-linux-gnu]
-        collapse include dir [/usr/include] ==> [/usr/include]
-        implicit include dirs: [/usr/include/c++/13;/usr/include/x86_64-linux-gnu/c++/13;/usr/include/c++/13/backward;/usr/lib/gcc/x86_64-linux-gnu/13/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include]
-      
-      
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:218 (message)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      Parsed CXX implicit link information:
-        link line regex: [^( *|.*[/\\])(ld[0-9]*(\\.[a-z]+)?|CMAKE_LINK_STARTFILE-NOTFOUND|([^/\\]+-)?ld|collect2)[^/\\]*( |$)]
-        linker tool regex: [^[ 	]*(->|")?[ 	]*(([^"]*[/\\])?(ld[0-9]*(\\.[a-z]+)?))("|,| |$)]
-        ignore line: [Change Dir: '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3']
-        ignore line: []
-        ignore line: [Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_68918/fast]
-        ignore line: [/usr/bin/gmake  -f CMakeFiles/cmTC_68918.dir/build.make CMakeFiles/cmTC_68918.dir/build]
-        ignore line: [gmake[1]: Entering directory '/home/runner/work/mscclpp/mscclpp/build2/CMakeFiles/CMakeScratch/TryCompile-NUgSX3']
-        ignore line: [Building CXX object CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o]
-        ignore line: [/usr/bin/c++   -v -o CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -c /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp]
-        ignore line: [Using built-in specs.]
-        ignore line: [COLLECT_GCC=/usr/bin/c++]
-        ignore line: [OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa]
-        ignore line: [OFFLOAD_TARGET_DEFAULT=1]
-        ignore line: [Target: x86_64-linux-gnu]
-        ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c ada c++ go d fortran objc obj-c++ m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32 m64 mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2]
-        ignore line: [Thread model: posix]
-        ignore line: [Supported LTO compression algorithms: zlib zstd]
-        ignore line: [gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) ]
-        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/']
-        ignore line: [ /usr/libexec/gcc/x86_64-linux-gnu/13/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -D_GNU_SOURCE /usr/local/share/cmake-3.31/Modules/CMakeCXXCompilerABI.cpp -quiet -dumpdir CMakeFiles/cmTC_68918.dir/ -dumpbase CMakeCXXCompilerABI.cpp.cpp -dumpbase-ext .cpp -mtune=generic -march=x86-64 -version -fasynchronous-unwind-tables -fstack-protector-strong -Wformat -Wformat-security -fstack-clash-protection -fcf-protection -o /tmp/ccqGcDxl.s]
-        ignore line: [GNU C++17 (Ubuntu 13.3.0-6ubuntu2~24.04) version 13.3.0 (x86_64-linux-gnu)]
-        ignore line: [	compiled by GNU C version 13.3.0  GMP version 6.3.0  MPFR version 4.2.1  MPC version 1.3.1  isl version isl-0.26-GMP]
-        ignore line: []
-        ignore line: [GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072]
-        ignore line: [ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/13"]
-        ignore line: [ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"]
-        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed/x86_64-linux-gnu"]
-        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/include-fixed"]
-        ignore line: [ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/13/../../../../x86_64-linux-gnu/include"]
-        ignore line: [#include "..." search starts here:]
-        ignore line: [#include <...> search starts here:]
-        ignore line: [ /usr/include/c++/13]
-        ignore line: [ /usr/include/x86_64-linux-gnu/c++/13]
-        ignore line: [ /usr/include/c++/13/backward]
-        ignore line: [ /usr/lib/gcc/x86_64-linux-gnu/13/include]
-        ignore line: [ /usr/local/include]
-        ignore line: [ /usr/include/x86_64-linux-gnu]
-        ignore line: [ /usr/include]
-        ignore line: [End of search list.]
-        ignore line: [Compiler executable checksum: c81c05345ce537099dafd5580045814a]
-        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/']
-        ignore line: [ as -v --64 -o CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o /tmp/ccqGcDxl.s]
-        ignore line: [GNU assembler version 2.42 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.42]
-        ignore line: [COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/]
-        ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/]
-        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o' '-c' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.']
-        ignore line: [Linking CXX executable cmTC_68918]
-        ignore line: [/usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_68918.dir/link.txt --verbose=1]
-        ignore line: [Using built-in specs.]
-        ignore line: [COLLECT_GCC=/usr/bin/c++]
-        ignore line: [COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper]
-        ignore line: [OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa]
-        ignore line: [OFFLOAD_TARGET_DEFAULT=1]
-        ignore line: [Target: x86_64-linux-gnu]
-        ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 13.3.0-6ubuntu2~24.04' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c ada c++ go d fortran objc obj-c++ m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-libstdcxx-backtrace --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32 m64 mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-nvptx/usr amdgcn-amdhsa=/build/gcc-13-fG75Ri/gcc-13-13.3.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2]
-        ignore line: [Thread model: posix]
-        ignore line: [Supported LTO compression algorithms: zlib zstd]
-        ignore line: [gcc version 13.3.0 (Ubuntu 13.3.0-6ubuntu2~24.04) ]
-        ignore line: [COMPILER_PATH=/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/13/:/usr/libexec/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/]
-        ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/13/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/13/../../../:/lib/:/usr/lib/]
-        ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_68918' '-shared-libgcc' '-mtune=generic' '-march=x86-64' '-dumpdir' 'cmTC_68918.']
-        link line: [ /usr/libexec/gcc/x86_64-linux-gnu/13/collect2 -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/ccE7OB0z.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_68918 /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
-          arg [/usr/libexec/gcc/x86_64-linux-gnu/13/collect2] ==> ignore
-          arg [-plugin] ==> ignore
-          arg [/usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so] ==> ignore
-          arg [-plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper] ==> ignore
-          arg [-plugin-opt=-fresolution=/tmp/ccE7OB0z.res] ==> ignore
-          arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
-          arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
-          arg [-plugin-opt=-pass-through=-lc] ==> ignore
-          arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
-          arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
-          arg [--build-id] ==> ignore
-          arg [--eh-frame-hdr] ==> ignore
-          arg [-m] ==> ignore
-          arg [elf_x86_64] ==> ignore
-          arg [--hash-style=gnu] ==> ignore
-          arg [--as-needed] ==> ignore
-          arg [-dynamic-linker] ==> ignore
-          arg [/lib64/ld-linux-x86-64.so.2] ==> ignore
-          arg [-pie] ==> ignore
-          arg [-znow] ==> ignore
-          arg [-zrelro] ==> ignore
-          arg [-o] ==> ignore
-          arg [cmTC_68918] ==> ignore
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o]
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o]
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o]
-          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13]
-          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu]
-          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib]
-          arg [-L/lib/x86_64-linux-gnu] ==> dir [/lib/x86_64-linux-gnu]
-          arg [-L/lib/../lib] ==> dir [/lib/../lib]
-          arg [-L/usr/lib/x86_64-linux-gnu] ==> dir [/usr/lib/x86_64-linux-gnu]
-          arg [-L/usr/lib/../lib] ==> dir [/usr/lib/../lib]
-          arg [-L/usr/lib/gcc/x86_64-linux-gnu/13/../../..] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../..]
-          arg [-v] ==> ignore
-          arg [CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o] ==> ignore
-          arg [-lstdc++] ==> lib [stdc++]
-          arg [-lm] ==> lib [m]
-          arg [-lgcc_s] ==> lib [gcc_s]
-          arg [-lgcc] ==> lib [gcc]
-          arg [-lc] ==> lib [c]
-          arg [-lgcc_s] ==> lib [gcc_s]
-          arg [-lgcc] ==> lib [gcc]
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o]
-          arg [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o] ==> obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
-        ignore line: [collect2 version 13.3.0]
-        ignore line: [/usr/bin/ld -plugin /usr/libexec/gcc/x86_64-linux-gnu/13/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-linux-gnu/13/lto-wrapper -plugin-opt=-fresolution=/tmp/ccE7OB0z.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie -z now -z relro -o cmTC_68918 /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/13 -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/13/../../.. -v CMakeFiles/cmTC_68918.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o]
-        linker tool for 'CXX': /usr/bin/ld
-        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/Scrt1.o] ==> [/usr/lib/x86_64-linux-gnu/Scrt1.o]
-        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crti.o] ==> [/usr/lib/x86_64-linux-gnu/crti.o]
-        collapse obj [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu/crtn.o] ==> [/usr/lib/x86_64-linux-gnu/crtn.o]
-        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13] ==> [/usr/lib/gcc/x86_64-linux-gnu/13]
-        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
-        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../../../lib] ==> [/usr/lib]
-        collapse library dir [/lib/x86_64-linux-gnu] ==> [/lib/x86_64-linux-gnu]
-        collapse library dir [/lib/../lib] ==> [/lib]
-        collapse library dir [/usr/lib/x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
-        collapse library dir [/usr/lib/../lib] ==> [/usr/lib]
-        collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/13/../../..] ==> [/usr/lib]
-        implicit libs: [stdc++;m;gcc_s;gcc;c;gcc_s;gcc]
-        implicit objs: [/usr/lib/x86_64-linux-gnu/Scrt1.o;/usr/lib/x86_64-linux-gnu/crti.o;/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o;/usr/lib/gcc/x86_64-linux-gnu/13/crtendS.o;/usr/lib/x86_64-linux-gnu/crtn.o]
-        implicit dirs: [/usr/lib/gcc/x86_64-linux-gnu/13;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib]
-        implicit fwks: []
-      
-      
-  -
-    kind: "message-v1"
-    backtrace:
-      - "/usr/local/share/cmake-3.31/Modules/Internal/CMakeDetermineLinkerId.cmake:40 (message)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeDetermineCompilerABI.cmake:255 (cmake_determine_linker_id)"
-      - "/usr/local/share/cmake-3.31/Modules/CMakeTestCXXCompiler.cmake:26 (CMAKE_DETERMINE_COMPILER_ABI)"
-      - "CMakeLists.txt:5 (project)"
-    message: |
-      Running the CXX compiler's linker: "/usr/bin/ld" "-v"
-      GNU ld (GNU Binutils for Ubuntu) 2.42
-...
diff --git a/build2/CMakeFiles/cmake.check_cache b/build2/CMakeFiles/cmake.check_cache
deleted file mode 100644
index 3dccd7317..000000000
--- a/build2/CMakeFiles/cmake.check_cache
+++ /dev/null
@@ -1 +0,0 @@
-# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/build2/include/mscclpp/version.hpp b/build2/include/mscclpp/version.hpp
deleted file mode 100644
index 0ec54ad62..000000000
--- a/build2/include/mscclpp/version.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef MSCCLPP_VERSION_HPP_
-#define MSCCLPP_VERSION_HPP_
-
-#define MSCCLPP_MAJOR 0
-#define MSCCLPP_MINOR 8
-#define MSCCLPP_PATCH 0
-#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
-#define MSCCLPP_GIT_COMMIT "305d15717edc"
-
-#endif  // MSCCLPP_VERSION_HPP_

From 50f6a24b6990d73e6113b051b23d177f43f5f700 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 03:16:00 +0000
Subject: [PATCH 019/132] Remove test/perf/ directory completely
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove test/perf/ directory (fifo_test.cu, framework.{cc,hpp}, CMakeLists.txt)
- Remove add_subdirectory(perf) from test/CMakeLists.txt
- Performance tests now integrated into unit_tests as fifo_perf_tests.cu
- Fix mp_unit_tests.cc to use framework functions without ::testing:: namespace
- Fix bootstrap_tests.cc ErrorCode comparison to use ASSERT_TRUE
- Fix switch_channel_tests.cu to not use streaming with ASSERT_EQ
- Add missing #include <unistd.h> to executor_tests.cc

All perf test functionality is now in unit_tests and can be filtered
with --exclude-perf-tests flag. The standalone test/perf/ directory
is no longer needed.

Verified builds:
- unit_tests: ✅
- mp_unit_tests: ✅

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/CMakeLists.txt                  |   3 -
 test/mp_unit/bootstrap_tests.cc      |   2 +-
 test/mp_unit/executor_tests.cc       |   1 +
 test/mp_unit/mp_unit_tests.cc        |   4 +-
 test/mp_unit/switch_channel_tests.cu |   5 +-
 test/perf/CMakeLists.txt             |  44 ----
 test/perf/fifo_test.cu               | 298 ---------------------------
 test/perf/framework.cc               |  88 --------
 test/perf/framework.hpp              |  32 ---
 test/unit/fifo_perf_tests.cu         |   4 +-
 10 files changed, 10 insertions(+), 471 deletions(-)
 delete mode 100644 test/perf/CMakeLists.txt
 delete mode 100644 test/perf/fifo_test.cu
 delete mode 100644 test/perf/framework.cc
 delete mode 100644 test/perf/framework.hpp

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 6b6941487..8c3c41499 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -59,6 +59,3 @@ add_test(NAME mp_unit_tests COMMAND ${CMAKE_CURRENT_BINARY_DIR}/run_mpi_test.sh
 
 # mscclpp-test
 add_subdirectory(mscclpp-test)
-
-# Performance tests
-add_subdirectory(perf)
diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc
index 4bbab2f18..56bcf78ff 100644
--- a/test/mp_unit/bootstrap_tests.cc
+++ b/test/mp_unit/bootstrap_tests.cc
@@ -99,7 +99,7 @@ TEST_F(BootstrapTest, TimeoutWithId) {
     // Set bootstrap timeout to 1 second
     bootstrap->initialize(id, 1);
   } catch (const mscclpp::Error& e) {
-    ASSERT_EQ(e.getErrorCode(), mscclpp::ErrorCode::Timeout);
+    ASSERT_TRUE(e.getErrorCode() == mscclpp::ErrorCode::Timeout);
   }
 
   // Timeout should be sligtly greater than 1 second
diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc
index a903ed08d..329d80814 100644
--- a/test/mp_unit/executor_tests.cc
+++ b/test/mp_unit/executor_tests.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.
 
 #include <mpi.h>
+#include <unistd.h>
 
 #include <filesystem>
 #include <mscclpp/env.hpp>
diff --git a/test/mp_unit/mp_unit_tests.cc b/test/mp_unit/mp_unit_tests.cc
index cafd9bbca..f610822e5 100644
--- a/test/mp_unit/mp_unit_tests.cc
+++ b/test/mp_unit/mp_unit_tests.cc
@@ -128,9 +128,9 @@ void MultiProcessTest::TearDown() {
 }
 
 int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
+  InitGoogleTest(&argc, argv);
   gEnv = new MultiProcessTestEnv(argc, (const char**)argv);
-  ::testing::AddGlobalTestEnvironment(gEnv);
+  AddGlobalTestEnvironment(gEnv);
   return RUN_ALL_TESTS();
 }
 
diff --git a/test/mp_unit/switch_channel_tests.cu b/test/mp_unit/switch_channel_tests.cu
index 44f4ebedd..c75a9b3a5 100644
--- a/test/mp_unit/switch_channel_tests.cu
+++ b/test/mp_unit/switch_channel_tests.cu
@@ -66,5 +66,8 @@ TEST_F(SwitchChannelTest, SimpleAllReduce) {
   for (int i = 0; i < numRanksToUse; i++) {
     expected += i + 1.0f;
   }
-  ASSERT_EQ(result, expected) << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank;
+  if (result != expected) {
+    std::cerr << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank << std::endl;
+  }
+  ASSERT_EQ(result, expected);
 }
diff --git a/test/perf/CMakeLists.txt b/test/perf/CMakeLists.txt
deleted file mode 100644
index caee29f07..000000000
--- a/test/perf/CMakeLists.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-# Find required packages
-find_package(MPI REQUIRED)
-
-# Note: nlohmann_json::nlohmann_json target is already available from the main project
-
-# Set up common libraries and includes for tests
-set(PERF_TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads MPI::MPI_CXX)
-if(MSCCLPP_USE_IB)
-    list(APPEND PERF_TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
-endif()
-
-set(PERF_TEST_INC_COMMON 
-    PRIVATE ${PROJECT_SOURCE_DIR}/include 
-    SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
-
-# Function to add a test executable
-function(add_perf_test_executable name sources)
-    if(MSCCLPP_USE_ROCM)
-        set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
-    endif()
-    add_executable(${name} ${sources})
-    target_link_libraries(${name} ${PERF_TEST_LIBS_COMMON} test_framework)
-    
-    # Link nlohmann_json - use the target from main project
-    target_link_libraries(${name} nlohmann_json::nlohmann_json)
-    
-    if(MSCCLPP_USE_IB)
-        target_compile_definitions(${name} PRIVATE USE_IBVERBS)
-    endif()
-    
-    target_include_directories(${name} ${PERF_TEST_INC_COMMON})
-    target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS)
-    
-    # Set C++ standard
-    target_compile_features(${name} PRIVATE cxx_std_17)
-
-    set_target_properties(${name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/perf")
-endfunction()
-
-# Add FIFO test  
-add_perf_test_executable(fifo_test "framework.cc;fifo_test.cu")
diff --git a/test/perf/fifo_test.cu b/test/perf/fifo_test.cu
deleted file mode 100644
index 3e6980eb9..000000000
--- a/test/perf/fifo_test.cu
+++ /dev/null
@@ -1,298 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include <getopt.h>
-
-#include <iostream>
-#include <map>
-#include <memory>
-#include <mscclpp/fifo.hpp>
-#include <mscclpp/gpu_utils.hpp>
-#include <mscclpp/numa.hpp>
-#include <sstream>
-#include <stdexcept>
-
-#include "framework.hpp"
-
-using namespace mscclpp::test;
-
-// Constants for timeout and trigger calculation
-constexpr uint64_t TIMEOUT_SPINS = 1000000;
-constexpr int MIN_TRIGGERS = 1000;
-constexpr int MIN_WARMUP_TRIGGERS = 100;
-constexpr int TRIGGERS_PER_FIFO_SIZE = 10;
-constexpr int WARMUP_TRIGGERS_PER_FIFO_SIZE = 2;
-
-__constant__ mscclpp::FifoDeviceHandle gFifoDeviceHandle;
-
-__global__ void kernelFifoPush(size_t numTriggers) {
-  mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle;
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  mscclpp::ProxyTrigger trigger;
-  for (size_t i = 1; i <= numTriggers; ++i) {
-    trigger.fst = i;
-    trigger.snd = tid ^ i;
-    fifo.push(trigger);
-  }
-}
-
-__global__ void kernelFifoPushSync(size_t numTriggers) {
-  mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle;
-  mscclpp::ProxyTrigger trigger;
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  for (size_t i = 1; i <= numTriggers; ++i) {
-    trigger.fst = i;
-    trigger.snd = tid ^ i;
-    fifo.sync(fifo.push(trigger));
-  }
-}
-
-static void setupCuda(int& cudaDevice, int& numaNode) {
-  CUDA_CHECK(cudaGetDevice(&cudaDevice));
-  numaNode = mscclpp::getDeviceNumaNode(cudaDevice);
-  mscclpp::numaBind(numaNode);
-}
-
-// Helper function to consume triggers from FIFO
-static bool consumeTriggers(std::unique_ptr<mscclpp::Fifo>& hostFifo, int numTriggers, int parallel) {
-  int totalTriggers = numTriggers * parallel;
-  std::unordered_map<int, int> triggerCounts;
-  for (int i = 0; i < totalTriggers; ++i) {
-    mscclpp::ProxyTrigger trigger;
-    uint64_t spin = 0;
-    do {
-      trigger = hostFifo->poll();
-      if (spin++ > TIMEOUT_SPINS) {
-        return false;
-      }
-    } while (trigger.fst == 0 || trigger.snd == 0);
-
-    // Process trigger (see src/proxy.cc)
-    trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
-    trigger.snd = trigger.snd ^ trigger.fst;
-    assert(triggerCounts[trigger.snd] + 1 == trigger.fst);
-    triggerCounts[trigger.snd]++;
-    hostFifo->pop();
-  }
-  return true;
-}
-
-// Helper function to run a single kernel variant and return performance metrics
-std::tuple<double, double, int, int> runSingleKernelVariant(void (*kernel)(size_t),
-                                                            std::unique_ptr<mscclpp::Fifo>& hostFifo,
-                                                            cudaStream_t stream, int numParallel) {
-  // Calculate triggers based on FIFO size
-  const int numTriggers = std::max(MIN_TRIGGERS, static_cast<int>(hostFifo->size() * TRIGGERS_PER_FIFO_SIZE));
-  const int warmupTriggers =
-      std::max(MIN_WARMUP_TRIGGERS, static_cast<int>(hostFifo->size() * WARMUP_TRIGGERS_PER_FIFO_SIZE));
-
-  // Warmup
-  kernel<<<numParallel, 1, 0, stream>>>(warmupTriggers);
-  CUDA_CHECK(cudaGetLastError());
-
-  // Process warmup triggers (note: total triggers = warmupTriggers * numParallel)
-  if (!consumeTriggers(hostFifo, warmupTriggers, numParallel)) {
-    return {0.0, 0.0, 0, 0};  // Return error values
-  }
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-
-  // Benchmark
-  utils::Timer timer;
-  timer.start();
-
-  kernel<<<numParallel, 1, 0, stream>>>(numTriggers);
-  CUDA_CHECK(cudaGetLastError());
-
-  // Process all triggers
-  if (!consumeTriggers(hostFifo, numTriggers, numParallel)) {
-    return {0.0, 0.0, 0, 0};
-  }
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-
-  timer.stop();
-
-  const int totalTriggers = numTriggers * numParallel;
-  double throughput = totalTriggers / timer.elapsedSeconds();
-  double duration_us = timer.elapsedMicroseconds();
-
-  CUDA_CHECK(cudaDeviceSynchronize());
-
-  return {throughput, duration_us, totalTriggers, warmupTriggers * numParallel};
-}
-
-void runFifoTestVariant(std::unique_ptr<mscclpp::Fifo>& hostFifo, cudaStream_t stream, int numParallel,
-                        nlohmann::ordered_json& combinedMetrics) {
-  auto [pushThroughput, pushDuration, numTriggers, warmupTriggers] =
-      runSingleKernelVariant(kernelFifoPush, hostFifo, stream, numParallel);
-
-  auto [syncThroughput, syncDuration, syncNumTriggers, syncWarmupTriggers] =
-      runSingleKernelVariant(kernelFifoPushSync, hostFifo, stream, numParallel);
-
-  auto formatThroughput = [](double thru) {
-    return double(int(thru * 10)) / 10.0;  // Round to 1 decimal place
-  };
-
-  std::string prefix = "p" + std::to_string(numParallel) + "_";
-  combinedMetrics[prefix + "push_throughput"] = formatThroughput(pushThroughput);
-  combinedMetrics[prefix + "push_sync_throughput"] = formatThroughput(syncThroughput);
-  combinedMetrics[prefix + "push_duration_us"] = pushDuration;
-  combinedMetrics[prefix + "push_sync_duration_us"] = syncDuration;
-  combinedMetrics[prefix + "num_triggers"] = numTriggers;
-  combinedMetrics[prefix + "warmup_triggers"] = warmupTriggers;
-}
-
-struct FifoTestConfig {
-  int fifoSize;
-  std::vector<int> parallelismLevels;
-
-  // Constructor with default parallelism levels
-  FifoTestConfig(int size, const std::vector<int>& parallel = {1, 2, 4, 8, 16})
-      : fifoSize(size), parallelismLevels(parallel) {}
-};
-
-void runFifoTest(const FifoTestConfig& config, [[maybe_unused]] int rank, [[maybe_unused]] int worldSize,
-                 [[maybe_unused]] int localRank) {
-  if (config.fifoSize <= 0) {
-    throw std::invalid_argument("FIFO size must be positive");
-  }
-  if (config.parallelismLevels.empty()) {
-    throw std::invalid_argument("At least one parallelism level must be specified");
-  }
-
-  int cudaDevice, numaNode;
-  setupCuda(cudaDevice, numaNode);
-
-  auto hostFifo = std::make_unique<mscclpp::Fifo>(config.fifoSize);
-
-  mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle();
-  CUDA_CHECK(cudaMemcpyToSymbol(gFifoDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle)));
-
-  cudaStream_t stream;
-  CUDA_CHECK(cudaStreamCreate(&stream));
-
-  // Create test name with parallelism range
-  std::string testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_Parallel";
-
-  // Add parallelism range to test name (e.g., "P1-16" or "P1-4-16-64")
-  if (!config.parallelismLevels.empty()) {
-    testName += std::to_string(config.parallelismLevels.front());
-    if (config.parallelismLevels.size() > 1) {
-      testName += "-" + std::to_string(config.parallelismLevels.back());
-
-      // If parallelism levels have non-standard steps, include more detail
-      if (config.parallelismLevels.size() > 2 &&
-          (config.parallelismLevels[1] != 2 * config.parallelismLevels[0] || config.parallelismLevels.size() > 3)) {
-        testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_ParallelCustom";
-      }
-    }
-  }
-
-  // Print test configuration
-  if (utils::isMainRank()) {
-    std::stringstream ss;
-    ss << "Running FIFO test with size=" << config.fifoSize << ", parallelism_levels=[";
-    for (size_t i = 0; i < config.parallelismLevels.size(); ++i) {
-      if (i > 0) ss << ",";
-      ss << config.parallelismLevels[i];
-    }
-    ss << "]";
-    std::cout << ss.str() << std::endl;
-  }
-
-  nlohmann::ordered_json combinedMetrics;
-
-  for (int numParallel : config.parallelismLevels) {
-    runFifoTestVariant(hostFifo, stream, numParallel, combinedMetrics);
-  }
-
-  std::map<std::string, std::string> testParams;
-  testParams["fifo_size"] = std::to_string(static_cast<int>(hostFifo->size()));
-
-  // Add parallelism levels to test parameters
-  std::stringstream parallelismStream;
-  for (size_t i = 0; i < config.parallelismLevels.size(); ++i) {
-    if (i > 0) parallelismStream << ",";
-    parallelismStream << config.parallelismLevels[i];
-  }
-  testParams["parallelism_levels"] = parallelismStream.str();
-
-  utils::recordResult(testName, "fifo", combinedMetrics, testParams);
-
-  CUDA_CHECK(cudaStreamDestroy(stream));
-}
-
-void runAllFifoTests([[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] int localRank) {
-  // clang-format off
-  std::vector<FifoTestConfig> configs = {
-      {1, {1}},
-      {128, {1, 8, 64, 128}},
-      {512, {1, 8, 64, 256, 512}},
-  };
-  // clang-format on
-
-  for (const auto& config : configs) {
-    runFifoTest(config, rank, worldSize, localRank);
-  }
-}
-
-static void printUsage(char* argv0) {
-  std::stringstream ss;
-  ss << "Usage: " << argv0 << " [OPTIONS]\n"
-     << "\n"
-     << "Options:\n"
-     << "  -o, --output-format FORMAT   Output format: human or json (default: human)\n"
-     << "  -f, --output-file FILE       JSON output file path (default: report.jsonl)\n"
-     << "  -v, --verbose                Increase verbosity\n"
-     << "  -h, --help                   Show this help message\n";
-  std::cout << ss.str();
-}
-
-int main(int argc, char* argv[]) {
-  std::string outputFormat = "human";
-  std::string outputFile = "report.jsonl";
-  bool verbose = false;
-
-  static struct option longOptions[] = {{"output-format", required_argument, 0, 'o'},
-                                        {"output-file", required_argument, 0, 'f'},
-                                        {"verbose", no_argument, 0, 'v'},
-                                        {"help", no_argument, 0, 'h'},
-                                        {0, 0, 0, 0}};
-
-  int c;
-  while ((c = getopt_long(argc, argv, "o:f:vh", longOptions, nullptr)) != -1) {
-    switch (c) {
-      case 'o':
-        outputFormat = optarg;
-        break;
-      case 'f':
-        outputFile = optarg;
-        break;
-      case 'v':
-        verbose = true;
-        break;
-      case 'h':
-        printUsage(argv[0]);
-        return 0;
-      default:
-        printUsage(argv[0]);
-        return 1;
-    }
-  }
-
-  std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>> tests = {
-      {"AllFifoTests", "FIFO performance tests with multiple configurations", runAllFifoTests}};
-
-  int result = utils::runMultipleTests(argc, argv, tests);
-
-  if (utils::isMainRank()) {
-    if (outputFormat == "json") {
-      utils::writeResultsToFile(outputFile);
-    } else {
-      utils::printResults(verbose);
-    }
-  }
-
-  utils::cleanupMPI();
-
-  return result;
-}
diff --git a/test/perf/framework.cc b/test/perf/framework.cc
deleted file mode 100644
index 680444604..000000000
--- a/test/perf/framework.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include "framework.hpp"
-
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-namespace mscclpp {
-namespace test {
-
-// Global state for performance test results
-static std::vector<TestResult> gPerfResults;
-
-namespace {
-std::string getCurrentTimestamp() {
-  auto now = std::chrono::system_clock::now();
-  auto time_t = std::chrono::system_clock::to_time_t(now);
-  std::stringstream ss;
-  ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%S");
-  return ss.str();
-}
-}  // namespace
-
-namespace utils {
-
-void recordResult(const std::string& testName, const std::string& testCategory, const nlohmann::ordered_json& metrics,
-                  const std::map<std::string, std::string>& testParams) {
-  TestResult result;
-  result.testName = testName;
-  result.testCategory = testCategory;
-  result.testParams = testParams;
-  result.metrics = metrics;
-  result.numProcesses = getMPISize();
-  result.processRank = getMPIRank();
-  result.timestamp = getCurrentTimestamp();
-
-  gPerfResults.push_back(result);
-}
-
-void writeResultsToFile(const std::string& filename) {
-  std::ofstream file(filename);
-  if (!file) {
-    throw std::runtime_error("Cannot open output file: " + filename);
-  }
-
-  for (const auto& result : gPerfResults) {
-    nlohmann::ordered_json j;
-    j["test_name"] = result.testName;
-    j["test_category"] = result.testCategory;
-    j["test_config"] = result.testParams;
-    j["metrics"] = result.metrics;
-    j["num_processes"] = result.numProcesses;
-    j["process_rank"] = result.processRank;
-    j["timestamp"] = result.timestamp;
-
-    file << j.dump() << std::endl;
-  }
-}
-
-void printResults(bool verbose) {
-  if (!isMainRank()) return;
-
-  std::cout << "\n=== Test Results ===" << std::endl;
-
-  for (const auto& result : gPerfResults) {
-    std::cout << "\nTest: " << result.testName << " (" << result.testCategory << ")" << std::endl;
-
-    if (verbose && !result.testParams.empty()) {
-      std::cout << "  Parameters:" << std::endl;
-      for (const auto& param : result.testParams) {
-        std::cout << "    " << param.first << ": " << param.second << std::endl;
-      }
-    }
-
-    std::cout << "  Metrics:" << std::endl;
-    for (auto it = result.metrics.begin(); it != result.metrics.end(); ++it) {
-      std::cout << "    " << it.key() << ": " << it.value() << std::endl;
-    }
-  }
-  std::cout << std::endl;
-}
-
-}  // namespace utils
-}  // namespace test
-}  // namespace mscclpp
diff --git a/test/perf/framework.hpp b/test/perf/framework.hpp
deleted file mode 100644
index ae1122b3d..000000000
--- a/test/perf/framework.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
-#define MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
-
-// This file is kept for backwards compatibility with perf tests
-// The actual framework is now in test/framework.hpp
-
-#include <nlohmann/json.hpp>
-
-#include "../framework.hpp"
-
-namespace mscclpp {
-namespace test {
-namespace utils {
-
-// Additional performance test utilities not in the base framework
-
-// Result recording for performance tests
-void recordResult(const std::string& testName, const std::string& testCategory, const nlohmann::ordered_json& metrics,
-                  const std::map<std::string, std::string>& testParams = {});
-
-// Output utilities for performance tests
-void writeResultsToFile(const std::string& filename);
-void printResults(bool verbose = false);
-
-}  // namespace utils
-}  // namespace test
-}  // namespace mscclpp
-
-#endif  // MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
diff --git a/test/unit/fifo_perf_tests.cu b/test/unit/fifo_perf_tests.cu
index 76aed8355..7b0e726ae 100644
--- a/test/unit/fifo_perf_tests.cu
+++ b/test/unit/fifo_perf_tests.cu
@@ -9,8 +9,8 @@
 #include <mscclpp/numa.hpp>
 
 // Simple FIFO performance test to be run as part of unit_tests
-// This is a simplified version of test/perf/fifo_test.cu that can be
-// integrated into the unit test suite and marked as a performance test.
+// This is a performance test that can be excluded from coverage runs
+// using the --exclude-perf-tests flag.
 
 constexpr uint64_t TIMEOUT_SPINS = 1000000;
 constexpr int MIN_TRIGGERS = 100;  // Reduced for faster unit test execution

From e26f8abbd4ab55f28589a9c3f65b5f60efa629ec Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 08:32:28 +0000
Subject: [PATCH 020/132] Address PR review comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Add missing includes to fifo_perf_tests.cu
   - Add #include <cassert>
   - Add #include <unordered_map>

2. Fix license header capitalization (4 files)
   - "license." → "License." in test/framework.{hpp,cc}
   - "license." → "License." in test/unit/{unit_tests_main.cc,fifo_perf_tests.cu}

3. Fix double MPI_Init issue
   - Check MPI_Initialized() before calling MPI_Init
   - Prevents double initialization when mp_unit_tests already inits MPI

4. Fix coverage flags for CUDA compilation
   - Use generator expressions to apply --coverage only to C++ language
   - Prevents breaking CUDA compilation with host-only flags

5. Fix environment memory leak
   - Delete environment objects after TearDown()
   - Clear environments_ vector

6. Implement proper GTEST_SKIP handling
   - Create SkipException class
   - Handle skipped tests separately from failures
   - Report skipped test count

7. Implement GTest-style filter pattern matching
   - Support wildcards (* and ?)
   - Support negative patterns (-Pattern)
   - Support colon-separated patterns (Foo:Bar)
   - Compatible with existing CI usage like --gtest_filter=-*Ib*

Verified builds successfully with Docker.

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 CMakeLists.txt               |   8 ++-
 test/framework.cc            | 114 +++++++++++++++++++++++++++++++++--
 test/framework.hpp           |  13 ++--
 test/unit/fifo_perf_tests.cu |   4 +-
 test/unit/unit_tests_main.cc |   2 +-
 5 files changed, 128 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b12ea8e43..738ec780b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -109,9 +109,11 @@ if(MSCCLPP_ENABLE_COVERAGE)
     if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
         message(STATUS "Code coverage enabled")
         
-        # Add coverage flags to all targets
-        add_compile_options(--coverage -O0 -g)
-        add_link_options(--coverage)
+        # Add coverage flags to C++ targets only (not CUDA)
+        add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
+        add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
+        add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
+        add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
         
         # Find lcov
         find_program(LCOV_PATH lcov)
diff --git a/test/framework.cc b/test/framework.cc
index cbfc2ffc6..3c08acd68 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -1,8 +1,9 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include "framework.hpp"
 
+#include <algorithm>
 #include <iomanip>
 #include <iostream>
 #include <sstream>
@@ -11,6 +12,86 @@
 namespace mscclpp {
 namespace test {
 
+// Helper function for wildcard pattern matching (supports * and ?)
+static bool matchPattern(const std::string& str, const std::string& pattern) {
+  size_t strIdx = 0;
+  size_t patIdx = 0;
+  size_t starIdx = std::string::npos;
+  size_t matchIdx = 0;
+
+  while (strIdx < str.length()) {
+    if (patIdx < pattern.length() && (pattern[patIdx] == '?' || pattern[patIdx] == str[strIdx])) {
+      strIdx++;
+      patIdx++;
+    } else if (patIdx < pattern.length() && pattern[patIdx] == '*') {
+      starIdx = patIdx;
+      matchIdx = strIdx;
+      patIdx++;
+    } else if (starIdx != std::string::npos) {
+      patIdx = starIdx + 1;
+      matchIdx++;
+      strIdx = matchIdx;
+    } else {
+      return false;
+    }
+  }
+
+  while (patIdx < pattern.length() && pattern[patIdx] == '*') {
+    patIdx++;
+  }
+
+  return patIdx == pattern.length();
+}
+
+// Helper function to check if test name matches GTest-style filter
+static bool matchesFilter(const std::string& testName, const std::string& filter) {
+  if (filter.empty()) return true;
+
+  // Split filter by ':' for multiple patterns
+  std::vector<std::string> patterns;
+  size_t start = 0;
+  size_t end = filter.find(':');
+  while (end != std::string::npos) {
+    patterns.push_back(filter.substr(start, end - start));
+    start = end + 1;
+    end = filter.find(':', start);
+  }
+  patterns.push_back(filter.substr(start));
+
+  // Check for positive patterns first
+  bool hasPositivePattern = false;
+  bool positiveMatch = false;
+  
+  for (const auto& pattern : patterns) {
+    if (pattern.empty()) continue;
+    
+    if (pattern[0] != '-') {
+      hasPositivePattern = true;
+      if (matchPattern(testName, pattern)) {
+        positiveMatch = true;
+      }
+    }
+  }
+
+  // If there are positive patterns and none matched, exclude
+  if (hasPositivePattern && !positiveMatch) {
+    return false;
+  }
+
+  // Check negative patterns
+  for (const auto& pattern : patterns) {
+    if (pattern.empty()) continue;
+    
+    if (pattern[0] == '-' && pattern.length() > 1) {
+      if (matchPattern(testName, pattern.substr(1))) {
+        return false;  // Negative match - exclude this test
+      }
+    }
+  }
+
+  return true;
+}
+
 // Global state
 static int gMpiRank = 0;
 static int gMpiSize = 1;
@@ -24,7 +105,12 @@ namespace utils {
 void initializeMPI(int argc, char* argv[]) {
   if (gMpiInitialized) return;
 
-  MPI_Init(&argc, &argv);
+  int initialized = 0;
+  MPI_Initialized(&initialized);
+  if (!initialized) {
+    MPI_Init(&argc, &argv);
+  }
+  
   MPI_Comm_rank(MPI_COMM_WORLD, &gMpiRank);
   MPI_Comm_size(MPI_COMM_WORLD, &gMpiSize);
   gMpiInitialized = true;
@@ -223,6 +309,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   int passed = 0;
   int failed = 0;
   int skipped = 0;
+  int skippedByFilter = 0;
 
   // Count tests to run
   int total_to_run = 0;
@@ -258,8 +345,8 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       continue;
     }
     
-    // Apply filter
-    if (!filter.empty() && full_name.find(filter) == std::string::npos) {
+    // Apply name filter with wildcard support
+    if (!matchesFilter(full_name, filter)) {
       continue;
     }
 
@@ -275,11 +362,19 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     UnitTest::GetInstance()->set_current_test_info(&current_info);
 
     TestCase* test_case = nullptr;
+    bool testSkipped = false;
     try {
       test_case = test_info.factory();
       test_case->SetUp();
       test_case->TestBody();
       test_case->TearDown();
+    } catch (const SkipException& e) {
+      // Test was skipped - count as skipped, not failed
+      gCurrentTestPassed = true;  // Skipped tests don't count as failures
+      testSkipped = true;
+      if (gMpiRank == 0) {
+        std::cout << "[  SKIPPED ] " << full_name << ": " << e.what() << std::endl;
+      }
     } catch (const std::exception& e) {
       gCurrentTestPassed = false;
       if (gCurrentTestFailureMessage.empty()) {
@@ -297,6 +392,12 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     // Clear current test info
     UnitTest::GetInstance()->set_current_test_info(nullptr);
 
+    // For skipped tests, handle specially
+    if (testSkipped) {
+      skipped++;
+      continue;  // Don't synchronize or count skipped tests
+    }
+
     // Synchronize test status across all MPI processes
     int local_passed = gCurrentTestPassed ? 1 : 0;
     int global_passed = 1;
@@ -322,6 +423,9 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     if (passed > 0) {
       std::cout << "[  PASSED  ] " << passed << " tests.\n";
     }
+    if (skipped > 0) {
+      std::cout << "[  SKIPPED ] " << skipped << " tests.\n";
+    }
     if (failed > 0) {
       std::cout << "[  FAILED  ] " << failed << " tests.\n";
     }
@@ -331,12 +435,14 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   for (auto it = environments_.rbegin(); it != environments_.rend(); ++it) {
     try {
       (*it)->TearDown();
+      delete *it;  // Clean up environment objects
     } catch (const std::exception& e) {
       if (gMpiRank == 0) {
         std::cerr << "Failed to tear down test environment: " << e.what() << std::endl;
       }
     }
   }
+  environments_.clear();
 
   return failed > 0 ? 1 : 0;
 }
diff --git a/test/framework.hpp b/test/framework.hpp
index c5e0dc8ba..174ca4f95 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #ifndef MSCCLPP_TEST_FRAMEWORK_HPP_
 #define MSCCLPP_TEST_FRAMEWORK_HPP_
@@ -149,6 +149,12 @@ void reportSuccess();
 
 }  // namespace utils
 
+// Custom exception for test skips
+class SkipException : public std::runtime_error {
+ public:
+  explicit SkipException(const std::string& message) : std::runtime_error(message) {}
+};
+
 // Helper class for FAIL functionality with message streaming support
 class FailHelper {
  public:
@@ -191,11 +197,10 @@ class SkipHelper {
   ~SkipHelper() noexcept(false) {
     std::string msg = message_.str();
     if (!msg.empty()) {
-      ::mscclpp::test::utils::reportFailure(file_, line_, "Test skipped: " + msg);
+      throw SkipException("Test skipped: " + msg);
     } else {
-      ::mscclpp::test::utils::reportFailure(file_, line_, "Test skipped");
+      throw SkipException("Test skipped");
     }
-    throw std::runtime_error("Test skipped");
   }
 
  private:
diff --git a/test/unit/fifo_perf_tests.cu b/test/unit/fifo_perf_tests.cu
index 7b0e726ae..9aab2643d 100644
--- a/test/unit/fifo_perf_tests.cu
+++ b/test/unit/fifo_perf_tests.cu
@@ -1,12 +1,14 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include "../framework.hpp"
 
+#include <cassert>
 #include <memory>
 #include <mscclpp/fifo.hpp>
 #include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
+#include <unordered_map>
 
 // Simple FIFO performance test to be run as part of unit_tests
 // This is a performance test that can be excluded from coverage runs
diff --git a/test/unit/unit_tests_main.cc b/test/unit/unit_tests_main.cc
index 743243d18..397566e05 100644
--- a/test/unit/unit_tests_main.cc
+++ b/test/unit/unit_tests_main.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include "../framework.hpp"
 

From 7003fec7636308a20dc638908a36eacede528321 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 08:36:24 +0000
Subject: [PATCH 021/132] Simplify filter matching to use substring matching

- Remove complex wildcard pattern matching (*, ?, negative patterns)
- Use simple substring matching with find()
- Simpler implementation, easier to understand and maintain
- Still supports --gtest_filter for basic test name filtering

Note: For advanced filtering like wildcards, users can use multiple
test runs with different substring filters.

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/framework.cc | 84 ++---------------------------------------------
 1 file changed, 2 insertions(+), 82 deletions(-)

diff --git a/test/framework.cc b/test/framework.cc
index 3c08acd68..ee18231f4 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -12,86 +12,6 @@
 namespace mscclpp {
 namespace test {
 
-// Helper function for wildcard pattern matching (supports * and ?)
-static bool matchPattern(const std::string& str, const std::string& pattern) {
-  size_t strIdx = 0;
-  size_t patIdx = 0;
-  size_t starIdx = std::string::npos;
-  size_t matchIdx = 0;
-
-  while (strIdx < str.length()) {
-    if (patIdx < pattern.length() && (pattern[patIdx] == '?' || pattern[patIdx] == str[strIdx])) {
-      strIdx++;
-      patIdx++;
-    } else if (patIdx < pattern.length() && pattern[patIdx] == '*') {
-      starIdx = patIdx;
-      matchIdx = strIdx;
-      patIdx++;
-    } else if (starIdx != std::string::npos) {
-      patIdx = starIdx + 1;
-      matchIdx++;
-      strIdx = matchIdx;
-    } else {
-      return false;
-    }
-  }
-
-  while (patIdx < pattern.length() && pattern[patIdx] == '*') {
-    patIdx++;
-  }
-
-  return patIdx == pattern.length();
-}
-
-// Helper function to check if test name matches GTest-style filter
-static bool matchesFilter(const std::string& testName, const std::string& filter) {
-  if (filter.empty()) return true;
-
-  // Split filter by ':' for multiple patterns
-  std::vector<std::string> patterns;
-  size_t start = 0;
-  size_t end = filter.find(':');
-  while (end != std::string::npos) {
-    patterns.push_back(filter.substr(start, end - start));
-    start = end + 1;
-    end = filter.find(':', start);
-  }
-  patterns.push_back(filter.substr(start));
-
-  // Check for positive patterns first
-  bool hasPositivePattern = false;
-  bool positiveMatch = false;
-  
-  for (const auto& pattern : patterns) {
-    if (pattern.empty()) continue;
-    
-    if (pattern[0] != '-') {
-      hasPositivePattern = true;
-      if (matchPattern(testName, pattern)) {
-        positiveMatch = true;
-      }
-    }
-  }
-
-  // If there are positive patterns and none matched, exclude
-  if (hasPositivePattern && !positiveMatch) {
-    return false;
-  }
-
-  // Check negative patterns
-  for (const auto& pattern : patterns) {
-    if (pattern.empty()) continue;
-    
-    if (pattern[0] == '-' && pattern.length() > 1) {
-      if (matchPattern(testName, pattern.substr(1))) {
-        return false;  // Negative match - exclude this test
-      }
-    }
-  }
-
-  return true;
-}
-
 // Global state
 static int gMpiRank = 0;
 static int gMpiSize = 1;
@@ -345,8 +265,8 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       continue;
     }
     
-    // Apply name filter with wildcard support
-    if (!matchesFilter(full_name, filter)) {
+    // Apply simple substring filter
+    if (!filter.empty() && full_name.find(filter) == std::string::npos) {
       continue;
     }
 

From 30b98911809f4720bbebac61cb3c55bb5a21e416 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 18:35:33 -0800
Subject: [PATCH 022/132] simplifying

---
 test/CMakeLists.txt                  |   2 +-
 test/framework.cc                    | 204 +++++++++------------------
 test/framework.hpp                   | 147 +++++--------------
 test/mp_unit/executor_tests.cc       |   2 +-
 test/mp_unit/ib_tests.cu             |   2 +-
 test/mp_unit/memory_channel_tests.cu |   2 +-
 test/mp_unit/mp_unit_tests.cc        |  11 +-
 test/mp_unit/port_channel_tests.cu   |  24 ++--
 test/mp_unit/switch_channel_tests.cu |   4 +-
 9 files changed, 125 insertions(+), 273 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8c3c41499..288550854 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -41,7 +41,7 @@ include(CTest)
 # Build test framework library
 add_library(test_framework STATIC framework.cc)
 target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${TEST_INC_COMMON})
-target_link_libraries(test_framework PUBLIC MPI::MPI_CXX nlohmann_json::nlohmann_json)
+target_link_libraries(test_framework PUBLIC MPI::MPI_CXX)
 
 # Unit tests
 add_executable(unit_tests)
diff --git a/test/framework.cc b/test/framework.cc
index ee18231f4..c75c90fc7 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -18,10 +18,12 @@ static int gMpiSize = 1;
 static bool gMpiInitialized = false;
 static bool gCurrentTestPassed = true;
 static std::string gCurrentTestFailureMessage;
+static std::string gCurrentTestName;
+
+std::string currentTestName() { return gCurrentTestName; }
 
 namespace utils {
 
-// Internal MPI helper functions (not exposed in header)
 void initializeMPI(int argc, char* argv[]) {
   if (gMpiInitialized) return;
 
@@ -30,7 +32,7 @@ void initializeMPI(int argc, char* argv[]) {
   if (!initialized) {
     MPI_Init(&argc, &argv);
   }
-  
+
   MPI_Comm_rank(MPI_COMM_WORLD, &gMpiRank);
   MPI_Comm_size(MPI_COMM_WORLD, &gMpiSize);
   gMpiInitialized = true;
@@ -43,9 +45,6 @@ static void finalizeMPI() {
   gMpiInitialized = false;
 }
 
-static bool isMainProcess() { return gMpiRank == 0; }
-
-// Public utility functions for test output
 bool isMainRank() { return gMpiRank == 0; }
 
 int getMPIRank() { return gMpiRank; }
@@ -103,93 +102,35 @@ void cudaCheck(cudaError_t err, const char* file, int line) {
   }
 }
 
-int runMultipleTests(
-    int argc, char* argv[],
-    const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests) {
-  int totalResult = 0;
-
-  // Initialize MPI once for all tests
-  initializeMPI(argc, argv);
-
-  try {
-    // Get MPI information
-    int rank = getMPIRank();
-    int size = getMPISize();
-    int local_rank = rank;  // For simplicity, assume local_rank = rank
-
-    for (const auto& test : tests) {
-      const std::string& testName = std::get<0>(test);
-      const std::string& testDescription = std::get<1>(test);
-      const std::function<void(int, int, int)>& testFunction = std::get<2>(test);
-
-      if (rank == 0) {
-        std::cout << "Running test: " << testName << std::endl;
-        if (!testDescription.empty()) {
-          std::cout << "  " << testDescription << std::endl;
-        }
-      }
-
-      // Don't clear results - accumulate them for all tests in the same file
-      // g_results.clear();  // Commented out to accumulate results
-
-      try {
-        // Run the individual test function with MPI information
-        testFunction(rank, size, local_rank);
-
-        // Synchronize before moving to next test
-        MPI_Barrier(MPI_COMM_WORLD);
-
-      } catch (const std::exception& e) {
-        if (rank == 0) {
-          std::cerr << "Error in test " << testName << ": " << e.what() << std::endl;
-        }
-        totalResult = 1;
-      }
-    }
-
-    // Don't cleanup MPI here - let the caller handle it
-    // finalizeMPI();
-
-  } catch (const std::exception& e) {
-    if (gMpiRank == 0) {
-      std::cerr << "Error: " << e.what() << std::endl;
-    }
-    finalizeMPI();
-    return 1;
-  }
-
-  return totalResult;
-}
-
 }  // namespace utils
 
-// UnitTest implementation
-UnitTest* UnitTest::GetInstance() {
-  static UnitTest instance;
-  return &instance;
-}
-
 // TestRegistry implementation
 TestRegistry& TestRegistry::instance() {
   static TestRegistry registry;
   return registry;
 }
 
-void TestRegistry::registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory,
+void TestRegistry::registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory,
                                 bool isPerfTest) {
-  TestInfoInternal info;
-  info.suiteName = test_suite;
-  info.testName = test_name;
-  info.factory = factory;
-  info.isPerfTest = isPerfTest;
-  tests_.push_back(info);
+  tests_.push_back({suiteName, testName, std::move(factory), isPerfTest});
 }
 
-void TestRegistry::addGlobalTestEnvironment(Environment* env) { environments_.push_back(env); }
-
-void TestRegistry::initGoogleTest(int* argc, char** argv) {
-  // Parse command-line arguments if needed
-  // For now, this is a no-op placeholder for compatibility
+void TestRegistry::addEnvironment(Environment* env) { environments_.push_back(env); }
+
+// Returns true if the test should run given the filter string.
+// Filter syntax:
+//   ""          -> run all
+//   "Pattern"   -> run only tests whose full name contains Pattern
+//   "-Pattern"  -> run all tests EXCEPT those whose full name contains Pattern
+static bool matchesFilter(const std::string& fullName, const std::string& filter) {
+  if (filter.empty()) return true;
+  if (filter[0] == '-') {
+    // Negative filter: exclude matching tests
+    std::string pattern = filter.substr(1);
+    return fullName.find(pattern) == std::string::npos;
+  }
+  // Positive filter: include only matching tests
+  return fullName.find(filter) != std::string::npos;
 }
 
 int TestRegistry::runAllTests(int argc, char* argv[]) {
@@ -199,14 +140,14 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   }
 
   // Parse command line arguments
-  std::string filter = "";
+  std::string filter;
   bool excludePerfTests = false;
-  
+
   for (int i = 1; i < argc; ++i) {
     std::string arg = argv[i];
-    if (arg.find("--gtest_filter=") == 0) {
-      filter = arg.substr(15);  // Length of "--gtest_filter="
-    } else if (arg == "--gtest_filter" && i + 1 < argc) {
+    if (arg.find("--filter=") == 0) {
+      filter = arg.substr(9);  // Length of "--filter="
+    } else if (arg == "--filter" && i + 1 < argc) {
       filter = argv[i + 1];
       ++i;
     } else if (arg == "--exclude-perf-tests") {
@@ -229,71 +170,57 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   int passed = 0;
   int failed = 0;
   int skipped = 0;
-  int skippedByFilter = 0;
 
   // Count tests to run
-  int total_to_run = 0;
-  for (const auto& test_info : tests_) {
-    std::string full_name = test_info.suiteName + "." + test_info.testName;
-    
-    // Skip performance tests if requested
-    if (excludePerfTests && test_info.isPerfTest) {
-      skipped++;
+  int totalToRun = 0;
+  int skippedByFilter = 0;
+  for (const auto& entry : tests_) {
+    std::string fullName = entry.suiteName + "." + entry.testName;
+    if (excludePerfTests && entry.isPerfTest) {
+      skippedByFilter++;
       continue;
     }
-    
-    if (!filter.empty() && full_name.find(filter) == std::string::npos) {
-      skipped++;
+    if (!matchesFilter(fullName, filter)) {
+      skippedByFilter++;
       continue;
     }
-    total_to_run++;
+    totalToRun++;
   }
 
   if (gMpiRank == 0) {
-    std::cout << "[==========] Running " << total_to_run << " tests";
-    if (skipped > 0) {
-      std::cout << " (" << skipped << " skipped)";
+    std::cout << "[==========] Running " << totalToRun << " tests";
+    if (skippedByFilter > 0) {
+      std::cout << " (" << skippedByFilter << " skipped by filter)";
     }
     std::cout << ".\n";
   }
 
-  for (const auto& test_info : tests_) {
-    std::string full_name = test_info.suiteName + "." + test_info.testName;
+  for (const auto& entry : tests_) {
+    std::string fullName = entry.suiteName + "." + entry.testName;
 
-    // Skip performance tests if requested
-    if (excludePerfTests && test_info.isPerfTest) {
-      continue;
-    }
-    
-    // Apply simple substring filter
-    if (!filter.empty() && full_name.find(filter) == std::string::npos) {
-      continue;
-    }
+    if (excludePerfTests && entry.isPerfTest) continue;
+    if (!matchesFilter(fullName, filter)) continue;
 
     gCurrentTestPassed = true;
     gCurrentTestFailureMessage.clear();
+    gCurrentTestName = fullName;
 
     if (gMpiRank == 0) {
-      std::cout << "[ RUN      ] " << full_name << std::endl;
+      std::cout << "[ RUN      ] " << fullName << std::endl;
     }
 
-    // Set current test info for UnitTest::GetInstance()->current_test_info()
-    TestInfo current_info(test_info.suiteName, test_info.testName);
-    UnitTest::GetInstance()->set_current_test_info(&current_info);
-
-    TestCase* test_case = nullptr;
+    TestCase* testCase = nullptr;
     bool testSkipped = false;
     try {
-      test_case = test_info.factory();
-      test_case->SetUp();
-      test_case->TestBody();
-      test_case->TearDown();
+      testCase = entry.factory();
+      testCase->SetUp();
+      testCase->TestBody();
+      testCase->TearDown();
     } catch (const SkipException& e) {
-      // Test was skipped - count as skipped, not failed
-      gCurrentTestPassed = true;  // Skipped tests don't count as failures
+      gCurrentTestPassed = true;
       testSkipped = true;
       if (gMpiRank == 0) {
-        std::cout << "[  SKIPPED ] " << full_name << ": " << e.what() << std::endl;
+        std::cout << "[  SKIPPED ] " << fullName << ": " << e.what() << std::endl;
       }
     } catch (const std::exception& e) {
       gCurrentTestPassed = false;
@@ -307,39 +234,36 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       }
     }
 
-    delete test_case;
+    delete testCase;
+    gCurrentTestName.clear();
 
-    // Clear current test info
-    UnitTest::GetInstance()->set_current_test_info(nullptr);
-
-    // For skipped tests, handle specially
     if (testSkipped) {
       skipped++;
-      continue;  // Don't synchronize or count skipped tests
+      continue;
     }
 
     // Synchronize test status across all MPI processes
-    int local_passed = gCurrentTestPassed ? 1 : 0;
-    int global_passed = 1;
+    int localPassed = gCurrentTestPassed ? 1 : 0;
+    int globalPassed = 1;
     if (gMpiInitialized) {
-      MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&localPassed, &globalPassed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
     } else {
-      global_passed = local_passed;
+      globalPassed = localPassed;
     }
 
     if (gMpiRank == 0) {
-      if (global_passed) {
-        std::cout << "[       OK ] " << full_name << std::endl;
+      if (globalPassed) {
+        std::cout << "[       OK ] " << fullName << std::endl;
         passed++;
       } else {
-        std::cout << "[  FAILED  ] " << full_name << std::endl;
+        std::cout << "[  FAILED  ] " << fullName << std::endl;
         failed++;
       }
     }
   }
 
   if (gMpiRank == 0) {
-    std::cout << "[==========] " << total_to_run << " tests ran.\n";
+    std::cout << "[==========] " << totalToRun << " tests ran.\n";
     if (passed > 0) {
       std::cout << "[  PASSED  ] " << passed << " tests.\n";
     }
@@ -355,7 +279,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   for (auto it = environments_.rbegin(); it != environments_.rend(); ++it) {
     try {
       (*it)->TearDown();
-      delete *it;  // Clean up environment objects
+      delete *it;
     } catch (const std::exception& e) {
       if (gMpiRank == 0) {
         std::cerr << "Failed to tear down test environment: " << e.what() << std::endl;
diff --git a/test/framework.hpp b/test/framework.hpp
index 174ca4f95..bcd84cf9e 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -7,40 +7,18 @@
 #include <mpi.h>
 
 #include <chrono>
-#include <fstream>
 #include <functional>
+#include <iomanip>
 #include <iostream>
-#include <map>
 #include <mscclpp/gpu.hpp>
-#include <nlohmann/json.hpp>
 #include <sstream>
 #include <stdexcept>
 #include <string>
-#include <tuple>
 #include <vector>
 
 namespace mscclpp {
 namespace test {
 
-// Test result structure
-struct TestResult {
-  std::string testName;
-  std::string testCategory;
-  std::map<std::string, std::string> testParams;
-  nlohmann::ordered_json metrics;
-  int numProcesses;
-  int processRank;
-  std::string timestamp;
-  bool passed;
-  std::string failureMessage;
-};
-
-// Forward declarations
-class Environment;
-class TestCase;
-class TestInfo;
-class UnitTest;
-
 // Test case base class
 class TestCase {
  public:
@@ -58,32 +36,6 @@ class Environment {
   virtual void TearDown() {}
 };
 
-// Test info class (for getting current test information)
-class TestInfo {
- public:
-  TestInfo(const std::string& suite, const std::string& name) : testSuiteName_(suite), testName_(name) {}
-
-  const char* test_suite_name() const { return testSuiteName_.c_str(); }
-  const char* name() const { return testName_.c_str(); }
-
- private:
-  std::string testSuiteName_;
-  std::string testName_;
-};
-
-// UnitTest singleton (for getting test information)
-class UnitTest {
- public:
-  static UnitTest* GetInstance();
-
-  const TestInfo* current_test_info() const { return currentTestInfo_; }
-  void set_current_test_info(const TestInfo* info) { currentTestInfo_ = info; }
-
- private:
-  UnitTest() = default;
-  const TestInfo* currentTestInfo_ = nullptr;
-};
-
 // Test registry and runner
 class TestRegistry {
  public:
@@ -91,30 +43,28 @@ class TestRegistry {
 
   static TestRegistry& instance();
 
-  void registerTest(const std::string& test_suite, const std::string& test_name, TestFactory factory, bool isPerfTest = false);
-  void addGlobalTestEnvironment(Environment* env);
+  void registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory,
+                    bool isPerfTest = false);
+  void addEnvironment(Environment* env);
   int runAllTests(int argc, char* argv[]);
-  void initGoogleTest(int* argc, char** argv);
 
  private:
   TestRegistry() = default;
-  struct TestInfoInternal {
+  struct TestEntry {
     std::string suiteName;
     std::string testName;
     TestFactory factory;
     bool isPerfTest;
   };
-  std::vector<TestInfoInternal> tests_;
+  std::vector<TestEntry> tests_;
   std::vector<Environment*> environments_;
 };
 
-// Simple utility functions for testing
-namespace utils {
+// Returns "Suite.Name" for the currently running test, or "" if none.
+std::string currentTestName();
 
-// Test execution utilities (for performance tests)
-int runMultipleTests(
-    int argc, char* argv[],
-    const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests);
+// Utility functions
+namespace utils {
 
 // MPI management
 void initializeMPI(int argc, char* argv[]);
@@ -149,13 +99,13 @@ void reportSuccess();
 
 }  // namespace utils
 
-// Custom exception for test skips
+// Exception for test skips
 class SkipException : public std::runtime_error {
  public:
   explicit SkipException(const std::string& message) : std::runtime_error(message) {}
 };
 
-// Helper class for FAIL functionality with message streaming support
+// Helper class for FAIL() macro — supports message streaming via operator<<
 class FailHelper {
  public:
   explicit FailHelper(const char* file, int line) : file_(file), line_(line) {}
@@ -180,12 +130,8 @@ class FailHelper {
   std::ostringstream message_;
 };
 
-// Helper class for GTEST_SKIP functionality
-// This class uses RAII (Resource Acquisition Is Initialization) pattern:
-// - The constructor records file and line information
-// - The stream operator (<<) allows appending a skip message
-// - The destructor throws an exception to skip the test
-// This enables usage like: GTEST_SKIP() << "Reason for skipping";
+// Helper class for SKIP_TEST() macro — supports message streaming via operator<<
+// Usage: SKIP_TEST() << "Reason for skipping";
 class SkipHelper {
  public:
   explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {}
@@ -212,18 +158,17 @@ class SkipHelper {
 }  // namespace test
 }  // namespace mscclpp
 
-// Test registration macros
+// --- Test registration macros ---
+
 #define TEST(test_suite, test_name)                                                            \
   class test_suite##_##test_name##_Test : public ::mscclpp::test::TestCase {                   \
    public:                                                                                     \
-    test_suite##_##test_name##_Test() {}                                                       \
     void TestBody() override;                                                                  \
   };                                                                                           \
   static bool test_suite##_##test_name##_registered = []() {                                   \
     ::mscclpp::test::TestRegistry::instance().registerTest(                                    \
         #test_suite, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }, \
-        false);                                                                                \
+        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }); \
     return true;                                                                               \
   }();                                                                                         \
   void test_suite##_##test_name##_Test::TestBody()
@@ -231,50 +176,45 @@ class SkipHelper {
 #define TEST_F(test_fixture, test_name)                                                          \
   class test_fixture##_##test_name##_Test : public test_fixture {                                \
    public:                                                                                       \
-    test_fixture##_##test_name##_Test() {}                                                       \
     void TestBody() override;                                                                    \
   };                                                                                             \
   static bool test_fixture##_##test_name##_registered = []() {                                   \
     ::mscclpp::test::TestRegistry::instance().registerTest(                                      \
         #test_fixture, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, \
-        false);                                                                                  \
+        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }); \
     return true;                                                                                 \
   }();                                                                                           \
   void test_fixture##_##test_name##_Test::TestBody()
 
-// Performance test registration macros
 #define PERF_TEST(test_suite, test_name)                                                       \
   class test_suite##_##test_name##_Test : public ::mscclpp::test::TestCase {                   \
    public:                                                                                     \
-    test_suite##_##test_name##_Test() {}                                                       \
     void TestBody() override;                                                                  \
   };                                                                                           \
   static bool test_suite##_##test_name##_registered = []() {                                   \
     ::mscclpp::test::TestRegistry::instance().registerTest(                                    \
         #test_suite, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }, \
+        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); },  \
         true);                                                                                 \
     return true;                                                                               \
   }();                                                                                         \
   void test_suite##_##test_name##_Test::TestBody()
 
-#define PERF_TEST_F(test_fixture, test_name)                                           \
-  class test_fixture##_##test_name##_Test : public test_fixture {                     \
-   public:                                                                             \
-    test_fixture##_##test_name##_Test() {}                                            \
-    void TestBody() override;                                                          \
-  };                                                                                   \
-  static bool test_fixture##_##test_name##_registered = []() {                        \
-    ::mscclpp::test::TestRegistry::instance().registerTest(                           \
-        #test_fixture, #test_name,                                                     \
-        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, \
-        true);                                                                         \
-    return true;                                                                       \
-  }();                                                                                 \
+#define PERF_TEST_F(test_fixture, test_name)                                                     \
+  class test_fixture##_##test_name##_Test : public test_fixture {                                \
+   public:                                                                                       \
+    void TestBody() override;                                                                    \
+  };                                                                                             \
+  static bool test_fixture##_##test_name##_registered = []() {                                   \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                                      \
+        #test_fixture, #test_name,                                                               \
+        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); },  \
+        true);                                                                                   \
+    return true;                                                                                 \
+  }();                                                                                           \
   void test_fixture##_##test_name##_Test::TestBody()
 
-// Test runner macro
+// --- Test runner macro ---
 #define RUN_ALL_TESTS() ::mscclpp::test::TestRegistry::instance().runAllTests(argc, argv)
 
 // Assertion macros
@@ -462,25 +402,12 @@ class SkipHelper {
     }                                                                                                      \
   } while (0)
 
-// Test fail macro - throws exception to fail test execution
-// Usage: FAIL() << "Optional fail message";
-#define FAIL() ::mscclpp::test::FailHelper(__FILE__, __LINE__)
+// --- Test control macros ---
 
-// Test skip macro - throws exception to skip test execution
-// Usage: GTEST_SKIP() << "Optional skip message";
-#define GTEST_SKIP() ::mscclpp::test::SkipHelper(__FILE__, __LINE__)
-
-// Create a namespace alias for compatibility with GTest code
-namespace testing = ::mscclpp::test;
-
-// Helper functions for compatibility with GTest API
-inline void InitGoogleTest(int* argc, char** argv) {
-  ::mscclpp::test::TestRegistry::instance().initGoogleTest(argc, argv);
-}
+// Fail the current test immediately. Usage: FAIL() << "reason";
+#define FAIL() ::mscclpp::test::FailHelper(__FILE__, __LINE__)
 
-inline ::mscclpp::test::Environment* AddGlobalTestEnvironment(::mscclpp::test::Environment* env) {
-  ::mscclpp::test::TestRegistry::instance().addGlobalTestEnvironment(env);
-  return env;
-}
+// Skip the current test. Usage: SKIP_TEST() << "reason";
+#define SKIP_TEST() ::mscclpp::test::SkipHelper(__FILE__, __LINE__)
 
 #endif  // MSCCLPP_TEST_FRAMEWORK_HPP_
diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc
index 329d80814..82fa53a83 100644
--- a/test/mp_unit/executor_tests.cc
+++ b/test/mp_unit/executor_tests.cc
@@ -23,7 +23,7 @@ std::string getExecutablePath() {
 
 void ExecutorTest::SetUp() {
   if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) {
-    GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2";
+    SKIP_TEST() << "This test requires world size to be 2 and ranks per node to be 2";
   }
   MultiProcessTest::SetUp();
 
diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu
index 051030ac8..963e80d20 100644
--- a/test/mp_unit/ib_tests.cu
+++ b/test/mp_unit/ib_tests.cu
@@ -19,7 +19,7 @@ void IbTestBase::SetUp() {
 
 void IbPeerToPeerTest::SetUp() {
 #if !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 
   IbTestBase::SetUp();
diff --git a/test/mp_unit/memory_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu
index f6ef3aedc..cb5610946 100644
--- a/test/mp_unit/memory_channel_tests.cu
+++ b/test/mp_unit/memory_channel_tests.cu
@@ -8,7 +8,7 @@
 void MemoryChannelOneToOneTest::SetUp() {
   // Need at least two ranks within a node
   if (gEnv->nRanksPerNode < 2) {
-    GTEST_SKIP();
+    SKIP_TEST();
   }
   // Use only two ranks
   setNumRanksToUse(2);
diff --git a/test/mp_unit/mp_unit_tests.cc b/test/mp_unit/mp_unit_tests.cc
index f610822e5..5782930e0 100644
--- a/test/mp_unit/mp_unit_tests.cc
+++ b/test/mp_unit/mp_unit_tests.cc
@@ -98,14 +98,18 @@ static std::unordered_map<std::string, std::string> parseArgs(int argc, const ch
       continue;
     }
 
-    // Unrecognized positional token: ignore to keep parser permissive for gtest/MPI extras
+    // Unrecognized positional token: ignore
   }
 
   return options;
 }
 
 void MultiProcessTestEnv::SetUp() {
-  MPI_Init(NULL, NULL);
+  int initialized = 0;
+  MPI_Initialized(&initialized);
+  if (!initialized) {
+    MPI_Init(NULL, NULL);
+  }
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
   // get the local number of nodes with MPI
@@ -128,9 +132,8 @@ void MultiProcessTest::TearDown() {
 }
 
 int main(int argc, char** argv) {
-  InitGoogleTest(&argc, argv);
   gEnv = new MultiProcessTestEnv(argc, (const char**)argv);
-  AddGlobalTestEnvironment(gEnv);
+  ::mscclpp::test::TestRegistry::instance().addEnvironment(gEnv);
   return RUN_ALL_TESTS();
 }
 
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index 7cc5954a8..3d5c00412 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -223,8 +223,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
 
   std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
 
-  auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
-  const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name());
+  const std::string testName = ::mscclpp::test::currentTestName();
   const int nTries = 1000;
 
   // Warm-up
@@ -257,7 +256,7 @@ TEST_F(PortChannelOneToOneTest, PingPongIbHostMode) {
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
 
@@ -276,7 +275,7 @@ TEST_F(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Host});
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
 
@@ -290,7 +289,7 @@ TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
 
@@ -299,7 +298,7 @@ TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
 
@@ -471,8 +470,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
 
   proxyService->startProxy();
 
-  auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
-  const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name());
+  const std::string testName = ::mscclpp::test::currentTestName();
   const int nTries = 1000000;
 
   // Warm-up
@@ -503,7 +501,7 @@ TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
 #if defined(USE_IBVERBS)
   testPacketPingPong(true, IbMode::Host);
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
 
@@ -513,7 +511,7 @@ TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
 #if defined(USE_IBVERBS)
   testPacketPingPongPerf(true, IbMode::Host);
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
 
@@ -521,7 +519,7 @@ TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
 #if defined(USE_IBVERBS)
   testPacketPingPongPerf(true, IbMode::HostNoAtomic);
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
 
@@ -530,7 +528,7 @@ TEST_F(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
 
@@ -538,6 +536,6 @@ TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
 #if defined(USE_IBVERBS)
   testPacketPingPong(true, IbMode::HostNoAtomic);
 #else   // !defined(USE_IBVERBS)
-  GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
+  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
 #endif  // !defined(USE_IBVERBS)
 }
diff --git a/test/mp_unit/switch_channel_tests.cu b/test/mp_unit/switch_channel_tests.cu
index c75a9b3a5..16152c5c7 100644
--- a/test/mp_unit/switch_channel_tests.cu
+++ b/test/mp_unit/switch_channel_tests.cu
@@ -10,10 +10,10 @@
 void SwitchChannelTest::SetUp() {
   // Need at least two ranks within a node
   if (gEnv->nRanksPerNode < 2) {
-    GTEST_SKIP();
+    SKIP_TEST();
   }
   if (!mscclpp::isNvlsSupported()) {
-    GTEST_SKIP();
+    SKIP_TEST();
   }
   // Use only two ranks
   setNumRanksToUse(2);

From b6ce0f2ede73a48f22048b06b38a317a89c03bf4 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 19:16:21 -0800
Subject: [PATCH 023/132] simplify

---
 test/framework.cc                    |  15 ++-
 test/framework.hpp                   |  94 +++++++++---------
 test/mp_unit/bootstrap_tests.cc      |  14 +--
 test/mp_unit/communicator_tests.cu   |   6 +-
 test/mp_unit/executor_tests.cc       |   2 +-
 test/mp_unit/ib_tests.cu             |  10 +-
 test/mp_unit/memory_channel_tests.cu |  91 +++++-------------
 test/mp_unit/mp_unit_tests.cc        |   6 +-
 test/mp_unit/mp_unit_tests.hpp       |   7 ++
 test/mp_unit/port_channel_tests.cu   | 137 ++++++++-------------------
 test/mp_unit/switch_channel_tests.cu |   2 +-
 test/unit/core_tests.cc              |   4 +-
 test/unit/fifo_perf_tests.cu         |   6 +-
 13 files changed, 146 insertions(+), 248 deletions(-)

diff --git a/test/framework.cc b/test/framework.cc
index c75c90fc7..392bc770f 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -125,9 +125,18 @@ void TestRegistry::addEnvironment(Environment* env) { environments_.push_back(en
 static bool matchesFilter(const std::string& fullName, const std::string& filter) {
   if (filter.empty()) return true;
   if (filter[0] == '-') {
-    // Negative filter: exclude matching tests
-    std::string pattern = filter.substr(1);
-    return fullName.find(pattern) == std::string::npos;
+    // Negative filter: exclude tests matching any comma-separated pattern
+    std::string patterns = filter.substr(1);
+    size_t pos = 0;
+    while (pos < patterns.size()) {
+      size_t comma = patterns.find(',', pos);
+      std::string pattern = (comma == std::string::npos) ? patterns.substr(pos) : patterns.substr(pos, comma - pos);
+      if (!pattern.empty() && fullName.find(pattern) != std::string::npos) {
+        return false;
+      }
+      pos = (comma == std::string::npos) ? patterns.size() : comma + 1;
+    }
+    return true;
   }
   // Positive filter: include only matching tests
   return fullName.find(filter) != std::string::npos;
diff --git a/test/framework.hpp b/test/framework.hpp
index bcd84cf9e..26a32d5bc 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -155,63 +155,55 @@ class SkipHelper {
   std::ostringstream message_;
 };
 
+// SFINAE helper: resolves to T if T is a complete type (user-defined fixture),
+// otherwise falls back to TestCase. This lets TEST() work with or without a fixture class.
+namespace detail {
+template <typename...>
+using void_t = void;
+
+template <typename T, typename = void_t<>>
+struct FixtureOf {
+  using type = TestCase;
+};
+template <typename T>
+struct FixtureOf<T, void_t<decltype(sizeof(T))>> {
+  using type = T;
+};
+}  // namespace detail
+
 }  // namespace test
 }  // namespace mscclpp
 
 // --- Test registration macros ---
-
-#define TEST(test_suite, test_name)                                                            \
-  class test_suite##_##test_name##_Test : public ::mscclpp::test::TestCase {                   \
-   public:                                                                                     \
-    void TestBody() override;                                                                  \
-  };                                                                                           \
-  static bool test_suite##_##test_name##_registered = []() {                                   \
-    ::mscclpp::test::TestRegistry::instance().registerTest(                                    \
-        #test_suite, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); }); \
-    return true;                                                                               \
-  }();                                                                                         \
-  void test_suite##_##test_name##_Test::TestBody()
-
-#define TEST_F(test_fixture, test_name)                                                          \
-  class test_fixture##_##test_name##_Test : public test_fixture {                                \
-   public:                                                                                       \
-    void TestBody() override;                                                                    \
-  };                                                                                             \
-  static bool test_fixture##_##test_name##_registered = []() {                                   \
-    ::mscclpp::test::TestRegistry::instance().registerTest(                                      \
-        #test_fixture, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }); \
-    return true;                                                                                 \
-  }();                                                                                           \
+// TEST(Suite, Name): if Suite is a previously-defined class, the test inherits from it (fixture).
+// Otherwise, the test inherits from TestCase (no fixture needed).
+
+#define TEST(test_fixture, test_name)                                                                       \
+  class test_fixture;                                                                                       \
+  class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf<test_fixture>::type { \
+   public:                                                                                                  \
+    void TestBody() override;                                                                               \
+  };                                                                                                        \
+  static bool test_fixture##_##test_name##_registered = []() {                                              \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                                                 \
+        #test_fixture, #test_name,                                                                          \
+        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); });            \
+    return true;                                                                                            \
+  }();                                                                                                      \
   void test_fixture##_##test_name##_Test::TestBody()
 
-#define PERF_TEST(test_suite, test_name)                                                       \
-  class test_suite##_##test_name##_Test : public ::mscclpp::test::TestCase {                   \
-   public:                                                                                     \
-    void TestBody() override;                                                                  \
-  };                                                                                           \
-  static bool test_suite##_##test_name##_registered = []() {                                   \
-    ::mscclpp::test::TestRegistry::instance().registerTest(                                    \
-        #test_suite, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_suite##_##test_name##_Test(); },  \
-        true);                                                                                 \
-    return true;                                                                               \
-  }();                                                                                         \
-  void test_suite##_##test_name##_Test::TestBody()
-
-#define PERF_TEST_F(test_fixture, test_name)                                                     \
-  class test_fixture##_##test_name##_Test : public test_fixture {                                \
-   public:                                                                                       \
-    void TestBody() override;                                                                    \
-  };                                                                                             \
-  static bool test_fixture##_##test_name##_registered = []() {                                   \
-    ::mscclpp::test::TestRegistry::instance().registerTest(                                      \
-        #test_fixture, #test_name,                                                               \
-        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); },  \
-        true);                                                                                   \
-    return true;                                                                                 \
-  }();                                                                                           \
+#define PERF_TEST(test_fixture, test_name)                                                                  \
+  class test_fixture;                                                                                       \
+  class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf<test_fixture>::type { \
+   public:                                                                                                  \
+    void TestBody() override;                                                                               \
+  };                                                                                                        \
+  static bool test_fixture##_##test_name##_registered = []() {                                              \
+    ::mscclpp::test::TestRegistry::instance().registerTest(                                                 \
+        #test_fixture, #test_name,                                                                          \
+        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, true);      \
+    return true;                                                                                            \
+  }();                                                                                                      \
   void test_fixture##_##test_name##_Test::TestBody()
 
 // --- Test runner macro ---
diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc
index 56bcf78ff..f22e4c3df 100644
--- a/test/mp_unit/bootstrap_tests.cc
+++ b/test/mp_unit/bootstrap_tests.cc
@@ -48,7 +48,7 @@ void BootstrapTest::bootstrapTestAll(std::shared_ptr<mscclpp::Bootstrap> bootstr
   bootstrapTestSendRecv(bootstrap);
 }
 
-TEST_F(BootstrapTest, WithId) {
+TEST(BootstrapTest, WithId) {
   auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
   mscclpp::UniqueId id;
   if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId();
@@ -57,13 +57,13 @@ TEST_F(BootstrapTest, WithId) {
   bootstrapTestAll(bootstrap);
 }
 
-TEST_F(BootstrapTest, WithIpPortPair) {
+TEST(BootstrapTest, WithIpPortPair) {
   auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
   bootstrap->initialize(gEnv->args["ip_port"]);
   bootstrapTestAll(bootstrap);
 }
 
-TEST_F(BootstrapTest, ResumeWithId) {
+TEST(BootstrapTest, ResumeWithId) {
   // This test may take a few minutes.
   bootstrapTestTimer.set(300);
 
@@ -76,19 +76,19 @@ TEST_F(BootstrapTest, ResumeWithId) {
   }
 }
 
-TEST_F(BootstrapTest, ResumeWithIpPortPair) {
+TEST(BootstrapTest, ResumeWithIpPortPair) {
   for (int i = 0; i < 5; ++i) {
     auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
     bootstrap->initialize(gEnv->args["ip_port"]);
   }
 }
 
-TEST_F(BootstrapTest, ExitBeforeConnect) {
+TEST(BootstrapTest, ExitBeforeConnect) {
   auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
   bootstrap->createUniqueId();
 }
 
-TEST_F(BootstrapTest, TimeoutWithId) {
+TEST(BootstrapTest, TimeoutWithId) {
   mscclpp::Timer timer;
 
   // All ranks initialize a bootstrap with their own id (will hang)
@@ -139,7 +139,7 @@ class MPIBootstrap : public mscclpp::Bootstrap {
   }
 };
 
-TEST_F(BootstrapTest, MPIBootstrap) {
+TEST(BootstrapTest, MPIBootstrap) {
   auto bootstrap = std::make_shared<MPIBootstrap>();
   bootstrapTestAll(bootstrap);
 }
diff --git a/test/mp_unit/communicator_tests.cu b/test/mp_unit/communicator_tests.cu
index 9d83532a1..79cbd17be 100644
--- a/test/mp_unit/communicator_tests.cu
+++ b/test/mp_unit/communicator_tests.cu
@@ -185,7 +185,7 @@ bool CommunicatorTest::testWriteCorrectness(bool skipLocal) {
   return true;
 }
 
-TEST_F(CommunicatorTest, BasicWrite) {
+TEST(CommunicatorTest, BasicWrite) {
   if (gEnv->rank >= numRanksToUse) return;
 
   deviceBufferInit();
@@ -215,7 +215,7 @@ __global__ void kernelWaitSemaphores(mscclpp::Host2DeviceSemaphore::DeviceHandle
   }
 }
 
-TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) {
+TEST(CommunicatorTest, WriteWithDeviceSemaphores) {
   if (gEnv->rank >= numRanksToUse) return;
 
   std::unordered_map<int, std::shared_ptr<mscclpp::Host2DeviceSemaphore>> semaphores;
@@ -254,7 +254,7 @@ TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) {
   communicator->bootstrap()->barrier();
 }
 
-TEST_F(CommunicatorTest, WriteWithHostSemaphores) {
+TEST(CommunicatorTest, WriteWithHostSemaphores) {
   if (gEnv->rank >= numRanksToUse) return;
 
   std::unordered_map<int, std::shared_ptr<mscclpp::Host2HostSemaphore>> semaphores;
diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc
index 82fa53a83..7af5cb0d0 100644
--- a/test/mp_unit/executor_tests.cc
+++ b/test/mp_unit/executor_tests.cc
@@ -50,7 +50,7 @@ void ExecutorTest::TearDown() {
   MultiProcessTest::TearDown();
 }
 
-TEST_F(ExecutorTest, TwoNodesAllreduce) {
+TEST(ExecutorTest, TwoNodesAllreduce) {
   std::string executablePath = getExecutablePath();
   std::filesystem::path path = executablePath;
   std::filesystem::path executionFilesPath =
diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu
index 963e80d20..2e5d8d8cb 100644
--- a/test/mp_unit/ib_tests.cu
+++ b/test/mp_unit/ib_tests.cu
@@ -18,9 +18,7 @@ void IbTestBase::SetUp() {
 }
 
 void IbPeerToPeerTest::SetUp() {
-#if !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
+  REQUIRE_IBVERBS;
 
   IbTestBase::SetUp();
 
@@ -80,7 +78,7 @@ void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint6
   qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData);
 }
 
-TEST_F(IbPeerToPeerTest, SimpleSendRecv) {
+TEST(IbPeerToPeerTest, SimpleSendRecv) {
   if (gEnv->rank >= 2) {
     // This test needs only two ranks
     return;
@@ -195,7 +193,7 @@ __global__ void kernelMemoryConsistency(uint64_t* data, volatile uint64_t* curIt
   }
 }
 
-TEST_F(IbPeerToPeerTest, MemoryConsistency) {
+TEST(IbPeerToPeerTest, MemoryConsistency) {
   if (gEnv->rank >= 2) {
     // This test needs only two ranks
     return;
@@ -303,7 +301,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
   EXPECT_EQ(res, 0);
 }
 
-TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) {
+TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
   if (gEnv->rank >= 2) {
     // This test needs only two ranks
     return;
diff --git a/test/mp_unit/memory_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu
index cb5610946..19e5180f7 100644
--- a/test/mp_unit/memory_channel_tests.cu
+++ b/test/mp_unit/memory_channel_tests.cu
@@ -88,27 +88,12 @@ void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName,
   std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
 
   // The least nelem is 2 for packet ping pong
-  kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries);
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-  *ret = 0;
-
-  kernelWrapper(buff.get(), gEnv->rank, 1024, ret.get(), defaultNTries);
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelWrapper(buff.get(), gEnv->rank, 1024 * 1024, ret.get(), defaultNTries);
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelWrapper(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get(), defaultNTries);
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
+  for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
+    *ret = 0;
+    kernelWrapper(buff.get(), gEnv->rank, nElem, ret.get(), defaultNTries);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    EXPECT_EQ(*ret, 0);
+  }
 
   int nTries = 1000000;
   communicator->bootstrap()->barrier();
@@ -169,7 +154,7 @@ __global__ void kernelMemPutPingPong(int* buff, int rank, int nElem, int* ret) {
   }
 }
 
-TEST_F(MemoryChannelOneToOneTest, PutPingPong) {
+TEST(MemoryChannelOneToOneTest, PutPingPong) {
   if (gEnv->rank >= numRanksToUse) return;
 
   const int nElem = 4 * 1024 * 1024;
@@ -187,28 +172,12 @@ TEST_F(MemoryChannelOneToOneTest, PutPingPong) {
 
   std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
 
-  kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
+  for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
+    *ret = 0;
+    kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get());
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    EXPECT_EQ(*ret, 0);
+  }
 }
 
 __global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) {
@@ -248,7 +217,7 @@ __global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) {
   }
 }
 
-TEST_F(MemoryChannelOneToOneTest, GetPingPong) {
+TEST(MemoryChannelOneToOneTest, GetPingPong) {
   if (gEnv->rank >= numRanksToUse) return;
 
   const int nElem = 4 * 1024 * 1024;
@@ -266,28 +235,12 @@ TEST_F(MemoryChannelOneToOneTest, GetPingPong) {
 
   std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
 
-  kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
+  for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
+    *ret = 0;
+    kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get());
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    EXPECT_EQ(*ret, 0);
+  }
 }
 
 __global__ void kernelMemLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) {
@@ -371,14 +324,14 @@ __global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int*
   }
 }
 
-TEST_F(MemoryChannelOneToOneTest, LL8PacketPingPong) {
+TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) {
   auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
     kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
   };
   packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper);
 }
 
-TEST_F(MemoryChannelOneToOneTest, LL16PacketPingPong) {
+TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) {
   auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
     kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
   };
diff --git a/test/mp_unit/mp_unit_tests.cc b/test/mp_unit/mp_unit_tests.cc
index 5782930e0..2f6dc1cab 100644
--- a/test/mp_unit/mp_unit_tests.cc
+++ b/test/mp_unit/mp_unit_tests.cc
@@ -137,12 +137,12 @@ int main(int argc, char** argv) {
   return RUN_ALL_TESTS();
 }
 
-TEST_F(MultiProcessTest, Prelim) {
+TEST(MultiProcessTest, Prelim) {
   // Test to make sure the MPI environment is set up correctly
   ASSERT_GE(gEnv->worldSize, 2);
 }
 
-TEST_F(MultiProcessTest, HostName) {
+TEST(MultiProcessTest, HostName) {
   const size_t maxNameLen = 1024;
   std::vector<char> buffer(gEnv->worldSize * maxNameLen, '\0');
   std::string hostName = mscclpp::getHostName(maxNameLen, '\0');
@@ -162,7 +162,7 @@ TEST_F(MultiProcessTest, HostName) {
   }
 }
 
-TEST_F(MultiProcessTest, HostHash) {
+TEST(MultiProcessTest, HostHash) {
   std::vector<uint64_t> buffer(gEnv->worldSize, 0);
   uint64_t hostHash = mscclpp::getHostHash();
   buffer[gEnv->rank] = hostHash;
diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp
index bcf880ae2..03e4cbde9 100644
--- a/test/mp_unit/mp_unit_tests.hpp
+++ b/test/mp_unit/mp_unit_tests.hpp
@@ -15,6 +15,13 @@
 #include "ib.hpp"
 #include "utils_internal.hpp"
 
+// Skip the current test if IBVerbs is not available in this build
+#if defined(USE_IBVERBS)
+#define REQUIRE_IBVERBS
+#else
+#define REQUIRE_IBVERBS SKIP_TEST() << "This test requires IBVerbs that the current build does not support."
+#endif
+
 class MultiProcessTestEnv : public ::mscclpp::test::Environment {
  public:
   MultiProcessTestEnv(int argc, const char** argv);
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index 3d5c00412..5e29c80cb 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -178,26 +178,12 @@ void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) {
   std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
 
   const int nTries = 1000;
-
-  kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, params.waitWithPoll, nTries, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-
-  kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, params.waitWithPoll, nTries, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-
-  kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, params.waitWithPoll, nTries, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-
-  kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, params.waitWithPoll, nTries, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
+  for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
+    *ret = 0;
+    kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, params.waitWithPoll, nTries, ret.get());
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    EXPECT_EQ(*ret, 0);
+  }
 
   proxyService->stopProxy();
 }
@@ -246,63 +232,51 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
   proxyService->stopProxy();
 }
 
-TEST_F(PortChannelOneToOneTest, PingPong) {
+TEST(PortChannelOneToOneTest, PingPong) {
   testPingPong(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongIbHostMode) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PingPongIbHostMode) {
+  REQUIRE_IBVERBS;
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongEthernet) {
+TEST(PortChannelOneToOneTest, PingPongEthernet) {
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongWithPoll) {
+TEST(PortChannelOneToOneTest, PingPongWithPoll) {
   testPingPong(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default});
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
+  REQUIRE_IBVERBS;
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Host});
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongPerf) {
+TEST(PortChannelOneToOneTest, PingPongPerf) {
   testPingPongPerf(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
+  REQUIRE_IBVERBS;
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
+  REQUIRE_IBVERBS;
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) {
+TEST(PortChannelOneToOneTest, PingPongPerfEthernet) {
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
@@ -406,34 +380,14 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb, IbMode ibMode) {
   std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
 
   const int nTries = 1000;
-
   // The least nelem is 2 for packet ping pong
-  kernelProxyLLPingPong<true>
-      <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 2, nTries, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelProxyLLPingPong<true>
-      <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 1024, nTries, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelProxyLLPingPong<true><<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank,
-                                           1024 * 1024, nTries, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
-  *ret = 0;
-
-  kernelProxyLLPingPong<true><<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank,
-                                           4 * 1024 * 1024, nTries, ret.get());
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  EXPECT_EQ(*ret, 0);
+  for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
+    *ret = 0;
+    kernelProxyLLPingPong<true>
+        <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, nElem, nTries, ret.get());
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    EXPECT_EQ(*ret, 0);
+  }
 
   communicator->bootstrap()->barrier();
 
@@ -495,47 +449,32 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
   proxyService->stopProxy();
 }
 
-TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
+TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
 
-TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
+  REQUIRE_IBVERBS;
   testPacketPingPong(true, IbMode::Host);
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
 
-TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
+TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
 
-TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
+  REQUIRE_IBVERBS;
   testPacketPingPongPerf(true, IbMode::Host);
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
 
-TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
+  REQUIRE_IBVERBS;
   testPacketPingPongPerf(true, IbMode::HostNoAtomic);
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
 
-TEST_F(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
+  REQUIRE_IBVERBS;
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
 
-TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
-#if defined(USE_IBVERBS)
+TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
+  REQUIRE_IBVERBS;
   testPacketPingPong(true, IbMode::HostNoAtomic);
-#else   // !defined(USE_IBVERBS)
-  SKIP_TEST() << "This test requires IBVerbs that the current build does not support.";
-#endif  // !defined(USE_IBVERBS)
 }
diff --git a/test/mp_unit/switch_channel_tests.cu b/test/mp_unit/switch_channel_tests.cu
index 16152c5c7..d83111b59 100644
--- a/test/mp_unit/switch_channel_tests.cu
+++ b/test/mp_unit/switch_channel_tests.cu
@@ -31,7 +31,7 @@ __global__ void kernelSwitchReduce() {
 #endif  // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
 }
 
-TEST_F(SwitchChannelTest, SimpleAllReduce) {
+TEST(SwitchChannelTest, SimpleAllReduce) {
   if (gEnv->rank >= numRanksToUse) return;
 
   std::vector<int> ranks;
diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc
index 45fce6e2b..4b32378f0 100644
--- a/test/unit/core_tests.cc
+++ b/test/unit/core_tests.cc
@@ -20,7 +20,7 @@ class LocalCommunicatorTest : public ::mscclpp::test::TestCase {
   std::shared_ptr<mscclpp::Communicator> comm;
 };
 
-TEST_F(LocalCommunicatorTest, RegisterMemory) {
+TEST(LocalCommunicatorTest, RegisterMemory) {
   int dummy[42];
   auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
   EXPECT_EQ(memory.data(), &dummy);
@@ -28,7 +28,7 @@ TEST_F(LocalCommunicatorTest, RegisterMemory) {
   ASSERT_TRUE(memory.transports() == mscclpp::NoTransports);
 }
 
-TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
+TEST(LocalCommunicatorTest, SendMemoryToSelf) {
   int dummy[42];
   auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
   comm->sendMemory(memory, 0);
diff --git a/test/unit/fifo_perf_tests.cu b/test/unit/fifo_perf_tests.cu
index 9aab2643d..9a28591b3 100644
--- a/test/unit/fifo_perf_tests.cu
+++ b/test/unit/fifo_perf_tests.cu
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include "../framework.hpp"
-
 #include <cassert>
 #include <memory>
 #include <mscclpp/fifo.hpp>
@@ -10,6 +8,8 @@
 #include <mscclpp/numa.hpp>
 #include <unordered_map>
 
+#include "../framework.hpp"
+
 // Simple FIFO performance test to be run as part of unit_tests
 // This is a performance test that can be excluded from coverage runs
 // using the --exclude-perf-tests flag.
@@ -76,7 +76,7 @@ PERF_TEST(FifoPerfTest, BasicPerformance) {
   // Process triggers
   bool success = consumePerfTriggers(hostFifo, numTriggers, numParallel);
   ASSERT_TRUE(success);
-  
+
   CUDA_CHECK(cudaStreamSynchronize(stream));
   CUDA_CHECK(cudaStreamDestroy(stream));
   CUDA_CHECK(cudaDeviceSynchronize());

From d2efc2fd3bb7eff82ad63816ea968b0c867aec7c Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 19:48:29 -0800
Subject: [PATCH 024/132] coverage update

---
 .azure-pipelines/templates/ut.yaml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index ae5bedbd7..12004d6e2 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -124,8 +124,8 @@ steps:
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
-  name: MpUnitTestsCoverageNonPerf
-  displayName: Run mp_unit_tests (non-perf) with coverage
+  name: TestsCoverageNonPerf
+  displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
   inputs:
     targetType: 'inline'
     script: |
@@ -141,11 +141,12 @@ steps:
         export PATH=/usr/local/mpi/bin:\$PATH;                        \
         cd /root/mscclpp;                                             \
         export LD_LIBRARY_PATH=/root/mscclpp/build_coverage/lib:\$LD_LIBRARY_PATH; \
+        ./build_coverage/bin/unit_tests;                              \
         mpirun --allow-run-as-root -tag-output -np 2 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
         mpirun --allow-run-as-root -tag-output -np 4 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
         cd build_coverage;                                            \
-        lcov --directory . --capture --output-file coverage.info;     \
-        lcov --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info;  \
+        lcov --directory . --capture --output-file coverage.info --ignore-errors mismatch;  \
+        lcov --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info --ignore-errors unused;  \
         lcov --list coverage.info"'
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'

From 4afbf780ed2718657f63767b279ef2b721b2ddfb Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 19:54:37 -0800
Subject: [PATCH 025/132] minor

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index cf946377d..74307e67f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 .vscode/
 build/
-build_*/
+build_coverage/
 __pycache__
 .*.swp
 *.so

From e40c72bd2bc23fa945675b21b810681123110e63 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 20:12:32 -0800
Subject: [PATCH 026/132] license text update

---
 CMakeLists.txt                       | 2 +-
 test/CMakeLists.txt                  | 2 +-
 test/mp_unit/bootstrap_tests.cc      | 2 +-
 test/mp_unit/communicator_tests.cu   | 2 +-
 test/mp_unit/executor_tests.cc       | 2 +-
 test/mp_unit/ib_tests.cu             | 2 +-
 test/mp_unit/memory_channel_tests.cu | 2 +-
 test/mp_unit/port_channel_tests.cu   | 2 +-
 test/mp_unit/switch_channel_tests.cu | 2 +-
 test/unit/CMakeLists.txt             | 2 +-
 test/unit/compile_tests.cu           | 2 +-
 test/unit/core_tests.cc              | 2 +-
 test/unit/errors_tests.cc            | 2 +-
 test/unit/fifo_tests.cu              | 2 +-
 test/unit/gpu_utils_tests.cc         | 2 +-
 test/unit/local_channel_tests.cu     | 2 +-
 test/unit/numa_tests.cc              | 2 +-
 test/unit/socket_tests.cc            | 2 +-
 test/unit/utils_tests.cc             | 2 +-
 19 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 738ec780b..fc065d298 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# Licensed under the MIT License.
 
 cmake_minimum_required(VERSION 3.25)
 project(mscclpp LANGUAGES CXX)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 288550854..a7c1417c9 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# Licensed under the MIT License.
 
 find_package(MPI REQUIRED)
 
diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc
index f22e4c3df..c28087a45 100644
--- a/test/mp_unit/bootstrap_tests.cc
+++ b/test/mp_unit/bootstrap_tests.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mpi.h>
 
diff --git a/test/mp_unit/communicator_tests.cu b/test/mp_unit/communicator_tests.cu
index 79cbd17be..066c5514c 100644
--- a/test/mp_unit/communicator_tests.cu
+++ b/test/mp_unit/communicator_tests.cu
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mpi.h>
 
diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc
index 7af5cb0d0..4f3f25451 100644
--- a/test/mp_unit/executor_tests.cc
+++ b/test/mp_unit/executor_tests.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mpi.h>
 #include <unistd.h>
diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu
index 2e5d8d8cb..04ab402dd 100644
--- a/test/mp_unit/ib_tests.cu
+++ b/test/mp_unit/ib_tests.cu
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mpi.h>
 
diff --git a/test/mp_unit/memory_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu
index 19e5180f7..318d301af 100644
--- a/test/mp_unit/memory_channel_tests.cu
+++ b/test/mp_unit/memory_channel_tests.cu
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <algorithm>
 
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index 5e29c80cb..764c32999 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <cstdint>
 #include <mscclpp/concurrency_device.hpp>
diff --git a/test/mp_unit/switch_channel_tests.cu b/test/mp_unit/switch_channel_tests.cu
index d83111b59..710fd84a8 100644
--- a/test/mp_unit/switch_channel_tests.cu
+++ b/test/mp_unit/switch_channel_tests.cu
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <algorithm>
 #include <mscclpp/switch_channel.hpp>
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 655f77788..7836e0632 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# Licensed under the MIT License.
 
 target_sources(unit_tests PRIVATE
     unit_tests_main.cc
diff --git a/test/unit/compile_tests.cu b/test/unit/compile_tests.cu
index 18046a1f8..893bb9403 100644
--- a/test/unit/compile_tests.cu
+++ b/test/unit/compile_tests.cu
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include "../framework.hpp"
 
diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc
index 4b32378f0..d2552ff31 100644
--- a/test/unit/core_tests.cc
+++ b/test/unit/core_tests.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/core.hpp>
 
diff --git a/test/unit/errors_tests.cc b/test/unit/errors_tests.cc
index 13c8d542a..3eeed3875 100644
--- a/test/unit/errors_tests.cc
+++ b/test/unit/errors_tests.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/errors.hpp>
 
diff --git a/test/unit/fifo_tests.cu b/test/unit/fifo_tests.cu
index 68e777d07..8d30ca5ed 100644
--- a/test/unit/fifo_tests.cu
+++ b/test/unit/fifo_tests.cu
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/fifo.hpp>
 #include <mscclpp/gpu_utils.hpp>
diff --git a/test/unit/gpu_utils_tests.cc b/test/unit/gpu_utils_tests.cc
index c10f113c4..977314e98 100644
--- a/test/unit/gpu_utils_tests.cc
+++ b/test/unit/gpu_utils_tests.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/gpu_utils.hpp>
 
diff --git a/test/unit/local_channel_tests.cu b/test/unit/local_channel_tests.cu
index 76060f97f..699baa385 100644
--- a/test/unit/local_channel_tests.cu
+++ b/test/unit/local_channel_tests.cu
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
diff --git a/test/unit/numa_tests.cc b/test/unit/numa_tests.cc
index c27fde904..46bf5e18b 100644
--- a/test/unit/numa_tests.cc
+++ b/test/unit/numa_tests.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
diff --git a/test/unit/socket_tests.cc b/test/unit/socket_tests.cc
index 6b7c19033..a5598938f 100644
--- a/test/unit/socket_tests.cc
+++ b/test/unit/socket_tests.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/utils.hpp>
 #include <thread>
diff --git a/test/unit/utils_tests.cc b/test/unit/utils_tests.cc
index 110550dac..51562c219 100644
--- a/test/unit/utils_tests.cc
+++ b/test/unit/utils_tests.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/errors.hpp>
 #include <mscclpp/utils.hpp>

From bed85b56cb1c2ed090abd0874eca45c14ff857f3 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 20:23:42 -0800
Subject: [PATCH 027/132] codecov upload

---
 .azure-pipelines/templates/ut.yaml | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index 12004d6e2..28915889f 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -151,6 +151,34 @@ steps:
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
+- task: Bash@3
+  name: FetchCoverage
+  displayName: Fetch coverage data from remote VM
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
+      SSH_OPTION="StrictHostKeyChecking=no"
+      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      HOST=$(head -1 ${HOSTFILE})
+      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
+        'sudo docker cp mscclpp-test:/root/mscclpp/build_coverage/coverage.info /tmp/coverage.info'
+      scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: Bash@3
+  name: UploadCodecov
+  displayName: Upload coverage to Codecov
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      curl -Os https://cli.codecov.io/latest/linux/codecov
+      chmod +x codecov
+      ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
 - task: Bash@3
   name: PyTests
   displayName: Run pytests

From 4d9aceac6fd881006727a62cf9ffa9b95f41205a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 20:25:50 -0800
Subject: [PATCH 028/132] badge

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 8f300a2a6..276ec29f2 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@
 [![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
 [![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
 [![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/)
+[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp)
 
 | Testing Pipelines        | Build Status      |
 |--------------------------|-------------------|

From b693d1b3fcbe492d0bd84198ee5bbb92ee52b7ae Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 20:31:25 -0800
Subject: [PATCH 029/132] lint issue

---
 test/executor_test.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/executor_test.cc b/test/executor_test.cc
index cc7456590..e8d24d595 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -139,7 +139,8 @@ int main(int argc, char* argv[]) {
     NpKit::Shutdown();
   }
 
-  std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << deltaSec * 1.e6 << " us" << std::endl;
+  double latencyUs = deltaSec * 1.e6;
+  std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << latencyUs << " us" << std::endl;
   MPI_Finalize();
   return 0;
 }

From 2b4adcc4ad42e5b7c74723ddf3db87b0d3915265 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Feb 2026 20:33:57 -0800
Subject: [PATCH 030/132] fix lint

---
 test/executor_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/executor_test.cc b/test/executor_test.cc
index e8d24d595..2378e7ffd 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -93,8 +93,8 @@ double benchTime(int rank, std::shared_ptr<mscclpp::Bootstrap> bootstrap, std::s
 
 int main(int argc, char* argv[]) {
   if (argc != 5 && argc != 6) {
-    std::cerr << "Usage: " << argv[0] << " <buffer size>" << " <execution plan path>" << " <number of iterations>"
-              << " <number of graph iterations>" << " (optional) <packet type>" << std::endl;
+    std::cerr << "Usage: " << argv[0] << " <buffer size> <execution plan path>"
+              << " <number of iterations> <number of graph iterations> (optional) <packet type>" << std::endl;
     return 1;
   }
 

From dcdd3febd18a1f5c28f29da1c36d13c6b237aeb0 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 20 Feb 2026 13:35:32 -0800
Subject: [PATCH 031/132] update UT CI

---
 .azure-pipelines/templates/ut.yaml | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index 28915889f..a0fb1e4de 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -25,6 +25,14 @@ steps:
         cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
       fi
       make -j
+      cd ..
+      mkdir build_coverage && cd build_coverage
+      if [ "${{ parameters.platform }}" == "rocm" ]; then
+        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
+      else
+        cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
+      fi
+      make -j
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: DownloadSecureFile@1
@@ -108,21 +116,6 @@ steps:
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- task: Bash@3
-  name: DebugBuildWithCoverage
-  displayName: Build Debug with Coverage
-  inputs:
-    targetType: 'inline'
-    script: |
-      mkdir build_coverage && cd build_coverage
-      if [ "${{ parameters.platform }}" == "rocm" ]; then
-        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
-      else
-        cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
-      fi
-      make -j
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
 - task: Bash@3
   name: TestsCoverageNonPerf
   displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
@@ -145,8 +138,8 @@ steps:
         mpirun --allow-run-as-root -tag-output -np 2 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
         mpirun --allow-run-as-root -tag-output -np 4 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
         cd build_coverage;                                            \
-        lcov --directory . --capture --output-file coverage.info --ignore-errors mismatch;  \
-        lcov --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info --ignore-errors unused;  \
+        lcov --directory . --capture --output-file coverage.info;  \
+        lcov --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info;  \
         lcov --list coverage.info"'
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'

From caeec7590a403e107ddd04e26a10555651d35788 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 20 Feb 2026 13:43:32 -0800
Subject: [PATCH 032/132] updates

---
 .azure-pipelines/templates/ut-npkit.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yaml
index 0ab733c9f..74b94fc15 100644
--- a/.azure-pipelines/templates/ut-npkit.yaml
+++ b/.azure-pipelines/templates/ut-npkit.yaml
@@ -88,7 +88,7 @@ steps:
         export PATH=/usr/local/mpi/bin:\$PATH; \
         export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump;    \
         export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
-        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
+        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter=\"ExecutorTest.TwoNodesAllreduce\"; \
         python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
         grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json;    \
         grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json;  \

From b9609f83a02e42ac36d9fa71483adc34099bd20e Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 20 Feb 2026 14:03:54 -0800
Subject: [PATCH 033/132] add coverage flags

---
 .azure-pipelines/templates/ut.yaml |  2 +-
 .codecov.yml                       | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 .codecov.yml

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index a0fb1e4de..e6e989e79 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -169,7 +169,7 @@ steps:
       set -e
       curl -Os https://cli.codecov.io/latest/linux/codecov
       chmod +x codecov
-      ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info
+      ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
diff --git a/.codecov.yml b/.codecov.yml
new file mode 100644
index 000000000..a98f1e89e
--- /dev/null
+++ b/.codecov.yml
@@ -0,0 +1,24 @@
+codecov:
+  require_ci_to_pass: yes
+
+coverage:
+  status:
+    project:
+      default:
+        target: 68%
+        threshold: 1%
+    patch:
+      default:
+        target: 80%
+
+flag_management:
+  default_rules:
+    carryforward: true
+
+ignore:
+  - "test/"
+  - "examples/"
+  - "python/"
+  - "tools/"
+  - "docs/"
+  - "docker/"

From febdbf9230a5a2abade928e36e041ea36913b846 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 21 Feb 2026 00:02:03 -0800
Subject: [PATCH 034/132] WIP; need amd fix

---
 CMakeLists.txt                  |  14 ++++
 cmake/FindGDRCopy.cmake         |  37 ++++++++++
 include/mscclpp/env.hpp         |   5 ++
 include/mscclpp/semaphore.hpp   |   4 +
 src/core/CMakeLists.txt         |   6 ++
 src/core/connection.cc          |  93 ++++++++++++++++++++----
 src/core/context.cc             |   2 -
 src/core/env.cpp                |   4 +-
 src/core/gdr.cc                 | 125 ++++++++++++++++++++++++++++++++
 src/core/include/connection.hpp |  22 +++++-
 src/core/include/context.hpp    |   2 -
 src/core/include/gdr.hpp        |  57 +++++++++++++++
 src/core/semaphore.cc           |  55 ++++++++++----
 13 files changed, 388 insertions(+), 38 deletions(-)
 create mode 100644 cmake/FindGDRCopy.cmake
 create mode 100644 src/core/gdr.cc
 create mode 100644 src/core/include/gdr.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6288dbb08..d46e45fe5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,6 +170,20 @@ endif()
 find_package(NUMA REQUIRED)
 find_package(Threads REQUIRED)
 
+option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host." ON)
+if(MSCCLPP_USE_ROCM)
+    set(MSCCLPP_USE_GDRCOPY OFF)
+endif()
+if(MSCCLPP_USE_GDRCOPY)
+    find_package(GDRCopy)
+    if(NOT GDRCOPY_FOUND)
+        message(STATUS "GDRCopy not found, disabling GDRCopy support")
+        set(MSCCLPP_USE_GDRCOPY OFF)
+    else()
+        message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}")
+    endif()
+endif()
+
 include(FetchContent)
 FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
 FetchContent_MakeAvailable(json)
diff --git a/cmake/FindGDRCopy.cmake b/cmake/FindGDRCopy.cmake
new file mode 100644
index 000000000..016adfda2
--- /dev/null
+++ b/cmake/FindGDRCopy.cmake
@@ -0,0 +1,37 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+# Find the GDRCopy libraries
+#
+# The following variables are optionally searched for defaults
+#  GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
+#  GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found
+#  GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found
+
+# The following are set after configuration is done:
+#  GDRCOPY_FOUND
+#  GDRCOPY_INCLUDE_DIRS
+#  GDRCOPY_LIBRARIES
+
+find_path(GDRCOPY_INCLUDE_DIRS
+  NAMES gdrapi.h
+  HINTS
+  ${GDRCOPY_INCLUDE_DIR}
+  ${GDRCOPY_ROOT_DIR}
+  ${GDRCOPY_ROOT_DIR}/include
+  /usr/local/include
+  /usr/include)
+
+find_library(GDRCOPY_LIBRARIES
+  NAMES gdrapi
+  HINTS
+  ${GDRCOPY_LIB_DIR}
+  ${GDRCOPY_ROOT_DIR}
+  ${GDRCOPY_ROOT_DIR}/lib
+  /usr/local/lib
+  /usr/lib
+  /usr/lib/x86_64-linux-gnu)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
+mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp
index 39f73e8d8..fb1da22c4 100644
--- a/include/mscclpp/env.hpp
+++ b/include/mscclpp/env.hpp
@@ -110,6 +110,11 @@ class Env {
   /// Default is false.
   const bool forceDisableNvls;
 
+  /// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++.
+  /// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded.
+  /// Default is false.
+  const bool forceDisableGdr;
+
  private:
   Env();
 
diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp
index 27f9aefac..edfa51685 100644
--- a/include/mscclpp/semaphore.hpp
+++ b/include/mscclpp/semaphore.hpp
@@ -16,6 +16,7 @@ namespace mscclpp {
 class Host2DeviceSemaphore {
  private:
   Semaphore semaphore_;
+  std::shared_ptr<uint64_t> inboundToken_;
   detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
   std::unique_ptr<uint64_t> outboundToken_;
 
@@ -29,6 +30,9 @@ class Host2DeviceSemaphore {
   /// @param connection The connection associated with this semaphore.
   Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);
 
+  /// Destructor.
+  ~Host2DeviceSemaphore();
+
   /// Returns the connection.
   /// @return The connection associated with this semaphore.
   Connection& connection();
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index c1aa25bb1..3eb6466a7 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -30,6 +30,12 @@ if(MSCCLPP_USE_IB)
     target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
 endif()
 
+if(MSCCLPP_USE_GDRCOPY)
+    target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GDRCOPY_INCLUDE_DIRS})
+    target_link_libraries(mscclpp_obj PRIVATE ${GDRCOPY_LIBRARIES})
+    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
+endif()
+
 set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
 
 if(MSCCLPP_USE_CUDA)
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 6466ca2af..525fb4984 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -198,7 +198,15 @@ void IBConnection::recvThreadFunc() {
   }
 
   // Host-side buffer to receive newValue from imm_data (need 64-bit for cudaMemcpy)
-  uint64_t newValueHost = 0;
+  bool useGdr = gdrEnabled();
+  uint64_t* newValueHost;
+  if (useGdr) {
+    newValueHost = new uint64_t(0);
+  } else {
+    // Use pinned host memory for reliable cudaMemcpyAsync from a non-default stream
+    MSCCLPP_CUDATHROW(cudaHostAlloc(&newValueHost, sizeof(uint64_t), cudaHostAllocDefault));
+    *newValueHost = 0;
+  }
 
   while (!stopRecvThread_.load(std::memory_order_relaxed)) {
     auto qp = qp_.lock();
@@ -223,19 +231,34 @@ void IBConnection::recvThreadFunc() {
       // The imm_data contains newValue (32-bit, extended to 64-bit)
       // Note: getRecvWcImmData already converts from network byte order via ntohl
       unsigned int immData = qp->getRecvWcImmData(i);
-      newValueHost = static_cast<uint64_t>(immData);
+      *newValueHost = static_cast<uint64_t>(immData);
+
+      // Flush all in-flight GPUDirect RDMA writes to GPU device memory.
+      // IB guarantees that prior RDMA data writes have been sent before the write-with-imm
+      // completion appears, but the data may still be in-flight in PCIe / GPU internal fabric.
+      // cuFlushGPUDirectRDMAWrites ensures all prior NIC writes are committed to device memory
+      // before we update the semaphore token, so the GPU kernel sees data before the flag.
+      if (flushSupported_) {
+        MSCCLPP_CUTHROW(cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
+                                                   CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER));
+      }
 
       // Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr)
       uint64_t dstGpuAddr = remoteUpdateDstAddr_;
       if (dstGpuAddr != 0) {
         uint64_t* dstPtr = reinterpret_cast<uint64_t*>(dstGpuAddr);
 
-        // Use cudaMemcpyAsync with our dedicated stream to avoid blocking on the default stream
-        MSCCLPP_CUDATHROW(
-            cudaMemcpyAsync(dstPtr, &newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_));
-
-        INFO(CONN, "IBConnection recvThreadFunc: updated GPU ptr ", dstPtr, " to ", newValueHost, " (immData=", immData,
-             ")");
+#ifdef MSCCLPP_USE_GDRCOPY
+        if (useGdr && remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid()) {
+          // Direct host-side write to GPU memory via GDRCopy BAR1 mapping
+          remoteUpdateDstAddrMap_->copyTo(newValueHost, sizeof(uint64_t));
+        } else
+#endif
+        if (signalStream_ != nullptr) {
+          // Fallback: use cudaMemcpyAsync with our dedicated stream
+          MSCCLPP_CUDATHROW(
+              cudaMemcpyAsync(dstPtr, newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_));
+        }
       }
 
       // Post another recv for future messages
@@ -243,6 +266,13 @@ void IBConnection::recvThreadFunc() {
       qp->postRecv();
     }
   }
+
+  // Clean up the host-side buffer
+  if (useGdr) {
+    delete newValueHost;
+  } else {
+    MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaFreeHost(newValueHost));
+  }
 }
 
 IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint,
@@ -252,6 +282,7 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
       remoteTransport_(remoteEndpoint.transport()),
       dummyAtomicSource_(std::make_unique<uint64_t>(0)),
       ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
+      flushSupported_(false),
       stopRecvThread_(false),
       localGpuDeviceId_(localEndpoint.device().id),
       signalStream_(nullptr),
@@ -264,8 +295,28 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
   dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_);
 
   if (ibNoAtomic_) {
-    // Create a CUDA stream for async memory copies
-    MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking));
+    // Check if cuFlushGPUDirectRDMAWrites is supported on this GPU
+    if (localGpuDeviceId_ >= 0) {
+      int flushOptions = 0;
+#if !defined(MSCCLPP_USE_ROCM)
+      CUdevice cuDev;
+      if (cuDeviceGet(&cuDev, localGpuDeviceId_) == CUDA_SUCCESS) {
+        cuDeviceGetAttribute(&flushOptions, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, cuDev);
+      }
+#endif
+      flushSupported_ = (flushOptions & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0;
+      if (flushSupported_) {
+        INFO(CONN, "cuFlushGPUDirectRDMAWrites is supported on GPU ", localGpuDeviceId_);
+      } else {
+        WARN(NET, "cuFlushGPUDirectRDMAWrites is NOT supported on GPU ", localGpuDeviceId_,
+             ". RDMA write ordering to GPU memory is not guaranteed.");
+      }
+    }
+
+    // Create a CUDA stream for async memory copies (not needed when GDRCopy is available)
+    if (!gdrEnabled()) {
+      MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking));
+    }
 
     // Pre-post receive requests for incoming write-with-imm
     auto qp = qp_.lock();
@@ -290,9 +341,8 @@ IBConnection::~IBConnection() {
     }
     if (signalStream_ != nullptr) {
       // Synchronize stream to ensure all async copies are complete before destruction
-      // Ignore errors during teardown (CUDA context may already be destroyed)
-      MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamSynchronize(signalStream_));
-      MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamDestroy(signalStream_));
+      (void)cudaStreamSynchronize(signalStream_);
+      (void)cudaStreamDestroy(signalStream_);
     }
   }
 }
@@ -301,9 +351,20 @@ Transport IBConnection::transport() const { return transport_; }
 
 Transport IBConnection::remoteTransport() const { return remoteTransport_; }
 
-void IBConnection::setRemoteUpdateDstAddr(uint64_t addr) {
-  remoteUpdateDstAddr_ = addr;
-  INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)addr);
+bool IBConnection::usesRecvThread() const { return ibNoAtomic_; }
+
+void IBConnection::setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) {
+  remoteUpdateDstAddr_ = reinterpret_cast<uint64_t>(gpuMem.get());
+#ifdef MSCCLPP_USE_GDRCOPY
+  if (gdrEnabled()) {
+    if (gpuMem) {
+      remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(gpuMem), localGpuDeviceId_);
+    } else {
+      remoteUpdateDstAddrMap_.reset();
+    }
+  }
+#endif
+  INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)remoteUpdateDstAddr_);
 }
 
 void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
diff --git a/src/core/context.cc b/src/core/context.cc
index a5cdffb26..aabe71df1 100644
--- a/src/core/context.cc
+++ b/src/core/context.cc
@@ -46,8 +46,6 @@ void CudaIpcStream::sync() {
   }
 }
 
-Context::Impl::Impl() {}
-
 IbCtx* Context::Impl::getIbContext(Transport ibTransport) {
   // Find IB context or create it
   auto it = ibContexts_.find(ibTransport);
diff --git a/src/core/env.cpp b/src/core/env.cpp
index 484b40af1..96f53492e 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -65,7 +65,8 @@ Env::Env()
       ncclSharedLibPath(readEnv<std::string>("MSCCLPP_NCCL_LIB_PATH", "")),
       forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
       ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
-      forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)) {}
+      forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
+      forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -93,6 +94,7 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation);
     logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
     logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
+    logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
   }
   return globalEnv;
 }
diff --git a/src/core/gdr.cc b/src/core/gdr.cc
new file mode 100644
index 000000000..2f9176adb
--- /dev/null
+++ b/src/core/gdr.cc
@@ -0,0 +1,125 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "gdr.hpp"
+
+#ifdef MSCCLPP_USE_GDRCOPY
+
+#include <unistd.h>
+
+#include <mscclpp/env.hpp>
+#include <mscclpp/gpu_utils.hpp>
+
+#include "logger.hpp"
+
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
+
+namespace mscclpp {
+
+// GdrContext
+
+class GdrContext {
+ public:
+  GdrContext();
+  ~GdrContext();
+
+  GdrContext(const GdrContext&) = delete;
+  GdrContext& operator=(const GdrContext&) = delete;
+
+  bool enabled() const { return enabled_; }
+  gdr_t handle() const { return handle_; }
+
+ private:
+  bool enabled_ = false;
+  gdr_t handle_ = nullptr;
+};
+
+static std::shared_ptr<GdrContext> gdrContext() {
+  static auto instance = std::make_shared<GdrContext>();
+  return instance;
+}
+
+bool gdrEnabled() { return gdrContext()->enabled(); }
+
+GdrContext::GdrContext() {
+  if (env()->forceDisableGdr) {
+    INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
+    return;
+  }
+
+  // Auto-detect: check if driver is available
+  if (access("/dev/gdrdrv", F_OK) != 0) {
+    INFO(GPU, "GDRCopy driver not detected, disabling GDRCopy");
+    return;
+  }
+
+  handle_ = gdr_open();
+  if (handle_ == nullptr) {
+    INFO(GPU, "gdr_open() failed, disabling GDRCopy");
+    return;
+  }
+
+  enabled_ = true;
+  INFO(GPU, "GDRCopy initialized successfully");
+}
+
+GdrContext::~GdrContext() {
+  if (handle_ != nullptr) {
+    gdr_close(handle_);
+    handle_ = nullptr;
+  }
+}
+
+// GdrMap
+
+GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : ctx_(gdrContext()), gpuMem_(std::move(gpuMem)) {
+  // Ensure CUDA device context is active for gdr_pin_buffer
+  CudaDeviceGuard deviceGuard(deviceId);
+
+  uint64_t gpuAddr = reinterpret_cast<uint64_t>(gpuMem_.get());
+  // Align to GPU page boundary and pin one page around the target address
+  unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
+  unsigned long pageOffset = gpuAddr - alignedAddr;
+  mappedSize_ = GPU_PAGE_SIZE;
+
+  int ret = gdr_pin_buffer(ctx_->handle(), alignedAddr, mappedSize_, 0, 0, &mh_);
+  if (ret != 0) {
+    THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer failed (ret=", ret, ") for addr ", (void*)gpuAddr,
+          ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
+  }
+
+  ret = gdr_map(ctx_->handle(), mh_, &barPtr_, mappedSize_);
+  if (ret != 0) {
+    (void)gdr_unpin_buffer(ctx_->handle(), mh_);
+    THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr);
+  }
+
+  hostDstPtr_ = reinterpret_cast<volatile uint64_t*>(reinterpret_cast<char*>(barPtr_) + pageOffset);
+
+  INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)hostDstPtr_);
+}
+
+GdrMap::~GdrMap() {
+  if (barPtr_ != nullptr) {
+    (void)gdr_unmap(ctx_->handle(), mh_, barPtr_, mappedSize_);
+  }
+  if (hostDstPtr_ != nullptr) {
+    (void)gdr_unpin_buffer(ctx_->handle(), mh_);
+  }
+}
+
+void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(mh_, (void*)hostDstPtr_, src, size); }
+
+}  // namespace mscclpp
+
+#else  // !MSCCLPP_USE_GDRCOPY
+
+namespace mscclpp {
+
+bool gdrEnabled() { return false; }
+
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_USE_GDRCOPY
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 06e733c72..536d33b78 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -5,6 +5,7 @@
 #define MSCCLPP_CONNECTION_HPP_
 
 #include <atomic>
+#include <memory>
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
 #include <mutex>
@@ -15,6 +16,7 @@
 #include "communicator.hpp"
 #include "context.hpp"
 #include "endpoint.hpp"
+#include "gdr.hpp"
 #include "ib.hpp"
 #include "registered_memory.hpp"
 #include "socket.h"
@@ -38,8 +40,13 @@ class BaseConnection {
   /// Set the local address where remote updateAndSync operations should write.
   /// This is called by the receiver to specify where incoming signals should be written.
   /// Default implementation is a no-op for connections that don't need it.
-  /// @param addr The local address for incoming writes.
-  virtual void setRemoteUpdateDstAddr(uint64_t /*addr*/) {}
+  /// @param gpuMem Shared pointer to the GPU/CPU memory for incoming writes (nullptr to clear).
+  virtual void setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> /*gpuMem*/) {}
+
+  /// Whether this connection uses a recv thread for signaling (host-no-atomic mode).
+  /// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to.
+  /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
+  virtual bool usesRecvThread() const { return false; }
 
   virtual Transport transport() const = 0;
 
@@ -98,6 +105,7 @@ class IBConnection : public BaseConnection {
   // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal
   // instead of atomic operations, with a host thread forwarding to GPU for memory consistency.
   bool ibNoAtomic_;
+  bool flushSupported_;  // Whether cuFlushGPUDirectRDMAWrites is supported on this GPU
   std::thread recvThread_;
   std::atomic<bool> stopRecvThread_;
   int localGpuDeviceId_;  // Local GPU device ID for setting CUDA context in recv thread
@@ -108,6 +116,10 @@ class IBConnection : public BaseConnection {
   // - Receiver: uses remoteUpdateDstAddr_ (set via setRemoteUpdateDstAddr) to know where to write
   uint64_t remoteUpdateDstAddr_;
 
+#ifdef MSCCLPP_USE_GDRCOPY
+  std::unique_ptr<GdrMap> remoteUpdateDstAddrMap_;
+#endif
+
   void recvThreadFunc();
 
  public:
@@ -116,8 +128,10 @@ class IBConnection : public BaseConnection {
 
   /// Set the local address where remote updateAndSync operations will write.
   /// Must be called before the remote sends any updateAndSync in host-no-atomic mode.
-  /// @param addr The local address for incoming writes.
-  void setRemoteUpdateDstAddr(uint64_t addr) override;
+  /// @param gpuMem Shared pointer to the GPU/CPU memory for incoming writes (nullptr to clear).
+  void setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) override;
+
+  bool usesRecvThread() const override;
 
   Transport transport() const override;
 
diff --git a/src/core/include/context.hpp b/src/core/include/context.hpp
index ee84d0f7b..42d03db15 100644
--- a/src/core/include/context.hpp
+++ b/src/core/include/context.hpp
@@ -42,8 +42,6 @@ struct Context::Impl {
   std::shared_ptr<TokenPool> tokenPool_;
   const size_t maxNumTokens_ = 1 << 15;  // 32K tokens
 
-  Impl();
-
   IbCtx* getIbContext(Transport ibTransport);
   std::shared_ptr<uint64_t> getToken();
 };
diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp
new file mode 100644
index 000000000..03047c00c
--- /dev/null
+++ b/src/core/include/gdr.hpp
@@ -0,0 +1,57 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_GDR_HPP_
+#define MSCCLPP_GDR_HPP_
+
+namespace mscclpp {
+
+/// Whether the global GDRCopy context is enabled.
+bool gdrEnabled();
+
+}  // namespace mscclpp
+
+#ifdef MSCCLPP_USE_GDRCOPY
+
+#include <gdrapi.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace mscclpp {
+
+class GdrContext;
+
+/// RAII wrapper for a per-connection GDRCopy BAR1 mapping of a GPU address.
+class GdrMap {
+ public:
+  /// Pin and map a GPU address for direct host-side access.
+  /// Holds a shared reference to the GPU memory to keep it alive.
+  /// @param gpuMem   Shared pointer to the GPU memory (e.g. from gpuCallocShared).
+  /// @param deviceId The CUDA device ID for setting context.
+  GdrMap(std::shared_ptr<void> gpuMem, int deviceId);
+  ~GdrMap();
+
+  GdrMap(const GdrMap&) = delete;
+  GdrMap& operator=(const GdrMap&) = delete;
+
+  /// Whether the mapping was established successfully.
+  bool valid() const { return hostDstPtr_ != nullptr; }
+
+  /// Copy data from host memory to the mapped GPU location.
+  void copyTo(const void* src, size_t size);
+
+ private:
+  std::shared_ptr<GdrContext> ctx_;
+  std::shared_ptr<void> gpuMem_;
+  gdr_mh_t mh_{};
+  void* barPtr_ = nullptr;
+  volatile uint64_t* hostDstPtr_ = nullptr;
+  size_t mappedSize_ = 0;
+};
+
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_USE_GDRCOPY
+#endif  // MSCCLPP_GDR_HPP_
diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc
index c6eb1e232..8d9382382 100644
--- a/src/core/semaphore.cc
+++ b/src/core/semaphore.cc
@@ -8,6 +8,7 @@
 #include "atomic.hpp"
 #include "connection.hpp"
 #include "context.hpp"
+#include "logger.hpp"
 #include "registered_memory.hpp"
 #include "serialization.hpp"
 
@@ -48,12 +49,12 @@ SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection
     token_ = std::make_shared<uint64_t>(0);
   } else if (localDevice.type == DeviceType::GPU) {
     if (localDevice.id < 0) {
-      throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
+      THROW(CONN, Error, ErrorCode::InvalidUsage, "Local GPU ID is not provided");
     }
     CudaDeviceGuard deviceGuard(localDevice.id);
     token_ = gpuCallocToken(connection_.context());
   } else {
-    throw Error("Unsupported local device type", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Unsupported local device type");
   }
   idMemory_ = std::move(connection_.context()->registerMemory(token_.get(), sizeof(uint64_t), connection_.transport()));
 }
@@ -78,7 +79,7 @@ MSCCLPP_API_CPP SemaphoreStub SemaphoreStub::deserialize(const std::vector<char>
   RegisteredMemory idMemory(std::make_shared<RegisteredMemory::Impl>(data.begin(), memEnd));
   auto it = detail::deserialize(memEnd, device);
   if (it != data.end()) {
-    throw Error("SemaphoreStub deserialize failed", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "SemaphoreStub deserialize failed");
   }
   return SemaphoreStub(std::make_shared<Impl>(std::move(idMemory), device));
 }
@@ -119,15 +120,32 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
       expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()),
       outboundToken_(std::make_unique<uint64_t>()) {
   if (connection().localDevice().type != DeviceType::GPU) {
-    throw Error("Local endpoint device type of Host2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
   }
-  BaseConnection::getImpl(connection())
-      ->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(semaphore_.localMemory().data()));
+  auto connImpl = BaseConnection::getImpl(connection());
+  if (connImpl->usesRecvThread()) {
+    // Host-no-atomic mode: the recv thread writes the token to GPU memory.
+    // Allocate a separate inbound token via plain cudaMalloc (not TokenPool/VMM)
+    // so that it is always compatible with GDRCopy pinning (VMM memory cannot be pinned by gdr_pin_buffer).
+    CudaDeviceGuard deviceGuard(connection().localDevice().id);
+    inboundToken_ = detail::gpuCallocShared<uint64_t>();
+    connImpl->setRemoteUpdateDstAddr(inboundToken_);
+  }
+  // When usesRecvThread() is false (e.g., atomic mode), inboundToken_ stays null
+  // and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
 }
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, const Connection& connection)
     : Host2DeviceSemaphore(buildSemaphoreFromConnection(communicator, connection)) {}
 
+MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
+  if (inboundToken_) {
+    // Clear the connection's remote update address (and any associated GdrMap)
+    // before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
+    BaseConnection::getImpl(connection())->setRemoteUpdateDstAddr(nullptr);
+  }
+}
+
 MSCCLPP_API_CPP Connection& Host2DeviceSemaphore::connection() { return semaphore_.connection(); }
 
 MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
@@ -136,7 +154,11 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
   Host2DeviceSemaphore::DeviceHandle device;
-  device.inboundToken = reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
+  // If inboundToken_ is allocated (host-no-atomic mode), the GPU polls it.
+  // Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
+  // which is the same address targeted by the NIC's atomic operation.
+  device.inboundToken = inboundToken_ ? inboundToken_.get()
+                                      : reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
   device.expectedInboundToken = expectedInboundToken_.get();
   return device;
 }
@@ -146,13 +168,19 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
       expectedInboundToken_(std::make_unique<uint64_t>()),
       outboundToken_(std::make_unique<uint64_t>()) {
   if (connection().transport() == Transport::CudaIpc) {
-    throw Error("Host2HostSemaphore cannot be used with CudaIpc transport", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Host2HostSemaphore cannot be used with CudaIpc transport");
   }
   if (connection().localDevice().type != DeviceType::CPU) {
-    throw Error("Local endpoint device type of Host2HostSemaphore should be CPU", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
+  }
+  auto connImpl = BaseConnection::getImpl(connection());
+  if (connImpl->usesRecvThread()) {
+    // Host-no-atomic mode: tell the recv thread where to write the incoming token.
+    // Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
+    auto token = std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()),
+                                           [](uint64_t*) {});
+    connImpl->setRemoteUpdateDstAddr(std::move(token));
   }
-  BaseConnection::getImpl(connection())
-      ->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(semaphore_.localMemory().data()));
 }
 
 MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(Communicator& communicator, const Connection& connection)
@@ -177,7 +205,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {
   while (atomicLoad(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), memoryOrderAcquire) <
          (*expectedInboundToken_)) {
     if (maxSpinCount >= 0 && spinCount++ == maxSpinCount) {
-      throw Error("Host2HostSemaphore::wait timed out", ErrorCode::Timeout);
+      THROW(CONN, Error, ErrorCode::Timeout, "Host2HostSemaphore::wait timed out");
     }
   }
 }
@@ -187,7 +215,8 @@ MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(const
       expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()),
       outboundToken_(detail::gpuCallocUnique<uint64_t>()) {
   if (connection().localDevice().type != DeviceType::GPU) {
-    throw Error("Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage,
+          "Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU");
   }
 }
 

From 04ebd9ba6e7ff5964941b1393d44dc7ec9b5c725 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 10:39:39 -0800
Subject: [PATCH 035/132] fix coverage file path

---
 .azure-pipelines/templates/ut.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index b25d11a92..bb5d25160 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -33,6 +33,8 @@ steps:
         cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
       fi
       make -j
+      cd ..
+      pwd > build_coverage/BUILD_PREFIX
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: DownloadSecureFile@1
@@ -133,6 +135,10 @@ steps:
         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
         export PATH=/usr/local/mpi/bin:\$PATH;                        \
         cd /root/mscclpp;                                             \
+        BUILD_PREFIX=\$(cat build_coverage/BUILD_PREFIX);             \
+        STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c);      \
+        export GCOV_PREFIX=/root/mscclpp;                             \
+        export GCOV_PREFIX_STRIP=\$STRIP_COUNT;                      \
         export LD_LIBRARY_PATH=/root/mscclpp/build_coverage/lib:\$LD_LIBRARY_PATH; \
         ./build_coverage/bin/unit_tests;                              \
         mpirun --allow-run-as-root -tag-output -np 2 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \

From 54e46ba8a6267edf9f13c6a7791a08f848878044 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 11:31:33 -0800
Subject: [PATCH 036/132] rocm fix wip

---
 src/core/connection.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index 525fb4984..e1a567b59 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -238,10 +238,12 @@ void IBConnection::recvThreadFunc() {
       // completion appears, but the data may still be in-flight in PCIe / GPU internal fabric.
       // cuFlushGPUDirectRDMAWrites ensures all prior NIC writes are committed to device memory
       // before we update the semaphore token, so the GPU kernel sees data before the flag.
+#if !defined(MSCCLPP_USE_ROCM)
       if (flushSupported_) {
         MSCCLPP_CUTHROW(cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
                                                    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER));
       }
+#endif
 
       // Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr)
       uint64_t dstGpuAddr = remoteUpdateDstAddr_;
@@ -308,7 +310,7 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
       if (flushSupported_) {
         INFO(CONN, "cuFlushGPUDirectRDMAWrites is supported on GPU ", localGpuDeviceId_);
       } else {
-        WARN(NET, "cuFlushGPUDirectRDMAWrites is NOT supported on GPU ", localGpuDeviceId_,
+        WARN(CONN, "cuFlushGPUDirectRDMAWrites is NOT supported on GPU ", localGpuDeviceId_,
              ". RDMA write ordering to GPU memory is not guaranteed.");
       }
     }

From 6c2bc8f4b391864afc71c36470b9e61e22ae97d7 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 11:32:50 -0800
Subject: [PATCH 037/132] coverage fix

---
 .azure-pipelines/templates/ut.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index bb5d25160..f20b6b3b2 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -28,9 +28,9 @@ steps:
       cd ..
       mkdir build_coverage && cd build_coverage
       if [ "${{ parameters.platform }}" == "rocm" ]; then
-        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
+        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
       else
-        cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
+        cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
       fi
       make -j
       cd ..

From d0c709ea8201a2ee4f59bcb14af4c358c1957641 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 14:30:43 -0800
Subject: [PATCH 038/132] Fix Codecov token usage in coverage upload step

---
 .azure-pipelines/templates/ut.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index f20b6b3b2..ba2a3aebb 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -175,8 +175,10 @@ steps:
       set -e
       curl -Os https://cli.codecov.io/latest/linux/codecov
       chmod +x codecov
-      ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
+      ./codecov upload-process --disable-search -t $CODECOV_TOKEN -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
+  env:
+    CODECOV_TOKEN: $(CODECOV_TOKEN)
 
 - task: Bash@3
   name: PyTests

From 2adf4a48e23ed243e8a00d61e6012c3a016be67a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 16:49:39 -0800
Subject: [PATCH 039/132] use variable group

---
 .azure-pipelines/templates/ut.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index ba2a3aebb..f8234edd1 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -10,6 +10,8 @@ parameters:
   default: 'cuda'
 - name: gpuArch
   type: string
+variables:
+- group: mscclpp
 
 steps:
 - task: Bash@3
@@ -175,10 +177,8 @@ steps:
       set -e
       curl -Os https://cli.codecov.io/latest/linux/codecov
       chmod +x codecov
-      ./codecov upload-process --disable-search -t $CODECOV_TOKEN -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
+      ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
-  env:
-    CODECOV_TOKEN: $(CODECOV_TOKEN)
 
 - task: Bash@3
   name: PyTests

From 98b023adc6b7e0e65b42912577697f62306953f4 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 18:13:57 -0800
Subject: [PATCH 040/132] rocm fixes

---
 src/core/connection.cc          | 42 +++++++++++++++------------------
 src/core/include/connection.hpp |  2 +-
 src/core/semaphore.cc           |  4 ++++
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index e1a567b59..04619b378 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -199,14 +199,9 @@ void IBConnection::recvThreadFunc() {
 
   // Host-side buffer to receive newValue from imm_data (need 64-bit for cudaMemcpy)
   bool useGdr = gdrEnabled();
-  uint64_t* newValueHost;
-  if (useGdr) {
-    newValueHost = new uint64_t(0);
-  } else {
-    // Use pinned host memory for reliable cudaMemcpyAsync from a non-default stream
-    MSCCLPP_CUDATHROW(cudaHostAlloc(&newValueHost, sizeof(uint64_t), cudaHostAllocDefault));
-    *newValueHost = 0;
-  }
+  // Use pinned host memory for reliable cudaMemcpyAsync from a non-default stream.
+  auto newValueHostPtr = useGdr ? nullptr : detail::gpuCallocHostShared<uint64_t>();
+  uint64_t* newValueHost = useGdr ? new uint64_t(0) : newValueHostPtr.get();
 
   while (!stopRecvThread_.load(std::memory_order_relaxed)) {
     auto qp = qp_.lock();
@@ -256,11 +251,16 @@ void IBConnection::recvThreadFunc() {
           remoteUpdateDstAddrMap_->copyTo(newValueHost, sizeof(uint64_t));
         } else
 #endif
-        if (signalStream_ != nullptr) {
-          // Fallback: use cudaMemcpyAsync with our dedicated stream
-          MSCCLPP_CUDATHROW(
-              cudaMemcpyAsync(dstPtr, newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_));
+#if defined(MSCCLPP_USE_ROCM)
+        {
+          *dstPtr = *newValueHost;
         }
+#else
+        if (signalStream_) {
+          // Fallback: use gpuMemcpyAsync with our dedicated stream
+          gpuMemcpyAsync(dstPtr, newValueHost, 1, *signalStream_, cudaMemcpyHostToDevice);
+        }
+#endif
       }
 
       // Post another recv for future messages
@@ -269,11 +269,9 @@ void IBConnection::recvThreadFunc() {
     }
   }
 
-  // Clean up the host-side buffer
+  // Clean up the host-side buffer (non-GDR path is auto-freed by shared_ptr)
   if (useGdr) {
     delete newValueHost;
-  } else {
-    MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaFreeHost(newValueHost));
   }
 }
 
@@ -305,7 +303,6 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
       if (cuDeviceGet(&cuDev, localGpuDeviceId_) == CUDA_SUCCESS) {
         cuDeviceGetAttribute(&flushOptions, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, cuDev);
       }
-#endif
       flushSupported_ = (flushOptions & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0;
       if (flushSupported_) {
         INFO(CONN, "cuFlushGPUDirectRDMAWrites is supported on GPU ", localGpuDeviceId_);
@@ -313,12 +310,16 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
         WARN(CONN, "cuFlushGPUDirectRDMAWrites is NOT supported on GPU ", localGpuDeviceId_,
              ". RDMA write ordering to GPU memory is not guaranteed.");
       }
+#endif
     }
 
-    // Create a CUDA stream for async memory copies (not needed when GDRCopy is available)
+    // Create a CUDA stream for async memory copies (not needed when GDRCopy is available,
+    // nor on ROCm where GPU memory is host-accessible and we write directly)
+#if !defined(MSCCLPP_USE_ROCM)
     if (!gdrEnabled()) {
-      MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking));
+      signalStream_ = std::make_unique<CudaStreamWithFlags>(cudaStreamNonBlocking);
     }
+#endif
 
     // Pre-post receive requests for incoming write-with-imm
     auto qp = qp_.lock();
@@ -341,11 +342,6 @@ IBConnection::~IBConnection() {
     if (recvThread_.joinable()) {
       recvThread_.join();
     }
-    if (signalStream_ != nullptr) {
-      // Synchronize stream to ensure all async copies are complete before destruction
-      (void)cudaStreamSynchronize(signalStream_);
-      (void)cudaStreamDestroy(signalStream_);
-    }
   }
 }
 
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 536d33b78..5eefd6628 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -109,7 +109,7 @@ class IBConnection : public BaseConnection {
   std::thread recvThread_;
   std::atomic<bool> stopRecvThread_;
   int localGpuDeviceId_;  // Local GPU device ID for setting CUDA context in recv thread
-  cudaStream_t signalStream_;
+  std::unique_ptr<CudaStreamWithFlags> signalStream_;
 
   // Write-with-imm design:
   // - Sender: 0-byte RDMA write-with-imm to dst MR, newValue in imm_data (32-bit)
diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc
index 8d9382382..6a757b72a 100644
--- a/src/core/semaphore.cc
+++ b/src/core/semaphore.cc
@@ -128,7 +128,11 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
     // Allocate a separate inbound token via plain cudaMalloc (not TokenPool/VMM)
     // so that it is always compatible with GDRCopy pinning (VMM memory cannot be pinned by gdr_pin_buffer).
     CudaDeviceGuard deviceGuard(connection().localDevice().id);
+#if defined(MSCCLPP_USE_ROCM)
+    inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
+#else
     inboundToken_ = detail::gpuCallocShared<uint64_t>();
+#endif
     connImpl->setRemoteUpdateDstAddr(inboundToken_);
   }
   // When usesRecvThread() is false (e.g., atomic mode), inboundToken_ stays null

From 22e5efb8ddb304e9f7c75bbb46c3cbd84b2fd39a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 18:15:38 -0800
Subject: [PATCH 041/132] gdrcopy install in container

---
 docker/base-dev-x.dockerfile | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile
index 3aa814221..5af702df1 100644
--- a/docker/base-dev-x.dockerfile
+++ b/docker/base-dev-x.dockerfile
@@ -24,8 +24,26 @@ RUN OS_ARCH=$(uname -m) && \
     rm -rf ${CMAKE_HOME}.tar.gz && \
     ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
 
-# Install ROCm-specific packages if building for ROCm
+# Install GDRCopy userspace library for CUDA targets
 ARG TARGET="cuda13.0"
+RUN if echo "$TARGET" | grep -q "^cuda"; then \
+        GDRCOPY_VERSION="2.5.1" && \
+        apt-get update -y && \
+        apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \
+        cd /tmp && \
+        curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \
+        tar xzf gdrcopy.tar.gz && \
+        cd gdrcopy-${GDRCOPY_VERSION}/packages && \
+        CUDA=$(ls -d /usr/local/cuda-* 2>/dev/null | head -1) && \
+        ./build-deb-packages.sh -k -c "$CUDA" && \
+        dpkg -i libgdrapi_*.deb && \
+        cd / && rm -rf /tmp/gdrcopy* && \
+        apt-get autoremove -y && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* /tmp/*; \
+    fi
+
+# Install ROCm-specific packages if building for ROCm
 RUN if echo "$TARGET" | grep -q "^rocm"; then \
         apt-get update -y && \
         apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \

From 2f27d7d7fe32a3221f616d4aae050b0dd38f209f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 18:25:10 -0800
Subject: [PATCH 042/132] Update coverage report to exclude additional
 directories in lcov command

---
 .azure-pipelines/templates/ut.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index f8234edd1..78a12b166 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -147,7 +147,7 @@ steps:
         mpirun --allow-run-as-root -tag-output -np 4 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
         cd build_coverage;                                            \
         lcov --directory . --capture --output-file coverage.info;  \
-        lcov --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info;  \
+        lcov --remove coverage.info '/usr/*' '*/test/*' '*/build/*' '*/nlohmann/*' '*/_deps/*' --output-file coverage.info;  \
         lcov --list coverage.info"'
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'

From d88ee8de9c79169192a26ab6318f34b790d09954 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 18:27:14 -0800
Subject: [PATCH 043/132] Refine coverage report to include only mscclpp source
 and include directories

---
 .azure-pipelines/templates/ut.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index 78a12b166..128a7a970 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -147,7 +147,7 @@ steps:
         mpirun --allow-run-as-root -tag-output -np 4 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
         cd build_coverage;                                            \
         lcov --directory . --capture --output-file coverage.info;  \
-        lcov --remove coverage.info '/usr/*' '*/test/*' '*/build/*' '*/nlohmann/*' '*/_deps/*' --output-file coverage.info;  \
+        lcov --extract coverage.info '*/mscclpp/src/*' '*/mscclpp/include/*' --output-file coverage.info;  \
         lcov --list coverage.info"'
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'

From 11e27e29784428b1732a9898f4891cbc3cce9461 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 18:33:11 -0800
Subject: [PATCH 044/132] Update coverage report commands to handle errors and
 adjust paths

---
 .azure-pipelines/templates/ut.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index 128a7a970..6f4206fcc 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -146,8 +146,8 @@ steps:
         mpirun --allow-run-as-root -tag-output -np 2 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
         mpirun --allow-run-as-root -tag-output -np 4 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
         cd build_coverage;                                            \
-        lcov --directory . --capture --output-file coverage.info;  \
-        lcov --extract coverage.info '*/mscclpp/src/*' '*/mscclpp/include/*' --output-file coverage.info;  \
+        lcov --directory . --capture --output-file coverage.info --ignore-errors inconsistent;  \
+        lcov --extract coverage.info \"\${BUILD_PREFIX}/src/*\" \"\${BUILD_PREFIX}/include/mscclpp/*\" --output-file coverage.info;  \
         lcov --list coverage.info"'
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'

From 25f31b499e5a3c197e9c522231ba8df630c7504e Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 19:13:10 -0800
Subject: [PATCH 045/132] updates

---
 docker/base-dev-x.dockerfile | 3 +--
 src/core/semaphore.cc        | 8 ++++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile
index 5af702df1..a71449d74 100644
--- a/docker/base-dev-x.dockerfile
+++ b/docker/base-dev-x.dockerfile
@@ -34,8 +34,7 @@ RUN if echo "$TARGET" | grep -q "^cuda"; then \
         curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \
         tar xzf gdrcopy.tar.gz && \
         cd gdrcopy-${GDRCOPY_VERSION}/packages && \
-        CUDA=$(ls -d /usr/local/cuda-* 2>/dev/null | head -1) && \
-        ./build-deb-packages.sh -k -c "$CUDA" && \
+        ./build-deb-packages.sh -k -t && \
         dpkg -i libgdrapi_*.deb && \
         cd / && rm -rf /tmp/gdrcopy* && \
         apt-get autoremove -y && \
diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc
index 6a757b72a..e2dadb19e 100644
--- a/src/core/semaphore.cc
+++ b/src/core/semaphore.cc
@@ -161,8 +161,8 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceH
   // If inboundToken_ is allocated (host-no-atomic mode), the GPU polls it.
   // Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
   // which is the same address targeted by the NIC's atomic operation.
-  device.inboundToken = inboundToken_ ? inboundToken_.get()
-                                      : reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
+  device.inboundToken =
+      inboundToken_ ? inboundToken_.get() : reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
   device.expectedInboundToken = expectedInboundToken_.get();
   return device;
 }
@@ -181,8 +181,8 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
   if (connImpl->usesRecvThread()) {
     // Host-no-atomic mode: tell the recv thread where to write the incoming token.
     // Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
-    auto token = std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()),
-                                           [](uint64_t*) {});
+    auto token =
+        std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), [](uint64_t*) {});
     connImpl->setRemoteUpdateDstAddr(std::move(token));
   }
 }

From ac4d7130621b1c655f2784295049ef98b04b91d1 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 23 Feb 2026 20:08:15 -0800
Subject: [PATCH 046/132] updates

---
 include/mscclpp/semaphore.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp
index edfa51685..058f35bb7 100644
--- a/include/mscclpp/semaphore.hpp
+++ b/include/mscclpp/semaphore.hpp
@@ -33,6 +33,12 @@ class Host2DeviceSemaphore {
   /// Destructor.
   ~Host2DeviceSemaphore();
 
+  /// Move constructor.
+  Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;
+
+  /// Move assignment operator.
+  Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;
+
   /// Returns the connection.
   /// @return The connection associated with this semaphore.
   Connection& connection();

From ac022c333c0a11947f3e1f7c3205f77551c7f178 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 24 Feb 2026 20:25:25 -0800
Subject: [PATCH 047/132] a few updates

---
 src/core/connection.cc          | 149 ++++++++++++++++----------------
 src/core/endpoint.cc            |  23 +++++
 src/core/gdr.cc                 |  34 +++++---
 src/core/include/connection.hpp |  24 ++---
 src/core/include/endpoint.hpp   |   8 ++
 src/core/include/gdr.hpp        |  58 ++++++++++---
 6 files changed, 188 insertions(+), 108 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index 04619b378..2b7801f53 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -197,11 +197,7 @@ void IBConnection::recvThreadFunc() {
     }
   }
 
-  // Host-side buffer to receive newValue from imm_data (need 64-bit for cudaMemcpy)
-  bool useGdr = gdrEnabled();
-  // Use pinned host memory for reliable cudaMemcpyAsync from a non-default stream.
-  auto newValueHostPtr = useGdr ? nullptr : detail::gpuCallocHostShared<uint64_t>();
-  uint64_t* newValueHost = useGdr ? new uint64_t(0) : newValueHostPtr.get();
+  uint64_t newValueHost = 0;
 
   while (!stopRecvThread_.load(std::memory_order_relaxed)) {
     auto qp = qp_.lock();
@@ -223,21 +219,19 @@ void IBConnection::recvThreadFunc() {
         continue;
       }
 
-      // The imm_data contains newValue (32-bit, extended to 64-bit)
-      // Note: getRecvWcImmData already converts from network byte order via ntohl
-      unsigned int immData = qp->getRecvWcImmData(i);
-      *newValueHost = static_cast<uint64_t>(immData);
-
-      // Flush all in-flight GPUDirect RDMA writes to GPU device memory.
-      // IB guarantees that prior RDMA data writes have been sent before the write-with-imm
-      // completion appears, but the data may still be in-flight in PCIe / GPU internal fabric.
-      // cuFlushGPUDirectRDMAWrites ensures all prior NIC writes are committed to device memory
-      // before we update the semaphore token, so the GPU kernel sees data before the flag.
-#if !defined(MSCCLPP_USE_ROCM)
-      if (flushSupported_) {
-        MSCCLPP_CUTHROW(cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
-                                                   CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER));
-      }
+      // Read the token value written by the remote sender.
+#if defined(DEBUG_CUFLUSH) && defined(MSCCLPP_USE_CUDA)
+      // cuFlush path: read from imm_data then flush NIC->GPU write pipeline for visibility.
+      newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
+      MSCCLPP_CUTHROW(cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
+                                                 CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER));
+#else
+      // Read the 64-bit token from the local signal GPU buffer via volatile load.
+      // localSignalGpuPtr_ points to either a GDRCopy BAR1 mapping (CUDA) or the
+      // GPU buffer directly (ROCm system-coherent/uncached memory). volatile is not
+      // strictly needed here (uncacheable memory and intervening function calls prevent
+      // stale reads), but is kept as a convention for NIC-written memory.
+      newValueHost = *static_cast<volatile uint64_t*>(localSignalGpuPtr_);
 #endif
 
       // Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr)
@@ -245,22 +239,12 @@ void IBConnection::recvThreadFunc() {
       if (dstGpuAddr != 0) {
         uint64_t* dstPtr = reinterpret_cast<uint64_t*>(dstGpuAddr);
 
-#ifdef MSCCLPP_USE_GDRCOPY
-        if (useGdr && remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid()) {
+        if (remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid()) {
           // Direct host-side write to GPU memory via GDRCopy BAR1 mapping
-          remoteUpdateDstAddrMap_->copyTo(newValueHost, sizeof(uint64_t));
-        } else
-#endif
-#if defined(MSCCLPP_USE_ROCM)
-        {
-          *dstPtr = *newValueHost;
-        }
-#else
-        if (signalStream_) {
-          // Fallback: use gpuMemcpyAsync with our dedicated stream
-          gpuMemcpyAsync(dstPtr, newValueHost, 1, *signalStream_, cudaMemcpyHostToDevice);
+          remoteUpdateDstAddrMap_->copyTo(&newValueHost, sizeof(uint64_t));
+        } else {
+          *dstPtr = newValueHost;
         }
-#endif
       }
 
       // Post another recv for future messages
@@ -268,11 +252,6 @@ void IBConnection::recvThreadFunc() {
       qp->postRecv();
     }
   }
-
-  // Clean up the host-side buffer (non-GDR path is auto-freed by shared_ptr)
-  if (useGdr) {
-    delete newValueHost;
-  }
 }
 
 IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint,
@@ -280,46 +259,64 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
     : BaseConnection(context, localEndpoint),
       transport_(localEndpoint.transport()),
       remoteTransport_(remoteEndpoint.transport()),
-      dummyAtomicSource_(std::make_unique<uint64_t>(0)),
+      atomicSrc_(std::make_unique<uint64_t>(0)),
       ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
-      flushSupported_(false),
       stopRecvThread_(false),
       localGpuDeviceId_(localEndpoint.device().id),
-      signalStream_(nullptr),
-      remoteUpdateDstAddr_(0) {
+      remoteUpdateDstAddr_(0),
+      remoteSignalGpuMrInfo_{0, 0},
+      localSignalGpuPtr_(nullptr) {
   qp_ = getImpl(localEndpoint).ibQp_;
   qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
   qp_.lock()->rts();
-  dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_);
-  validateTransport(dummyAtomicSourceMem_, transport_);
-  dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_);
+  atomicSrcMem_ = context->registerMemory(atomicSrc_.get(), sizeof(uint64_t), transport_);
+  validateTransport(atomicSrcMem_, transport_);
+  atomicSrcTransportInfo_ = getImpl(atomicSrcMem_).getTransportInfo(transport_);
 
   if (ibNoAtomic_) {
-    // Check if cuFlushGPUDirectRDMAWrites is supported on this GPU
-    if (localGpuDeviceId_ >= 0) {
-      int flushOptions = 0;
-#if !defined(MSCCLPP_USE_ROCM)
-      CUdevice cuDev;
-      if (cuDeviceGet(&cuDev, localGpuDeviceId_) == CUDA_SUCCESS) {
-        cuDeviceGetAttribute(&flushOptions, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, cuDev);
-      }
-      flushSupported_ = (flushOptions & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) != 0;
-      if (flushSupported_) {
-        INFO(CONN, "cuFlushGPUDirectRDMAWrites is supported on GPU ", localGpuDeviceId_);
-      } else {
-        WARN(CONN, "cuFlushGPUDirectRDMAWrites is NOT supported on GPU ", localGpuDeviceId_,
-             ". RDMA write ordering to GPU memory is not guaranteed.");
+#if defined(MSCCLPP_USE_CUDA)
+    if (!gdrEnabled()) {
+      const char* reason = "unknown";
+      switch (gdrStatus()) {
+        case GdrStatus::NotBuilt:
+          reason = "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
+          break;
+        case GdrStatus::Disabled:
+          reason = "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
+          break;
+        case GdrStatus::DriverMissing:
+          reason = "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
+          break;
+        case GdrStatus::OpenFailed:
+          reason = "gdr_open() failed; GDRCopy driver may be misconfigured";
+          break;
+        default:
+          break;
       }
-#endif
+      THROW(CONN, Error, ErrorCode::InvalidUsage,
+            "IB host-no-atomic mode on CUDA requires GDRCopy: ", reason);
     }
+#endif
 
-    // Create a CUDA stream for async memory copies (not needed when GDRCopy is available,
-    // nor on ROCm where GPU memory is host-accessible and we write directly)
-#if !defined(MSCCLPP_USE_ROCM)
-    if (!gdrEnabled()) {
-      signalStream_ = std::make_unique<CudaStreamWithFlags>(cudaStreamNonBlocking);
+    // Extract remote endpoint's signal GPU buffer MR info for write-with-imm destination
+    const auto& remoteImpl = getImpl(remoteEndpoint);
+    remoteSignalGpuMrInfo_ = remoteImpl.ibSignalGpuMrInfo_;
+
+    // Create a GDR mapping of the local signal GPU buffer. recvThreadFunc reads the
+    // 64-bit token via localSignalGpuPtr_, which points to the BAR1-mapped host address
+    // (CUDA/GDRCopy) or the GPU buffer directly (ROCm system-coherent memory).
+    const auto& localImpl = getImpl(localEndpoint);
+    if (gdrEnabled() && localImpl.ibSignalGpuBuffer_) {
+      localSignalGpuMap_ =
+          std::make_unique<GdrMap>(std::static_pointer_cast<void>(localImpl.ibSignalGpuBuffer_), localGpuDeviceId_);
+    }
+    if (localSignalGpuMap_ && localSignalGpuMap_->valid()) {
+      // Use the BAR1-mapped host pointer; uncacheable MMIO ensures ordered volatile reads.
+      localSignalGpuPtr_ = localSignalGpuMap_->hostPtr();
+    } else if (localImpl.ibSignalGpuBuffer_) {
+      // ROCm: GPU memory is system-coherent, so direct volatile read is safe.
+      localSignalGpuPtr_ = reinterpret_cast<uint64_t*>(localImpl.ibSignalGpuBuffer_.get());
     }
-#endif
 
     // Pre-post receive requests for incoming write-with-imm
     auto qp = qp_.lock();
@@ -353,7 +350,6 @@ bool IBConnection::usesRecvThread() const { return ibNoAtomic_; }
 
 void IBConnection::setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) {
   remoteUpdateDstAddr_ = reinterpret_cast<uint64_t>(gpuMem.get());
-#ifdef MSCCLPP_USE_GDRCOPY
   if (gdrEnabled()) {
     if (gpuMem) {
       remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(gpuMem), localGpuDeviceId_);
@@ -361,7 +357,6 @@ void IBConnection::setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) {
       remoteUpdateDstAddrMap_.reset();
     }
   }
-#endif
   INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)remoteUpdateDstAddr_);
 }
 
@@ -415,22 +410,24 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
   *src = newValue;
 
   if (ibNoAtomic_) {
-    // Use RDMA write-with-imm instead of atomic operation
-    // Send only newValue in imm_data (0-byte write)
-    // The remote's recvThreadFunc will use its stored remoteUpdateDstAddr_ to write
+    // Use RDMA write-with-imm instead of atomic operation.
+    // Write the token value (8 bytes) from the local host buffer to the remote signal GPU buffer,
+    // with newValue also in imm_data (32-bit). The remote's recvThreadFunc reads the token from
+    // the signal GPU buffer and forwards it to the semaphore's inbound token address.
 
     // Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit)
     unsigned int immData = static_cast<unsigned int>(newValue);
 
-    // Send 0-byte write-with-imm; use dstMrInfo as target (we don't actually write anything)
-    qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
-                                      /*size=*/0, /*wrId=*/0,
+    // Write the real token value into the host buffer, then RDMA write host->remote GPU
+    *atomicSrc_ = newValue;
+    qp_.lock()->stageSendWriteWithImm(atomicSrcTransportInfo_.ibMr, remoteSignalGpuMrInfo_,
+                                      /*size=*/sizeof(uint64_t), /*wrId=*/0,
                                       /*srcOffset=*/0, /*dstOffset=*/0,
                                       /*signaled=*/true, /*immData=*/immData);
     qp_.lock()->postSend();
     INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue);
   } else {
-    qp_.lock()->stageSendAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
+    qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
                                    /*signaled=*/true);
     qp_.lock()->postSend();
     INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc
index 4795aa626..056538856 100644
--- a/src/core/endpoint.cc
+++ b/src/core/endpoint.cc
@@ -53,6 +53,21 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
                 ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
                            config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend);
     ibQpInfo_ = ibQp_->getInfo();
+
+    // Allocate a 64-bit signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
+    if (ibNoAtomic_ && config_.device.type == DeviceType::GPU && config_.device.id >= 0) {
+      CudaDeviceGuard deviceGuard(config_.device.id);
+#if defined(MSCCLPP_DEVICE_HIP)
+      ibSignalGpuBuffer_ = detail::gpuCallocUncachedShared<uint64_t>();
+#else
+      ibSignalGpuBuffer_ = detail::gpuCallocShared<uint64_t>();
+#endif
+      ibSignalGpuMr_ =
+          contextImpl.getIbContext(config_.transport)->registerMr(ibSignalGpuBuffer_.get(), sizeof(uint64_t));
+      ibSignalGpuMrInfo_ = ibSignalGpuMr_->getInfo();
+    } else {
+      ibSignalGpuMrInfo_ = {0, 0};
+    }
   } else if (config_.transport == Transport::Ethernet) {
     // Configuring Ethernet Interfaces
     abortFlag_ = 0;
@@ -74,6 +89,10 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
   if (AllIBTransports.has(config_.transport)) {
     ibLocal_ = false;
     it = detail::deserialize(it, ibQpInfo_);
+    it = detail::deserialize(it, ibNoAtomic_);
+    if (ibNoAtomic_) {
+      it = detail::deserialize(it, ibSignalGpuMrInfo_);
+    }
   } else if (config_.transport == Transport::Ethernet) {
     it = detail::deserialize(it, socketAddress_);
   }
@@ -103,6 +122,10 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
   detail::serialize(data, pimpl_->pidHash_);
   if (AllIBTransports.has(pimpl_->config_.transport)) {
     detail::serialize(data, pimpl_->ibQpInfo_);
+    detail::serialize(data, pimpl_->ibNoAtomic_);
+    if (pimpl_->ibNoAtomic_) {
+      detail::serialize(data, pimpl_->ibSignalGpuMrInfo_);
+    }
   } else if (pimpl_->config_.transport == Transport::Ethernet) {
     detail::serialize(data, pimpl_->socketAddress_);
   }
diff --git a/src/core/gdr.cc b/src/core/gdr.cc
index 2f9176adb..b85174a4f 100644
--- a/src/core/gdr.cc
+++ b/src/core/gdr.cc
@@ -3,7 +3,7 @@
 
 #include "gdr.hpp"
 
-#ifdef MSCCLPP_USE_GDRCOPY
+#if defined(MSCCLPP_USE_GDRCOPY)
 
 #include <unistd.h>
 
@@ -28,12 +28,12 @@ class GdrContext {
   GdrContext(const GdrContext&) = delete;
   GdrContext& operator=(const GdrContext&) = delete;
 
-  bool enabled() const { return enabled_; }
+  GdrStatus status() const { return status_; }
   gdr_t handle() const { return handle_; }
 
  private:
-  bool enabled_ = false;
-  gdr_t handle_ = nullptr;
+  GdrStatus status_;
+  gdr_t handle_;
 };
 
 static std::shared_ptr<GdrContext> gdrContext() {
@@ -41,27 +41,32 @@ static std::shared_ptr<GdrContext> gdrContext() {
   return instance;
 }
 
-bool gdrEnabled() { return gdrContext()->enabled(); }
+GdrStatus gdrStatus() { return gdrContext()->status(); }
 
-GdrContext::GdrContext() {
+bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
+
+GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
   if (env()->forceDisableGdr) {
     INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
+    status_ = GdrStatus::Disabled;
     return;
   }
 
   // Auto-detect: check if driver is available
   if (access("/dev/gdrdrv", F_OK) != 0) {
     INFO(GPU, "GDRCopy driver not detected, disabling GDRCopy");
+    status_ = GdrStatus::DriverMissing;
     return;
   }
 
   handle_ = gdr_open();
   if (handle_ == nullptr) {
     INFO(GPU, "gdr_open() failed, disabling GDRCopy");
+    status_ = GdrStatus::OpenFailed;
     return;
   }
 
-  enabled_ = true;
+  status_ = GdrStatus::Ok;
   INFO(GPU, "GDRCopy initialized successfully");
 }
 
@@ -74,7 +79,8 @@ GdrContext::~GdrContext() {
 
 // GdrMap
 
-GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : ctx_(gdrContext()), gpuMem_(std::move(gpuMem)) {
+GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId)
+    : ctx_(gdrContext()), gpuMem_(std::move(gpuMem)), mh_{}, barPtr_(nullptr), hostDstPtr_(nullptr), mappedSize_(0) {
   // Ensure CUDA device context is active for gdr_pin_buffer
   CudaDeviceGuard deviceGuard(deviceId);
 
@@ -96,7 +102,7 @@ GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : ctx_(gdrContext()),
     THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr);
   }
 
-  hostDstPtr_ = reinterpret_cast<volatile uint64_t*>(reinterpret_cast<char*>(barPtr_) + pageOffset);
+  hostDstPtr_ = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(barPtr_) + pageOffset);
 
   INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)hostDstPtr_);
 }
@@ -110,16 +116,20 @@ GdrMap::~GdrMap() {
   }
 }
 
-void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(mh_, (void*)hostDstPtr_, src, size); }
+void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(mh_, hostDstPtr_, src, size); }
+
+void GdrMap::copyFrom(void* dst, size_t size) const { gdr_copy_from_mapping(mh_, dst, hostDstPtr_, size); }
 
 }  // namespace mscclpp
 
-#else  // !MSCCLPP_USE_GDRCOPY
+#else  // !defined(MSCCLPP_USE_GDRCOPY)
 
 namespace mscclpp {
 
+GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }
+
 bool gdrEnabled() { return false; }
 
 }  // namespace mscclpp
 
-#endif  // MSCCLPP_USE_GDRCOPY
+#endif  // !defined(MSCCLPP_USE_GDRCOPY)
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 5eefd6628..2442f48ea 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -98,27 +98,31 @@ class IBConnection : public BaseConnection {
   Transport transport_;
   Transport remoteTransport_;
   std::weak_ptr<IbQp> qp_;
-  std::unique_ptr<uint64_t> dummyAtomicSource_;  // not used anywhere but IB needs a source
-  RegisteredMemory dummyAtomicSourceMem_;
-  mscclpp::TransportInfo dstTransportInfo_;
+  std::unique_ptr<uint64_t> atomicSrc_;
+  RegisteredMemory atomicSrcMem_;
+  mscclpp::TransportInfo atomicSrcTransportInfo_;
 
   // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal
   // instead of atomic operations, with a host thread forwarding to GPU for memory consistency.
   bool ibNoAtomic_;
-  bool flushSupported_;  // Whether cuFlushGPUDirectRDMAWrites is supported on this GPU
   std::thread recvThread_;
   std::atomic<bool> stopRecvThread_;
-  int localGpuDeviceId_;  // Local GPU device ID for setting CUDA context in recv thread
-  std::unique_ptr<CudaStreamWithFlags> signalStream_;
+  int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping
 
   // Write-with-imm design:
-  // - Sender: 0-byte RDMA write-with-imm to dst MR, newValue in imm_data (32-bit)
-  // - Receiver: uses remoteUpdateDstAddr_ (set via setRemoteUpdateDstAddr) to know where to write
+  // - Sender: 8-byte RDMA write-with-imm from local host buffer to remote signal GPU buffer,
+  //   carrying the token value both as RDMA payload and in imm_data (32-bit).
+  // - Receiver: reads the full 64-bit token from the local signal GPU buffer (via BAR1 or
+  //   volatile read), then writes it to remoteUpdateDstAddr_ (the semaphore's inbound token).
   uint64_t remoteUpdateDstAddr_;
 
-#ifdef MSCCLPP_USE_GDRCOPY
+  // Remote endpoint's signal GPU buffer MR info (destination for RDMA write-with-imm).
+  // The local host buffer (atomicSrc_ / atomicSrcTransportInfo_.ibMr) serves as the source.
+  IbMrInfo remoteSignalGpuMrInfo_;
+
   std::unique_ptr<GdrMap> remoteUpdateDstAddrMap_;
-#endif
+  std::unique_ptr<GdrMap> localSignalGpuMap_;
+  uint64_t* localSignalGpuPtr_;
 
   void recvThreadFunc();
 
diff --git a/src/core/include/endpoint.hpp b/src/core/include/endpoint.hpp
index 363faab19..1548d527c 100644
--- a/src/core/include/endpoint.hpp
+++ b/src/core/include/endpoint.hpp
@@ -6,6 +6,7 @@
 
 #include <memory>
 #include <mscclpp/core.hpp>
+#include <mscclpp/gpu_utils.hpp>
 #include <vector>
 
 #include "ib.hpp"
@@ -29,6 +30,13 @@ struct Endpoint::Impl {
   std::shared_ptr<IbQp> ibQp_;
   IbQpInfo ibQpInfo_;
 
+  // Signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
+  // Each endpoint allocates a 64-bit GPU buffer and registers it as an IB MR.
+  // The MR info is serialized/exchanged so the remote can RDMA-write to it.
+  std::shared_ptr<uint64_t> ibSignalGpuBuffer_;
+  std::unique_ptr<const IbMr> ibSignalGpuMr_;
+  IbMrInfo ibSignalGpuMrInfo_;
+
   // The following are only used for Ethernet and are undefined for other transports.
   std::unique_ptr<Socket> socket_;
   SocketAddress socketAddress_;
diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp
index 03047c00c..6663542a4 100644
--- a/src/core/include/gdr.hpp
+++ b/src/core/include/gdr.hpp
@@ -6,19 +6,30 @@
 
 namespace mscclpp {
 
-/// Whether the global GDRCopy context is enabled.
-bool gdrEnabled();
+enum class GdrStatus {
+  Ok,             // GDRCopy initialized successfully
+  NotBuilt,       // Built without MSCCLPP_USE_GDRCOPY
+  Disabled,       // Disabled via MSCCLPP_FORCE_DISABLE_GDR
+  DriverMissing,  // /dev/gdrdrv not found
+  OpenFailed,     // gdr_open() failed
+};
 
-}  // namespace mscclpp
+/// Return the detailed status of the global GDRCopy context.
+GdrStatus gdrStatus();
 
-#ifdef MSCCLPP_USE_GDRCOPY
+/// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok).
+bool gdrEnabled();
 
-#include <gdrapi.h>
+}  // namespace mscclpp
 
 #include <cstddef>
 #include <cstdint>
 #include <memory>
 
+#if defined(MSCCLPP_USE_GDRCOPY)
+
+#include <gdrapi.h>
+
 namespace mscclpp {
 
 class GdrContext;
@@ -39,19 +50,46 @@ class GdrMap {
   /// Whether the mapping was established successfully.
   bool valid() const { return hostDstPtr_ != nullptr; }
 
+  /// Return the BAR1-mapped host pointer to the GPU location.
+  uint64_t* hostPtr() const { return hostDstPtr_; }
+
   /// Copy data from host memory to the mapped GPU location.
   void copyTo(const void* src, size_t size);
 
+  /// Copy data from the mapped GPU location to host memory.
+  void copyFrom(void* dst, size_t size) const;
+
  private:
   std::shared_ptr<GdrContext> ctx_;
   std::shared_ptr<void> gpuMem_;
-  gdr_mh_t mh_{};
-  void* barPtr_ = nullptr;
-  volatile uint64_t* hostDstPtr_ = nullptr;
-  size_t mappedSize_ = 0;
+  gdr_mh_t mh_;
+  void* barPtr_;
+  uint64_t* hostDstPtr_;
+  size_t mappedSize_;
+};
+
+}  // namespace mscclpp
+
+#else  // !defined(MSCCLPP_USE_GDRCOPY)
+
+namespace mscclpp {
+
+/// Stub GdrMap when GDRCopy is not available.
+class GdrMap {
+ public:
+  GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
+  ~GdrMap() = default;
+
+  GdrMap(const GdrMap&) = delete;
+  GdrMap& operator=(const GdrMap&) = delete;
+
+  bool valid() const { return false; }
+  void copyTo(const void* /*src*/, size_t /*size*/) {}
+  void copyFrom(void* /*dst*/, size_t /*size*/) const {}
+  uint64_t* hostPtr() const { return nullptr; }
 };
 
 }  // namespace mscclpp
 
-#endif  // MSCCLPP_USE_GDRCOPY
+#endif  // !defined(MSCCLPP_USE_GDRCOPY)
 #endif  // MSCCLPP_GDR_HPP_

From 72407af2c186dcce2e27cae7a1dc994b0baf503f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 24 Feb 2026 20:28:32 -0800
Subject: [PATCH 048/132] License

---
 src/core/gdr.cc          | 2 +-
 src/core/include/gdr.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/gdr.cc b/src/core/gdr.cc
index b85174a4f..904e54133 100644
--- a/src/core/gdr.cc
+++ b/src/core/gdr.cc
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include "gdr.hpp"
 
diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp
index 6663542a4..bde2986ab 100644
--- a/src/core/include/gdr.hpp
+++ b/src/core/include/gdr.hpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #ifndef MSCCLPP_GDR_HPP_
 #define MSCCLPP_GDR_HPP_

From 8effd97bad8b8f577f4964d7d2f12df3c66cd5ae Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 24 Feb 2026 20:29:12 -0800
Subject: [PATCH 049/132] License

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8fa19a3cc..a8eb0cdf1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# Licensed under the MIT License.
 
 cmake_minimum_required(VERSION 3.25)
 project(mscclpp LANGUAGES CXX)

From fd7358d9fb7673a63807b94c85afa26dafd9db58 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 24 Feb 2026 20:30:37 -0800
Subject: [PATCH 050/132] License, lint

---
 cmake/FindGDRCopy.cmake | 2 +-
 src/core/connection.cc  | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmake/FindGDRCopy.cmake b/cmake/FindGDRCopy.cmake
index 016adfda2..812ead512 100644
--- a/cmake/FindGDRCopy.cmake
+++ b/cmake/FindGDRCopy.cmake
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# Licensed under the MIT License.
 
 # Find the GDRCopy libraries
 #
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 2b7801f53..c9bd5f0a2 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -293,8 +293,7 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
         default:
           break;
       }
-      THROW(CONN, Error, ErrorCode::InvalidUsage,
-            "IB host-no-atomic mode on CUDA requires GDRCopy: ", reason);
+      THROW(CONN, Error, ErrorCode::InvalidUsage, "IB host-no-atomic mode on CUDA requires GDRCopy: ", reason);
     }
 #endif
 

From 67d170674d473ba8fbfc849bf5def49eedd5fe6d Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 25 Feb 2026 19:59:19 -0800
Subject: [PATCH 051/132] optimized recv loop

---
 src/core/connection.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index c9bd5f0a2..1c528a01f 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -199,10 +199,10 @@ void IBConnection::recvThreadFunc() {
 
   uint64_t newValueHost = 0;
 
-  while (!stopRecvThread_.load(std::memory_order_relaxed)) {
-    auto qp = qp_.lock();
-    if (!qp) break;
+  auto qp = qp_.lock();
+  if (!qp) return;
 
+  while (!stopRecvThread_.load(std::memory_order_relaxed)) {
     int wcNum = qp->pollRecvCq();
     if (wcNum < 0) {
       WARN(NET, "IBConnection recvThreadFunc: pollRecvCq failed");

From 060982d25350ffd38d5ec03578f8476edccc243f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 26 Feb 2026 12:40:58 -0800
Subject: [PATCH 052/132] updates

---
 src/core/connection.cc   |  2 +-
 src/core/endpoint.cc     |  2 +-
 src/core/ib.cc           | 18 ++++++++++--------
 src/core/include/ib.hpp  |  7 ++++---
 test/mp_unit/ib_tests.cu |  9 ++++++++-
 5 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index 1c528a01f..e86722771 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -276,7 +276,7 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
   if (ibNoAtomic_) {
 #if defined(MSCCLPP_USE_CUDA)
     if (!gdrEnabled()) {
-      const char* reason = "unknown";
+      std::string reason = "unknown";
       switch (gdrStatus()) {
         case GdrStatus::NotBuilt:
           reason = "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc
index 056538856..6569a31e0 100644
--- a/src/core/endpoint.cc
+++ b/src/core/endpoint.cc
@@ -51,7 +51,7 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
 
     ibQp_ = contextImpl.getIbContext(config_.transport)
                 ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
-                           config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend);
+                           config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
     ibQpInfo_ = ibQp_->getInfo();
 
     // Allocate a 64-bit signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
diff --git a/src/core/ib.cc b/src/core/ib.cc
index 2e7b867db..baa01727e 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -131,7 +131,7 @@ const void* IbMr::getBuff() const { return buff_; }
 uint32_t IbMr::getLkey() const { return mr_->lkey; }
 
 IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
-           int maxSendWr, int maxRecvWr, int maxWrPerSend)
+           int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic)
     : portNum_(portNum),
       gidIndex_(gidIndex),
       info_(),
@@ -151,7 +151,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
       maxSendCqPollNum_(maxSendCqPollNum),
       maxSendWr_(maxSendWr),
       maxWrPerSend_(maxWrPerSend),
-      maxRecvWr_(maxRecvWr) {
+      maxRecvWr_(maxRecvWr),
+      noAtomic_(noAtomic) {
   sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0);
   if (sendCq_ == nullptr) {
     THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")");
@@ -211,7 +212,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
   qpAttr.qp_state = IBV_QPS_INIT;
   qpAttr.pkey_index = 0;
   qpAttr.port_num = portNum_;
-  qpAttr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
+  qpAttr.qp_access_flags = noAtomic_ ? IBV_ACCESS_REMOTE_WRITE
+                                     : (IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC);
   if (IBVerbs::ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) {
     THROW(NET, IbError, errno, "ibv_modify_qp failed (errno ", errno, ")");
   }
@@ -240,7 +242,7 @@ void IbQp::rtr(const IbQpInfo& info) {
   qp_attr.path_mtu = static_cast<ibv_mtu>(info.mtu);
   qp_attr.dest_qp_num = info.qpn;
   qp_attr.rq_psn = 0;
-  qp_attr.max_dest_rd_atomic = 1;
+  qp_attr.max_dest_rd_atomic = noAtomic_ ? 0 : 1;
   qp_attr.min_rnr_timer = 0x12;
   if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.isGrh) {
     qp_attr.ah_attr.is_global = 1;
@@ -272,7 +274,7 @@ void IbQp::rts() {
   qp_attr.retry_cnt = 7;
   qp_attr.rnr_retry = 7;
   qp_attr.sq_psn = 0;
-  qp_attr.max_rd_atomic = 1;
+  qp_attr.max_rd_atomic = noAtomic_ ? 0 : 1;
   int ret = IBVerbs::ibv_modify_qp(
       qp_, &qp_attr,
       IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC);
@@ -512,7 +514,7 @@ int IbCtx::getAnyUsablePort(int gidIndex) const {
 }
 
 std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-                                      int maxRecvWr, int maxWrPerSend) {
+                                      int maxRecvWr, int maxWrPerSend, bool noAtomic) {
   if (port == -1) {
     port = this->getAnyUsablePort(gidIndex);
     if (port == -1) {
@@ -521,8 +523,8 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
   } else if (!this->isPortUsable(port, gidIndex)) {
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port);
   }
-  return std::shared_ptr<IbQp>(
-      new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend));
+  return std::shared_ptr<IbQp>(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr,
+                                        maxRecvWr, maxWrPerSend, noAtomic));
 }
 
 std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp
index e9363e9cb..bfa6e3145 100644
--- a/src/core/include/ib.hpp
+++ b/src/core/include/ib.hpp
@@ -101,7 +101,7 @@ class IbQp {
   };
 
   IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-       int maxRecvWr, int maxWrPerSend);
+       int maxRecvWr, int maxWrPerSend, bool noAtomic);
   SendWrInfo getNewSendWrInfo();
   RecvWrInfo getNewRecvWrInfo();
 
@@ -128,6 +128,7 @@ class IbQp {
   const int maxSendWr_;
   const int maxWrPerSend_;
   const int maxRecvWr_;
+  const bool noAtomic_;
 
   friend class IbCtx;
 };
@@ -139,14 +140,14 @@ class IbCtx {
   ~IbCtx();
 
   std::shared_ptr<IbQp> createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-                                 int maxRecvWr, int maxWrPerSend);
+                                 int maxRecvWr, int maxWrPerSend, bool noAtomic);
   std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
   bool supportsRdmaAtomics() const;
 #else
   IbCtx([[maybe_unused]] const std::string& devName) {}
   ~IbCtx() {}
 
-  std::shared_ptr<IbQp> createQp(int, int, int, int, int, int, int) { return nullptr; }
+  std::shared_ptr<IbQp> createQp(int, int, int, int, int, int, int, bool) { return nullptr; }
   std::unique_ptr<const IbMr> registerMr([[maybe_unused]] void* buff, [[maybe_unused]] std::size_t size) {
     return nullptr;
   }
diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu
index 051030ac8..4397a04f2 100644
--- a/test/mp_unit/ib_tests.cu
+++ b/test/mp_unit/ib_tests.cu
@@ -42,7 +42,8 @@ void IbPeerToPeerTest::SetUp() {
   int ib_gid_index = std::stoi(gEnv->args["ib_gid_index"]);
 
   ibCtx = std::make_shared<mscclpp::IbCtx>(ibDevName);
-  qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, 0, 64);
+  bool noAtomic = !ibCtx->supportsRdmaAtomics();
+  qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, 0, 64, noAtomic);
 
   qpInfo[gEnv->rank] = qp->getInfo();
   bootstrap->allGather(qpInfo.data(), sizeof(mscclpp::IbQpInfo));
@@ -200,6 +201,9 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
     // This test needs only two ranks
     return;
   }
+  if (!ibCtx->supportsRdmaAtomics()) {
+    GTEST_SKIP() << "This test requires RDMA atomics support.";
+  }
 
   const uint64_t signalPeriod = 1024;
   const uint64_t maxIter = 10000;
@@ -308,6 +312,9 @@ TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) {
     // This test needs only two ranks
     return;
   }
+  if (!ibCtx->supportsRdmaAtomics()) {
+    GTEST_SKIP() << "This test requires RDMA atomics support.";
+  }
 
   mscclpp::Timer timeout(3);
 

From 8c3a4362cd76a575ca171e7a6b540648454791ed Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 26 Feb 2026 19:37:06 -0800
Subject: [PATCH 053/132] update CI

---
 .azure-pipelines/templates/ut-codecov.yaml | 136 +++++++++++++++++++++
 .azure-pipelines/templates/ut.yaml         |  72 -----------
 .azure-pipelines/ut-rocm.yml               |  23 ++++
 .azure-pipelines/ut.yml                    |  44 +++++++
 4 files changed, 203 insertions(+), 72 deletions(-)
 create mode 100644 .azure-pipelines/templates/ut-codecov.yaml

diff --git a/.azure-pipelines/templates/ut-codecov.yaml b/.azure-pipelines/templates/ut-codecov.yaml
new file mode 100644
index 000000000..21186c6b0
--- /dev/null
+++ b/.azure-pipelines/templates/ut-codecov.yaml
@@ -0,0 +1,136 @@
+parameters:
+- name: subscription
+  type: string
+- name: vmssName
+  type: string
+- name: sshKeySecureFile
+  type: string
+- name: platform
+  type: string
+  default: 'cuda'
+- name: gpuArch
+  type: string
+
+steps:
+- task: Bash@3
+  name: BuildCoverage
+  displayName: Build with coverage
+  inputs:
+    targetType: 'inline'
+    script: |
+      mkdir build && cd build
+      if [ "${{ parameters.platform }}" == "rocm" ]; then
+        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
+      else
+        cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
+      fi
+      make -j
+      cd ..
+      pwd > build/BUILD_PREFIX
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: DownloadSecureFile@1
+  name: SshKeyFile
+  displayName: Download key file
+  inputs:
+    secureFile: ${{ parameters.sshKeySecureFile }}
+
+- task: Bash@3
+  name: InstallPackages
+  displayName: Install Packages
+  inputs:
+    targetType: 'inline'
+    script: |
+      sudo apt-get update -y
+      sudo apt-get install pssh -y
+      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+
+- task: AzureCLI@2
+  name: StartVMSS
+  displayName: Start VMSS
+  inputs:
+    azureSubscription: ${{ parameters.subscription }}
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      az vmss start --name ${{ parameters.vmssName }}  --resource-group mscclpp
+
+- task: Bash@3
+  name: DeployTestEnv
+  displayName: Deploy Test Env
+  inputs:
+    targetType: filePath
+    filePath: test/deploy/deploy.sh
+    arguments: "single-node-test true ${{ parameters.platform }}"
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: Bash@3
+  name: TestsCoverageNonPerf
+  displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
+      SSH_OPTION="StrictHostKeyChecking=no"
+      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      : > azureuser@10.0.0.4
+      tail -f azureuser@10.0.0.4 &
+      CHILD_PID=$!
+      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
+        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
+        export PATH=/usr/local/mpi/bin:\$PATH;                        \
+        cd /root/mscclpp;                                             \
+        BUILD_PREFIX=\$(cat build/BUILD_PREFIX);                    \
+        STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c);      \
+        export GCOV_PREFIX=/root/mscclpp;                             \
+        export GCOV_PREFIX_STRIP=\$STRIP_COUNT;                      \
+        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
+        ./build/bin/unit_tests;                                       \
+        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests;  \
+        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests;  \
+        cd build;                                                     \
+        lcov --directory . --capture --output-file coverage.info --ignore-errors inconsistent;  \
+        lcov --extract coverage.info \"\${BUILD_PREFIX}/src/*\" \"\${BUILD_PREFIX}/include/mscclpp/*\" --output-file coverage.info;  \
+        lcov --list coverage.info"'
+      kill $CHILD_PID
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: Bash@3
+  name: FetchCoverage
+  displayName: Fetch coverage data from remote VM
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
+      SSH_OPTION="StrictHostKeyChecking=no"
+      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      HOST=$(head -1 ${HOSTFILE})
+      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
+        'sudo docker cp mscclpp-test:/root/mscclpp/build/coverage.info /tmp/coverage.info'
+      scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: Bash@3
+  name: UploadCodecov
+  displayName: Upload coverage to Codecov
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      curl -Os https://cli.codecov.io/latest/linux/codecov
+      chmod +x codecov
+      ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: AzureCLI@2
+  name: StopVMSS
+  displayName: Deallocate VMSS
+  condition: always()
+  inputs:
+    azureSubscription: ${{ parameters.subscription }}
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index 6f4206fcc..2086fd0ac 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -10,8 +10,6 @@ parameters:
   default: 'cuda'
 - name: gpuArch
   type: string
-variables:
-- group: mscclpp
 
 steps:
 - task: Bash@3
@@ -27,16 +25,6 @@ steps:
         cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
       fi
       make -j
-      cd ..
-      mkdir build_coverage && cd build_coverage
-      if [ "${{ parameters.platform }}" == "rocm" ]; then
-        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
-      else
-        cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
-      fi
-      make -j
-      cd ..
-      pwd > build_coverage/BUILD_PREFIX
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: DownloadSecureFile@1
@@ -120,66 +108,6 @@ steps:
       kill $CHILD_PID
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- task: Bash@3
-  name: TestsCoverageNonPerf
-  displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
-  inputs:
-    targetType: 'inline'
-    script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        export PATH=/usr/local/mpi/bin:\$PATH;                        \
-        cd /root/mscclpp;                                             \
-        BUILD_PREFIX=\$(cat build_coverage/BUILD_PREFIX);             \
-        STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c);      \
-        export GCOV_PREFIX=/root/mscclpp;                             \
-        export GCOV_PREFIX_STRIP=\$STRIP_COUNT;                      \
-        export LD_LIBRARY_PATH=/root/mscclpp/build_coverage/lib:\$LD_LIBRARY_PATH; \
-        ./build_coverage/bin/unit_tests;                              \
-        mpirun --allow-run-as-root -tag-output -np 2 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
-        mpirun --allow-run-as-root -tag-output -np 4 ./build_coverage/bin/mp_unit_tests --exclude-perf-tests;  \
-        cd build_coverage;                                            \
-        lcov --directory . --capture --output-file coverage.info --ignore-errors inconsistent;  \
-        lcov --extract coverage.info \"\${BUILD_PREFIX}/src/*\" \"\${BUILD_PREFIX}/include/mscclpp/*\" --output-file coverage.info;  \
-        lcov --list coverage.info"'
-      kill $CHILD_PID
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: Bash@3
-  name: FetchCoverage
-  displayName: Fetch coverage data from remote VM
-  inputs:
-    targetType: 'inline'
-    script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      HOST=$(head -1 ${HOSTFILE})
-      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
-        'sudo docker cp mscclpp-test:/root/mscclpp/build_coverage/coverage.info /tmp/coverage.info'
-      scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: Bash@3
-  name: UploadCodecov
-  displayName: Upload coverage to Codecov
-  inputs:
-    targetType: 'inline'
-    script: |
-      set -e
-      curl -Os https://cli.codecov.io/latest/linux/codecov
-      chmod +x codecov
-      ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
 - task: Bash@3
   name: PyTests
   displayName: Run pytests
diff --git a/.azure-pipelines/ut-rocm.yml b/.azure-pipelines/ut-rocm.yml
index 8b0aed1a0..0df6e8faf 100644
--- a/.azure-pipelines/ut-rocm.yml
+++ b/.azure-pipelines/ut-rocm.yml
@@ -48,3 +48,26 @@ jobs:
       sshKeySecureFile: mscclpp.pem
       platform:         rocm
       gpuArch:          gfx942
+
+- job: CodeCoverageMI300X
+  timeoutInMinutes: 40
+  pool:
+    name: msccl-ci-mi300x
+  variables:
+  - group: mscclpp
+  strategy:
+    matrix:
+      rocm6_2:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
+
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/ut-codecov.yaml
+    parameters:
+      subscription:     mscclpp-ci-mi300x
+      vmssName:         mscclpp-mi300x-ci
+      sshKeySecureFile: mscclpp.pem
+      platform:         rocm
+      gpuArch:          gfx942
diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index 4aac07e64..c1458c3ca 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -133,3 +133,47 @@ jobs:
       vmssName:         mscclpp-h100-ci
       sshKeySecureFile: mscclpp.pem
       gpuArch:          '90'
+
+- job: CodeCoverageA100
+  timeoutInMinutes: 40
+  pool:
+    name: msccl-ci
+  variables:
+  - group: mscclpp
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/ut-codecov.yaml
+    parameters:
+      subscription:     mscclpp-ci
+      vmssName:         mscclpp-ci
+      sshKeySecureFile: mscclpp.pem
+      gpuArch:          '80'
+
+- job: CodeCoverageH100
+  timeoutInMinutes: 40
+  pool:
+    name: msccl-ci-h100
+  variables:
+  - group: mscclpp
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/ut-codecov.yaml
+    parameters:
+      subscription:     mscclpp-ci-h100
+      vmssName:         mscclpp-h100-ci
+      sshKeySecureFile: mscclpp.pem
+      gpuArch:          '90'

From 3b56b08bcbb4da629f5168fc9c8a3a7af643f2c5 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 4 Mar 2026 23:36:39 +0000
Subject: [PATCH 054/132] data direct

---
 CMakeLists.txt                      |   6 ++
 cmake/FindMLX5.cmake                |  37 ++++++++++
 src/core/CMakeLists.txt             |   4 ++
 src/core/connection.cc              |  49 +++++++------
 src/core/gdr.cc                     |   4 +-
 src/core/ib.cc                      | 107 +++++++++++++++++++++-------
 src/core/include/connection.hpp     |   7 ++
 src/core/include/ib.hpp             |  13 +++-
 src/core/include/mlx5dv_wrapper.hpp |  38 ++++++++++
 src/core/mlx5dv_wrapper.cc          | 103 ++++++++++++++++++++++++++
 10 files changed, 320 insertions(+), 48 deletions(-)
 create mode 100644 cmake/FindMLX5.cmake
 create mode 100644 src/core/include/mlx5dv_wrapper.hpp
 create mode 100644 src/core/mlx5dv_wrapper.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8eb0cdf1..bed7b92e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -167,6 +167,12 @@ if(MSCCLPP_USE_IB)
     if(NOT IBVERBS_FOUND)
         message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
     endif()
+    find_package(MLX5)
+    if(MLX5_FOUND)
+        message(STATUS "MLX5 Direct Verbs found: ${MLX5_LIBRARIES}")
+    else()
+        message(STATUS "MLX5 Direct Verbs not found, mlx5dv optimizations disabled")
+    endif()
 endif()
 find_package(NUMA REQUIRED)
 find_package(Threads REQUIRED)
diff --git a/cmake/FindMLX5.cmake b/cmake/FindMLX5.cmake
new file mode 100644
index 000000000..592984501
--- /dev/null
+++ b/cmake/FindMLX5.cmake
@@ -0,0 +1,37 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# Find the MLX5 Direct Verbs (mlx5dv) library
+#
+# The following variables are optionally searched for defaults
+#  MLX5_ROOT_DIR: Base directory where all MLX5 components are found
+#  MLX5_INCLUDE_DIR: Directory where MLX5 headers are found
+#  MLX5_LIB_DIR: Directory where MLX5 libraries are found
+
+# The following are set after configuration is done:
+#  MLX5_FOUND
+#  MLX5_INCLUDE_DIRS
+#  MLX5_LIBRARIES
+
+find_path(MLX5_INCLUDE_DIRS
+  NAMES infiniband/mlx5dv.h
+  HINTS
+  ${MLX5_INCLUDE_DIR}
+  ${MLX5_ROOT_DIR}
+  ${MLX5_ROOT_DIR}/include
+  /usr/local/include
+  /usr/include)
+
+find_library(MLX5_LIBRARIES
+  NAMES mlx5
+  HINTS
+  ${MLX5_LIB_DIR}
+  ${MLX5_ROOT_DIR}
+  ${MLX5_ROOT_DIR}/lib
+  /usr/local/lib
+  /usr/lib
+  /usr/lib/x86_64-linux-gnu)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
+mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 3eb6466a7..9ca5fed3f 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -28,6 +28,10 @@ if(MSCCLPP_USE_IB)
     target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
     target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
     target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
+    if(MLX5_FOUND)
+        target_include_directories(mscclpp_obj SYSTEM PRIVATE ${MLX5_INCLUDE_DIRS})
+        target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MLX5DV)
+    endif()
 endif()
 
 if(MSCCLPP_USE_GDRCOPY)
diff --git a/src/core/connection.cc b/src/core/connection.cc
index e86722771..c821eb59d 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -219,25 +219,23 @@ void IBConnection::recvThreadFunc() {
         continue;
       }
 
-      // Read the token value written by the remote sender.
-#if defined(DEBUG_CUFLUSH) && defined(MSCCLPP_USE_CUDA)
-      // cuFlush path: read from imm_data then flush NIC->GPU write pipeline for visibility.
-      newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
-      MSCCLPP_CUTHROW(cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
-                                                 CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER));
-#else
-      // Read the 64-bit token from the local signal GPU buffer via volatile load.
-      // localSignalGpuPtr_ points to either a GDRCopy BAR1 mapping (CUDA) or the
-      // GPU buffer directly (ROCm system-coherent/uncached memory). volatile is not
-      // strictly needed here (uncacheable memory and intervening function calls prevent
-      // stale reads), but is kept as a convention for NIC-written memory.
-      newValueHost = *static_cast<volatile uint64_t*>(localSignalGpuPtr_);
-#endif
+      // Read the token value from the incoming write-with-imm completion.
+      if (dataDirectEnabled_) {
+        // Data Direct path: the signal GPU buffer MR was registered with
+        // MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT, and the semaphore token is also written
+        // through Data Direct (via GDRCopy). Both writes go through the same path, so
+        // all data is visible in GPU memory when the CQE is polled. Read from imm_data.
+        newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
+      } else {
+        // Slow path: read the 64-bit token from the local signal GPU buffer via volatile load.
+        // localSignalGpuPtr_ points to either a GDRCopy BAR1 mapping (CUDA) or the
+        // GPU buffer directly (ROCm system-coherent/uncached memory).
+        newValueHost = *static_cast<volatile uint64_t*>(localSignalGpuPtr_);
+      }
 
-      // Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr)
-      uint64_t dstGpuAddr = remoteUpdateDstAddr_;
-      if (dstGpuAddr != 0) {
-        uint64_t* dstPtr = reinterpret_cast<uint64_t*>(dstGpuAddr);
+      // Read token address from the local stored address (set by setRemoteUpdateDstAddr)
+      if (remoteUpdateDstAddr_ != 0) {
+        uint64_t* dstPtr = reinterpret_cast<uint64_t*>(remoteUpdateDstAddr_);
 
         if (remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid()) {
           // Direct host-side write to GPU memory via GDRCopy BAR1 mapping
@@ -265,7 +263,8 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
       localGpuDeviceId_(localEndpoint.device().id),
       remoteUpdateDstAddr_(0),
       remoteSignalGpuMrInfo_{0, 0},
-      localSignalGpuPtr_(nullptr) {
+      localSignalGpuPtr_(nullptr),
+      dataDirectEnabled_(false) {
   qp_ = getImpl(localEndpoint).ibQp_;
   qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
   qp_.lock()->rts();
@@ -317,8 +316,18 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
       localSignalGpuPtr_ = reinterpret_cast<uint64_t*>(localImpl.ibSignalGpuBuffer_.get());
     }
 
-    // Pre-post receive requests for incoming write-with-imm
+    // When the QP is mlx5 and the signal GPU buffer MR is a Data Direct DMABUF
+    // (registered via mlx5dv_reg_dmabuf_mr with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT),
+    // and the semaphore token write also goes through Data Direct (via GDRCopy to a
+    // Data Direct DMABUF MR), all writes are visible in GPU memory when the CQE is
+    // polled. This allows reading the token from imm_data instead of the signal GPU buffer.
     auto qp = qp_.lock();
+    dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect();
+    if (dataDirectEnabled_) {
+      INFO(CONN, "IBConnection: Data Direct enabled (mlx5 + DMABUF)");
+    }
+
+    // Pre-post receive requests for incoming write-with-imm
     int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
     for (int i = 0; i < maxRecvWr; ++i) {
       qp->stageRecv(/*wrId=*/0);
diff --git a/src/core/gdr.cc b/src/core/gdr.cc
index 904e54133..004c160ae 100644
--- a/src/core/gdr.cc
+++ b/src/core/gdr.cc
@@ -90,9 +90,9 @@ GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId)
   unsigned long pageOffset = gpuAddr - alignedAddr;
   mappedSize_ = GPU_PAGE_SIZE;
 
-  int ret = gdr_pin_buffer(ctx_->handle(), alignedAddr, mappedSize_, 0, 0, &mh_);
+  int ret = gdr_pin_buffer_v2(ctx_->handle(), alignedAddr, mappedSize_, GDR_PIN_FLAG_FORCE_PCIE, &mh_);
   if (ret != 0) {
-    THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer failed (ret=", ret, ") for addr ", (void*)gpuAddr,
+    THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
           ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
   }
 
diff --git a/src/core/ib.cc b/src/core/ib.cc
index c7a481a97..c82b147a8 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -21,6 +21,9 @@
 #include "context.hpp"
 #if defined(USE_IBVERBS)
 #include "ibverbs_wrapper.hpp"
+#if defined(MSCCLPP_USE_MLX5DV)
+#include "mlx5dv_wrapper.hpp"
+#endif  // defined(MSCCLPP_USE_MLX5DV)
 #endif  // defined(USE_IBVERBS)
 #include "logger.hpp"
 
@@ -64,7 +67,8 @@ static inline bool isDmabufSupportedByGpu(int gpuId) {
   return ret;
 }
 
-IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff), size_(0) {
+IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
+    : mr_(nullptr), buff_(buff), size_(0), isDmabuf_(false), isDataDirect_(false) {
   if (size == 0) {
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0");
   }
@@ -84,13 +88,24 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
     MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
 
     size_t offsetInDmaBuf = buffIntPtr % pageSize;
-    mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd,
-                                     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
-                                         IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC);
+    int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
+                      IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
+#if defined(MSCCLPP_USE_MLX5DV)
+    if (isMlx5 && MLX5DV::isAvailable()) {
+      mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+      if (mr_ != nullptr) {
+        isDataDirect_ = true;
+      }
+    }
+#endif
+    if (mr_ == nullptr) {
+      mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+    }
     ::close(fd);
     if (mr_ == nullptr) {
       THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
     }
+    isDmabuf_ = true;
 #else   // defined(MSCCLPP_USE_ROCM)
     THROW(NET, Error, ErrorCode::InvalidUsage, "We don't support DMABUF on HIP platforms yet");
 #endif  // defined(MSCCLPP_USE_ROCM)
@@ -130,8 +145,12 @@ const void* IbMr::getBuff() const { return buff_; }
 
 uint32_t IbMr::getLkey() const { return mr_->lkey; }
 
+bool IbMr::isDmabuf() const { return isDmabuf_; }
+
+bool IbMr::isDataDirect() const { return isDataDirect_; }
+
 IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
-           int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic)
+           int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic, bool isMlx5)
     : portNum_(portNum),
       gidIndex_(gidIndex),
       info_(),
@@ -152,7 +171,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
       maxSendWr_(maxSendWr),
       maxWrPerSend_(maxWrPerSend),
       maxRecvWr_(maxRecvWr),
-      noAtomic_(noAtomic) {
+      noAtomic_(noAtomic),
+      isMlx5_(isMlx5) {
   sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0);
   if (sendCq_ == nullptr) {
     THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")");
@@ -166,21 +186,47 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
     }
   }
 
-  struct ibv_qp_init_attr qpInitAttr = {};
-  qpInitAttr.sq_sig_all = 0;
-  qpInitAttr.send_cq = sendCq_;
-  // Use separate recv CQ if created, otherwise use the send CQ
-  qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
-  qpInitAttr.qp_type = IBV_QPT_RC;
-  qpInitAttr.cap.max_send_wr = maxSendWr;
-  qpInitAttr.cap.max_recv_wr = maxRecvWr;
-  qpInitAttr.cap.max_send_sge = 1;
-  qpInitAttr.cap.max_recv_sge = 1;
-  qpInitAttr.cap.max_inline_data = 0;
-
-  struct ibv_qp* qp = IBVerbs::ibv_create_qp(pd, &qpInitAttr);
-  if (qp == nullptr) {
-    THROW(NET, IbError, errno, "ibv_create_qp failed (errno ", errno, ")");
+  struct ibv_qp* qp = nullptr;
+#if defined(MSCCLPP_USE_MLX5DV)
+  if (isMlx5_) {
+    struct ibv_qp_init_attr_ex qpInitAttrEx = {};
+    qpInitAttrEx.sq_sig_all = 0;
+    qpInitAttrEx.send_cq = sendCq_;
+    qpInitAttrEx.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
+    qpInitAttrEx.qp_type = IBV_QPT_RC;
+    qpInitAttrEx.cap.max_send_wr = maxSendWr;
+    qpInitAttrEx.cap.max_recv_wr = maxRecvWr;
+    qpInitAttrEx.cap.max_send_sge = 1;
+    qpInitAttrEx.cap.max_recv_sge = 1;
+    qpInitAttrEx.cap.max_inline_data = 0;
+    qpInitAttrEx.pd = pd;
+    qpInitAttrEx.comp_mask = IBV_QP_INIT_ATTR_PD;
+
+    struct mlx5dv_qp_init_attr mlx5QpAttr = {};
+
+    qp = MLX5DV::mlx5dv_create_qp(ctx, &qpInitAttrEx, &mlx5QpAttr);
+    if (qp == nullptr) {
+      THROW(NET, IbError, errno, "mlx5dv_create_qp failed (errno ", errno, ")");
+    }
+  } else
+#endif  // defined(MSCCLPP_USE_MLX5DV)
+  {
+    struct ibv_qp_init_attr qpInitAttr = {};
+    qpInitAttr.sq_sig_all = 0;
+    qpInitAttr.send_cq = sendCq_;
+    // Use separate recv CQ if created, otherwise use the send CQ
+    qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
+    qpInitAttr.qp_type = IBV_QPT_RC;
+    qpInitAttr.cap.max_send_wr = maxSendWr;
+    qpInitAttr.cap.max_recv_wr = maxRecvWr;
+    qpInitAttr.cap.max_send_sge = 1;
+    qpInitAttr.cap.max_recv_sge = 1;
+    qpInitAttr.cap.max_inline_data = 0;
+
+    qp = IBVerbs::ibv_create_qp(pd, &qpInitAttr);
+    if (qp == nullptr) {
+      THROW(NET, IbError, errno, "ibv_create_qp failed (errno ", errno, ")");
+    }
   }
 
   struct ibv_port_attr portAttr;
@@ -436,12 +482,21 @@ std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_
 
 unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); }
 
-IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false) {
+IbCtx::IbCtx(const std::string& devName)
+    : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false), isMlx5_(false) {
   int num;
   struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
   for (int i = 0; i < num; ++i) {
     if (std::string(devices[i]->name) == devName_) {
       ctx_ = IBVerbs::ibv_open_device(devices[i]);
+#if defined(MSCCLPP_USE_MLX5DV)
+      if (MLX5DV::isAvailable()) {
+        isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]);
+        if (isMlx5_) {
+          INFO(NET, "IB device ", devName_, " supports mlx5 Direct Verbs");
+        }
+      }
+#endif  // defined(MSCCLPP_USE_MLX5DV)
       break;
     }
   }
@@ -524,15 +579,17 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port);
   }
   return std::shared_ptr<IbQp>(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr,
-                                        maxRecvWr, maxWrPerSend, noAtomic));
+                                        maxRecvWr, maxWrPerSend, noAtomic, isMlx5_));
 }
 
 std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
-  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size));
+  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, isMlx5_));
 }
 
 bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; }
 
+bool IbCtx::isMlx5() const { return isMlx5_; }
+
 MSCCLPP_API_CPP int getIBDeviceCount() {
   int num;
   IBVerbs::ibv_get_device_list(&num);
@@ -642,6 +699,8 @@ IbMr::~IbMr() {}
 IbMrInfo IbMr::getInfo() const { return IbMrInfo(); }
 const void* IbMr::getBuff() const { return nullptr; }
 uint32_t IbMr::getLkey() const { return 0; }
+bool IbMr::isDmabuf() const { return false; }
+bool IbMr::isDataDirect() const { return false; }
 
 IbQp::~IbQp() {}
 void IbQp::rtr(const IbQpInfo& /*info*/) {}
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 2442f48ea..ecba5ed5b 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -124,6 +124,13 @@ class IBConnection : public BaseConnection {
   std::unique_ptr<GdrMap> localSignalGpuMap_;
   uint64_t* localSignalGpuPtr_;
 
+  // When true, recvThreadFunc reads the token from imm_data (from CQE) instead of the
+  // signal GPU buffer via GDRCopy. Enabled when the QP is mlx5 and the signal GPU buffer
+  // MR is a Data Direct DMABUF. Memory consistency is guaranteed because both the RDMA
+  // data write and the semaphore token write (via GDRCopy) go through the Data Direct path,
+  // so all writes are visible in GPU memory when the CQE is polled.
+  bool dataDirectEnabled_;
+
   void recvThreadFunc();
 
  public:
diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp
index bfa6e3145..9e5a454cb 100644
--- a/src/core/include/ib.hpp
+++ b/src/core/include/ib.hpp
@@ -34,13 +34,17 @@ class IbMr {
   IbMrInfo getInfo() const;
   const void* getBuff() const;
   uint32_t getLkey() const;
+  bool isDmabuf() const;
+  bool isDataDirect() const;
 
  private:
-  IbMr(ibv_pd* pd, void* buff, std::size_t size);
+  IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5);
 
   ibv_mr* mr_;
   void* buff_;
   std::size_t size_;
+  bool isDmabuf_;
+  bool isDataDirect_;
 
   friend class IbCtx;
 };
@@ -88,6 +92,7 @@ class IbQp {
   int getRecvWcStatus(int idx) const;
   std::string getRecvWcStatusString(int idx) const;
   unsigned int getRecvWcImmData(int idx) const;
+  bool isMlx5() const { return isMlx5_; }
 
  private:
   struct SendWrInfo {
@@ -101,7 +106,7 @@ class IbQp {
   };
 
   IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-       int maxRecvWr, int maxWrPerSend, bool noAtomic);
+       int maxRecvWr, int maxWrPerSend, bool noAtomic, bool isMlx5);
   SendWrInfo getNewSendWrInfo();
   RecvWrInfo getNewRecvWrInfo();
 
@@ -129,6 +134,7 @@ class IbQp {
   const int maxWrPerSend_;
   const int maxRecvWr_;
   const bool noAtomic_;
+  const bool isMlx5_;
 
   friend class IbCtx;
 };
@@ -143,6 +149,7 @@ class IbCtx {
                                  int maxRecvWr, int maxWrPerSend, bool noAtomic);
   std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
   bool supportsRdmaAtomics() const;
+  bool isMlx5() const;
 #else
   IbCtx([[maybe_unused]] const std::string& devName) {}
   ~IbCtx() {}
@@ -152,6 +159,7 @@ class IbCtx {
     return nullptr;
   }
   bool supportsRdmaAtomics() const { return false; }
+  bool isMlx5() const { return false; }
 #endif
 
   const std::string& getDevName() const { return devName_; };
@@ -164,6 +172,7 @@ class IbCtx {
   ibv_context* ctx_;
   ibv_pd* pd_;
   bool supportsRdmaAtomics_;
+  bool isMlx5_;
 };
 
 }  // namespace mscclpp
diff --git a/src/core/include/mlx5dv_wrapper.hpp b/src/core/include/mlx5dv_wrapper.hpp
new file mode 100644
index 000000000..654b086c9
--- /dev/null
+++ b/src/core/include/mlx5dv_wrapper.hpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#ifndef MSCCLPP_MLX5DV_WRAPPER_HPP_
+#define MSCCLPP_MLX5DV_WRAPPER_HPP_
+
+#if defined(MSCCLPP_USE_MLX5DV)
+
+#include <infiniband/mlx5dv.h>
+
+#include <string>
+
+namespace mscclpp {
+
+struct MLX5DV {
+  /// Whether libmlx5.so was successfully loaded at runtime.
+  static bool isAvailable();
+
+  /// Check if the given IB device supports mlx5 Direct Verbs.
+  static bool mlx5dv_is_supported(struct ibv_device* device);
+
+  /// Create a QP using mlx5dv extensions.
+  static struct ibv_qp* mlx5dv_create_qp(struct ibv_context* ctx, struct ibv_qp_init_attr_ex* qpAttr,
+                                          struct mlx5dv_qp_init_attr* mlx5QpAttr);
+
+  /// Register a DMABUF memory region using mlx5dv extensions.
+  /// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version.
+  static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
+                                              int access);
+
+ private:
+  static void* dlsym(const std::string& symbol, bool allowReturnNull = false);
+};
+
+}  // namespace mscclpp
+
+#endif  // defined(MSCCLPP_USE_MLX5DV)
+#endif  // MSCCLPP_MLX5DV_WRAPPER_HPP_
diff --git a/src/core/mlx5dv_wrapper.cc b/src/core/mlx5dv_wrapper.cc
new file mode 100644
index 000000000..b1c398ee7
--- /dev/null
+++ b/src/core/mlx5dv_wrapper.cc
@@ -0,0 +1,103 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#if defined(MSCCLPP_USE_MLX5DV)
+
+#include "mlx5dv_wrapper.hpp"
+
+#include <dlfcn.h>
+
+#include <memory>
+
+#include "logger.hpp"
+
+namespace mscclpp {
+
+static std::unique_ptr<void, int (*)(void*)> globalMLX5Handle(nullptr, &::dlclose);
+
+void* MLX5DV::dlsym(const std::string& symbol, bool allowReturnNull) {
+  if (!globalMLX5Handle) {
+    const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr};
+    for (int i = 0; possibleLibNames[i] != nullptr; i++) {
+      void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW);
+      if (handle) {
+        globalMLX5Handle.reset(handle);
+        break;
+      }
+    }
+    if (!globalMLX5Handle) {
+      if (allowReturnNull) return nullptr;
+      THROW(NET, SysError, errno, "Failed to open libmlx5: ", std::string(::dlerror()));
+    }
+  }
+  void* ptr = ::dlsym(globalMLX5Handle.get(), symbol.c_str());
+  if (!ptr && !allowReturnNull) {
+    THROW(NET, SysError, errno, "Failed to load libmlx5 symbol: ", symbol);
+  }
+  return ptr;
+}
+
+bool MLX5DV::isAvailable() {
+  static int available = -1;
+  if (available == -1) {
+    // Try to load the library; if it fails, mlx5dv is not available
+    const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr};
+    for (int i = 0; possibleLibNames[i] != nullptr; i++) {
+      void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW);
+      if (handle) {
+        if (!globalMLX5Handle) {
+          globalMLX5Handle.reset(handle);
+        } else {
+          ::dlclose(handle);
+        }
+        available = 1;
+        INFO(NET, "libmlx5 loaded successfully");
+        return true;
+      }
+    }
+    available = 0;
+    DEBUG(NET, "libmlx5 not available");
+  }
+  return available == 1;
+}
+
+bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) {
+  using FuncType = bool (*)(struct ibv_device*);
+  static FuncType impl = nullptr;
+  if (!impl) {
+    void* ptr = MLX5DV::dlsym("mlx5dv_is_supported", /*allowReturnNull=*/true);
+    if (!ptr) return false;
+    impl = reinterpret_cast<FuncType>(ptr);
+  }
+  return impl(device);
+}
+
+struct ibv_qp* MLX5DV::mlx5dv_create_qp(struct ibv_context* ctx, struct ibv_qp_init_attr_ex* qpAttr,
+                                        struct mlx5dv_qp_init_attr* mlx5QpAttr) {
+  using FuncType = struct ibv_qp* (*)(struct ibv_context*, struct ibv_qp_init_attr_ex*, struct mlx5dv_qp_init_attr*);
+  static FuncType impl = nullptr;
+  if (!impl) impl = reinterpret_cast<FuncType>(MLX5DV::dlsym("mlx5dv_create_qp"));
+  return impl(ctx, qpAttr, mlx5QpAttr);
+}
+
+struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
+                                            int access) {
+  // mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags.
+  using FuncType = struct ibv_mr* (*)(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int, int);
+  static FuncType impl = nullptr;
+  static bool resolved = false;
+  if (!resolved) {
+    void* ptr = MLX5DV::dlsym("mlx5dv_reg_dmabuf_mr", /*allowReturnNull=*/true);
+    impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
+    resolved = true;
+  }
+  if (!impl) return nullptr;
+#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
+#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
+#endif
+  return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
+}
+
+}  // namespace mscclpp
+
+#endif  // defined(MSCCLPP_USE_MLX5DV)

From 448ceb66f61645d393ae1841081c46d8e8e65ca9 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 5 Mar 2026 22:59:33 +0000
Subject: [PATCH 055/132] updates

---
 cmake/FindGDRCopy.cmake         | 13 ++++++++++++-
 src/core/connection.cc          | 21 ++++++++++++++-------
 src/core/gdr.cc                 |  7 ++++++-
 src/core/include/connection.hpp |  8 ++++----
 4 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/cmake/FindGDRCopy.cmake b/cmake/FindGDRCopy.cmake
index 812ead512..e62f32f2b 100644
--- a/cmake/FindGDRCopy.cmake
+++ b/cmake/FindGDRCopy.cmake
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-# Find the GDRCopy libraries
+# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE)
 #
 # The following variables are optionally searched for defaults
 #  GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
@@ -32,6 +32,17 @@ find_library(GDRCOPY_LIBRARIES
   /usr/lib
   /usr/lib/x86_64-linux-gnu)
 
+if(GDRCOPY_INCLUDE_DIRS)
+    include(CheckSymbolExists)
+    set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
+    check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
+    unset(CMAKE_REQUIRED_INCLUDES)
+    if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
+        message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
+        set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND)
+    endif()
+endif()
+
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
 mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
diff --git a/src/core/connection.cc b/src/core/connection.cc
index c821eb59d..097a48367 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -316,15 +316,17 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
       localSignalGpuPtr_ = reinterpret_cast<uint64_t*>(localImpl.ibSignalGpuBuffer_.get());
     }
 
-    // When the QP is mlx5 and the signal GPU buffer MR is a Data Direct DMABUF
-    // (registered via mlx5dv_reg_dmabuf_mr with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT),
-    // and the semaphore token write also goes through Data Direct (via GDRCopy to a
-    // Data Direct DMABUF MR), all writes are visible in GPU memory when the CQE is
-    // polled. This allows reading the token from imm_data instead of the signal GPU buffer.
+    // Data Direct requires all three conditions:
+    // 1. Signal GPU buffer MR registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
+    // 2. Local signal GPU GDRCopy mapping pinned with GDR_PIN_FLAG_FORCE_PCIE
+    // 3. (remoteUpdateDstAddr GDRCopy mapping checked at setRemoteUpdateDstAddr time)
+    // When all conditions are met, RDMA data writes and GDRCopy token writes both go
+    // through the Data Direct engine, guaranteeing GPU memory visibility at CQE poll time.
     auto qp = qp_.lock();
-    dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect();
+    dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
+                          localSignalGpuMap_ && localSignalGpuMap_->valid();
     if (dataDirectEnabled_) {
-      INFO(CONN, "IBConnection: Data Direct enabled (mlx5 + DMABUF)");
+      INFO(CONN, "IBConnection: Data Direct enabled");
     }
 
     // Pre-post receive requests for incoming write-with-imm
@@ -361,6 +363,11 @@ void IBConnection::setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) {
   if (gdrEnabled()) {
     if (gpuMem) {
       remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(gpuMem), localGpuDeviceId_);
+      // Data Direct requires the token write mapping to also use FORCE_PCIE
+      if (dataDirectEnabled_ && !(remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid())) {
+        dataDirectEnabled_ = false;
+        INFO(CONN, "IBConnection: Data Direct disabled (remoteUpdateDstAddr GDRCopy mapping not available)");
+      }
     } else {
       remoteUpdateDstAddrMap_.reset();
     }
diff --git a/src/core/gdr.cc b/src/core/gdr.cc
index 004c160ae..341002ed6 100644
--- a/src/core/gdr.cc
+++ b/src/core/gdr.cc
@@ -80,7 +80,12 @@ GdrContext::~GdrContext() {
 // GdrMap
 
 GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId)
-    : ctx_(gdrContext()), gpuMem_(std::move(gpuMem)), mh_{}, barPtr_(nullptr), hostDstPtr_(nullptr), mappedSize_(0) {
+    : ctx_(gdrContext()),
+      gpuMem_(std::move(gpuMem)),
+      mh_{},
+      barPtr_(nullptr),
+      hostDstPtr_(nullptr),
+      mappedSize_(0) {
   // Ensure CUDA device context is active for gdr_pin_buffer
   CudaDeviceGuard deviceGuard(deviceId);
 
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index ecba5ed5b..b141bbb82 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -125,10 +125,10 @@ class IBConnection : public BaseConnection {
   uint64_t* localSignalGpuPtr_;
 
   // When true, recvThreadFunc reads the token from imm_data (from CQE) instead of the
-  // signal GPU buffer via GDRCopy. Enabled when the QP is mlx5 and the signal GPU buffer
-  // MR is a Data Direct DMABUF. Memory consistency is guaranteed because both the RDMA
-  // data write and the semaphore token write (via GDRCopy) go through the Data Direct path,
-  // so all writes are visible in GPU memory when the CQE is polled.
+  // signal GPU buffer via GDRCopy. Enabled only when all Data Direct conditions are met:
+  // the signal GPU buffer MR is registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT,
+  // and all GDRCopy mappings (local signal buffer and remoteUpdateDstAddr) are valid,
+  // so both RDMA data writes and GDRCopy token writes go through the Data Direct engine.
   bool dataDirectEnabled_;
 
   void recvThreadFunc();

From 7ce841bed0fb8aaff2fd9a7dcfbec0be15e9a872 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 5 Mar 2026 23:28:39 +0000
Subject: [PATCH 056/132] Updates

---
 src/core/connection.cc          | 18 +++++++++---------
 src/core/include/connection.hpp | 20 ++++++++++----------
 src/core/semaphore.cc           | 18 +++++++++---------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index 097a48367..7ce9b37dd 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -233,7 +233,7 @@ void IBConnection::recvThreadFunc() {
         newValueHost = *static_cast<volatile uint64_t*>(localSignalGpuPtr_);
       }
 
-      // Read token address from the local stored address (set by setRemoteUpdateDstAddr)
+      // Read token address from the local stored address (set by setSignalForwardingDst)
       if (remoteUpdateDstAddr_ != 0) {
         uint64_t* dstPtr = reinterpret_cast<uint64_t*>(remoteUpdateDstAddr_);
 
@@ -319,7 +319,7 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
     // Data Direct requires all three conditions:
     // 1. Signal GPU buffer MR registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
     // 2. Local signal GPU GDRCopy mapping pinned with GDR_PIN_FLAG_FORCE_PCIE
-    // 3. (remoteUpdateDstAddr GDRCopy mapping checked at setRemoteUpdateDstAddr time)
+    // 3. (signal forwarding dst GDRCopy mapping checked at setSignalForwardingDst time)
     // When all conditions are met, RDMA data writes and GDRCopy token writes both go
     // through the Data Direct engine, guaranteeing GPU memory visibility at CQE poll time.
     auto qp = qp_.lock();
@@ -356,23 +356,23 @@ Transport IBConnection::transport() const { return transport_; }
 
 Transport IBConnection::remoteTransport() const { return remoteTransport_; }
 
-bool IBConnection::usesRecvThread() const { return ibNoAtomic_; }
+bool IBConnection::usesSignalForwarding() const { return ibNoAtomic_; }
 
-void IBConnection::setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) {
-  remoteUpdateDstAddr_ = reinterpret_cast<uint64_t>(gpuMem.get());
+void IBConnection::setSignalForwardingDst(std::shared_ptr<uint64_t> mem) {
+  remoteUpdateDstAddr_ = reinterpret_cast<uint64_t>(mem.get());
   if (gdrEnabled()) {
-    if (gpuMem) {
-      remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(gpuMem), localGpuDeviceId_);
+    if (mem) {
+      remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
       // Data Direct requires the token write mapping to also use FORCE_PCIE
       if (dataDirectEnabled_ && !(remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid())) {
         dataDirectEnabled_ = false;
-        INFO(CONN, "IBConnection: Data Direct disabled (remoteUpdateDstAddr GDRCopy mapping not available)");
+        INFO(CONN, "IBConnection: Data Direct disabled (signal forwarding dst GDRCopy mapping not available)");
       }
     } else {
       remoteUpdateDstAddrMap_.reset();
     }
   }
-  INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)remoteUpdateDstAddr_);
+  INFO(CONN, "IBConnection setSignalForwardingDst: ", (void*)remoteUpdateDstAddr_);
 }
 
 void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index b141bbb82..f2ed2c8b8 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -37,16 +37,16 @@ class BaseConnection {
 
   virtual void flush(int64_t timeoutUsec = -1) = 0;
 
-  /// Set the local address where remote updateAndSync operations should write.
-  /// This is called by the receiver to specify where incoming signals should be written.
+  /// Set the local address where forwarded signals should be written.
+  /// This is called by the receiver to specify where incoming signals should be forwarded.
   /// Default implementation is a no-op for connections that don't need it.
-  /// @param gpuMem Shared pointer to the GPU/CPU memory for incoming writes (nullptr to clear).
-  virtual void setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> /*gpuMem*/) {}
+  /// @param mem Shared pointer to the memory for incoming writes (nullptr to clear).
+  virtual void setSignalForwardingDst(std::shared_ptr<uint64_t> /*mem*/) {}
 
-  /// Whether this connection uses a recv thread for signaling (host-no-atomic mode).
+  /// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode).
   /// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to.
   /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
-  virtual bool usesRecvThread() const { return false; }
+  virtual bool usesSignalForwarding() const { return false; }
 
   virtual Transport transport() const = 0;
 
@@ -137,12 +137,12 @@ class IBConnection : public BaseConnection {
   IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint);
   ~IBConnection();
 
-  /// Set the local address where remote updateAndSync operations will write.
+  /// Set the local address where forwarded signals should be written.
   /// Must be called before the remote sends any updateAndSync in host-no-atomic mode.
-  /// @param gpuMem Shared pointer to the GPU/CPU memory for incoming writes (nullptr to clear).
-  void setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) override;
+  /// @param mem Shared pointer to the memory for incoming writes (nullptr to clear).
+  void setSignalForwardingDst(std::shared_ptr<uint64_t> mem) override;
 
-  bool usesRecvThread() const override;
+  bool usesSignalForwarding() const override;
 
   Transport transport() const override;
 
diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc
index e2dadb19e..c6299dec8 100644
--- a/src/core/semaphore.cc
+++ b/src/core/semaphore.cc
@@ -123,8 +123,8 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
     THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
   }
   auto connImpl = BaseConnection::getImpl(connection());
-  if (connImpl->usesRecvThread()) {
-    // Host-no-atomic mode: the recv thread writes the token to GPU memory.
+  if (connImpl->usesSignalForwarding()) {
+    // Signal forwarding mode: the recv thread writes the token to GPU memory.
     // Allocate a separate inbound token via plain cudaMalloc (not TokenPool/VMM)
     // so that it is always compatible with GDRCopy pinning (VMM memory cannot be pinned by gdr_pin_buffer).
     CudaDeviceGuard deviceGuard(connection().localDevice().id);
@@ -133,9 +133,9 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
 #else
     inboundToken_ = detail::gpuCallocShared<uint64_t>();
 #endif
-    connImpl->setRemoteUpdateDstAddr(inboundToken_);
+    connImpl->setSignalForwardingDst(inboundToken_);
   }
-  // When usesRecvThread() is false (e.g., atomic mode), inboundToken_ stays null
+  // When usesSignalForwarding() is false (e.g., atomic mode), inboundToken_ stays null
   // and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
 }
 
@@ -144,9 +144,9 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communi
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
   if (inboundToken_) {
-    // Clear the connection's remote update address (and any associated GdrMap)
+    // Clear the connection's signal forwarding destination (and any associated GdrMap)
     // before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
-    BaseConnection::getImpl(connection())->setRemoteUpdateDstAddr(nullptr);
+    BaseConnection::getImpl(connection())->setSignalForwardingDst(nullptr);
   }
 }
 
@@ -178,12 +178,12 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
     THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
   }
   auto connImpl = BaseConnection::getImpl(connection());
-  if (connImpl->usesRecvThread()) {
-    // Host-no-atomic mode: tell the recv thread where to write the incoming token.
+  if (connImpl->usesSignalForwarding()) {
+    // Signal forwarding mode: tell the recv thread where to write the incoming token.
     // Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
     auto token =
         std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), [](uint64_t*) {});
-    connImpl->setRemoteUpdateDstAddr(std::move(token));
+    connImpl->setSignalForwardingDst(std::move(token));
   }
 }
 

From bbb9c10a1e6da014c67344feabaa770844d17c5a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 6 Mar 2026 19:15:04 +0000
Subject: [PATCH 057/132] Update Docker image

---
 .azure-pipelines/ut.yml               | 10 ++++-----
 .github/workflows/codeql-analysis.yml |  2 +-
 .github/workflows/mscclpp-lang.yml    |  2 +-
 docker/base-dev-x.dockerfile          | 30 +++++++++++++++++++++++++--
 docker/build.sh                       | 17 +++++++--------
 docs/quickstart.md                    |  2 +-
 6 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index c1458c3ca..d888946ba 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -37,7 +37,7 @@ jobs:
       cuda11:
         containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
 
   container:
     image: $(containerImage)
@@ -59,7 +59,7 @@ jobs:
       cuda11:
         containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
 
   container:
     image: $(containerImage)
@@ -79,7 +79,7 @@ jobs:
   strategy:
     matrix:
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
 
   container:
     image: $(containerImage)
@@ -99,7 +99,7 @@ jobs:
   strategy:
     matrix:
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
 
   container:
     image: $(containerImage)
@@ -121,7 +121,7 @@ jobs:
   strategy:
     matrix:
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
 
   container:
     image: $(containerImage)
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 575c472b4..fb0651415 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -40,7 +40,7 @@ jobs:
       fail-fast: false
       matrix:
         language: [ 'cpp', 'python' ]
-        version: [ 'cuda11.8', 'cuda12.8' ]
+        version: [ 'cuda11.8', 'cuda12.9' ]
 
     steps:
     - name: Checkout repository
diff --git a/.github/workflows/mscclpp-lang.yml b/.github/workflows/mscclpp-lang.yml
index 5947b087d..a9187e968 100644
--- a/.github/workflows/mscclpp-lang.yml
+++ b/.github/workflows/mscclpp-lang.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
         fail-fast: false
         matrix:
-          version: [ 'cuda11.8', 'cuda12.8' ]
+          version: [ 'cuda11.8', 'cuda12.9' ]
 
     steps:
     - uses: actions/checkout@v4
diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile
index 3aa814221..7c6c927eb 100644
--- a/docker/base-dev-x.dockerfile
+++ b/docker/base-dev-x.dockerfile
@@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
         htop \
-        lcov \
         vim \
         && \
     apt-get autoremove -y && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install lcov 2.2
+RUN LCOV_VERSION="2.2" && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+        cpanminus \
+        gcc \
+        make \
+        perl \
+        && \
+    cpanm --notest \
+        Capture::Tiny \
+        DateTime \
+        JSON::XS \
+        Memory::Process \
+        TimeDate \
+        && \
+    cd /tmp && \
+    curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \
+    tar xzf lcov.tar.gz && \
+    cd lcov-${LCOV_VERSION} && \
+    make install && \
+    cd / && rm -rf /tmp/lcov* && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
 # Install CMake 3.26.4
 RUN OS_ARCH=$(uname -m) && \
     CMAKE_VERSION="3.26.4" && \
@@ -47,7 +72,8 @@ RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
         export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
     fi && \
     pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r python/requirements_${target_type}.txt
+    pip install --no-cache-dir -r python/requirements_${target_type}.txt && \
+    pip install --no-cache-dir coverage xlsxwriter
 
 # Cleanup
 RUN rm -rf /tmp/mscclpp
diff --git a/docker/build.sh b/docker/build.sh
index 63552f748..56d152bfd 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -4,22 +4,21 @@ set -e
 
 declare -A baseImageTable
 baseImageTable=(
-    ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
-    ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
-    ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
-    ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
+    ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04"
     ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
     ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
-    ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
+    ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04"
     ["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
     ["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
 )
 
 declare -A extraLdPathTable
 extraLdPathTable=(
-    ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
-    ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
-    ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
+    ["cuda11.8"]="/usr/local/cuda-11.8/compat"
+    ["cuda12.4"]="/usr/local/cuda-12.4/compat"
+    ["cuda12.8"]="/usr/local/cuda-12.8/compat"
+    ["cuda12.9"]="/usr/local/cuda-12.9/compat"
+    ["cuda13.0"]="/usr/local/cuda-13.0/compat"
     ["rocm6.2"]="/opt/rocm/lib"
 )
 
@@ -36,7 +35,7 @@ TARGET=${1}
 OS_ARCH=$(uname -m)
 
 print_usage() {
-    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
+    echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
 }
 
 if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
diff --git a/docs/quickstart.md b/docs/quickstart.md
index ac1b7d6bb..fd0b75714 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -42,7 +42,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se
 
 ```bash
 # For NVIDIA platforms
-$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash
+$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash
 # For AMD platforms
 $ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash
 ```

From 60ff32c014358fcae5fa6316c404eb2b95a25b95 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 6 Mar 2026 19:40:34 +0000
Subject: [PATCH 058/132] updates

---
 .azure-pipelines/ut.yml | 4 ++--
 docker/build.sh         | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index d888946ba..7952e53e6 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -143,7 +143,7 @@ jobs:
   strategy:
     matrix:
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
 
   container:
     image: $(containerImage)
@@ -165,7 +165,7 @@ jobs:
   strategy:
     matrix:
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
 
   container:
     image: $(containerImage)
diff --git a/docker/build.sh b/docker/build.sh
index 56d152bfd..89568e197 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -24,6 +24,7 @@ extraLdPathTable=(
 
 declare -A ofedVersionTable
 ofedVersionTable=(
+    ["cuda11.8"]="23.07-0.5.1.2"
     ["cuda12.4"]="23.07-0.5.1.2"
     ["cuda12.8"]="24.10-1.1.4.0"
     ["cuda12.9"]="24.10-1.1.4.0"

From 00583da21bf796092cf852a12a6f60e0e9fcd13e Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 6 Mar 2026 21:31:04 +0000
Subject: [PATCH 059/132] separate pipeline for codecov

---
 .azure-pipelines/{ut-rocm.yml => codecov.yml} | 43 ++++++++++++++-----
 .../{ut-codecov.yaml => codecov.yaml}         |  0
 .azure-pipelines/ut.yml                       | 41 ++++--------------
 3 files changed, 42 insertions(+), 42 deletions(-)
 rename .azure-pipelines/{ut-rocm.yml => codecov.yml} (59%)
 rename .azure-pipelines/templates/{ut-codecov.yaml => codecov.yaml} (100%)

diff --git a/.azure-pipelines/ut-rocm.yml b/.azure-pipelines/codecov.yml
similarity index 59%
rename from .azure-pipelines/ut-rocm.yml
rename to .azure-pipelines/codecov.yml
index 0df6e8faf..64d534d9f 100644
--- a/.azure-pipelines/ut-rocm.yml
+++ b/.azure-pipelines/codecov.yml
@@ -28,26 +28,49 @@ pr:
       - '**/*.md'
 
 jobs:
-- job: UnitTestMI300X
+- job: CodeCoverageA100
   timeoutInMinutes: 40
   pool:
-    name: msccl-ci-mi300x
+    name: msccl-ci
+  variables:
+  - group: mscclpp
   strategy:
     matrix:
-      rocm6_2:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
 
   container:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut.yaml
+  - template: templates/codecov.yaml
     parameters:
-      subscription:     mscclpp-ci-mi300x
-      vmssName:         mscclpp-mi300x-ci
+      subscription:     mscclpp-ci
+      vmssName:         mscclpp-ci
       sshKeySecureFile: mscclpp.pem
-      platform:         rocm
-      gpuArch:          gfx942
+      gpuArch:          '80'
+
+- job: CodeCoverageH100
+  timeoutInMinutes: 40
+  pool:
+    name: msccl-ci-h100
+  variables:
+  - group: mscclpp
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/codecov.yaml
+    parameters:
+      subscription:     mscclpp-ci-h100
+      vmssName:         mscclpp-h100-ci
+      sshKeySecureFile: mscclpp.pem
+      gpuArch:          '90'
 
 - job: CodeCoverageMI300X
   timeoutInMinutes: 40
@@ -64,7 +87,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut-codecov.yaml
+  - template: templates/codecov.yaml
     parameters:
       subscription:     mscclpp-ci-mi300x
       vmssName:         mscclpp-mi300x-ci
diff --git a/.azure-pipelines/templates/ut-codecov.yaml b/.azure-pipelines/templates/codecov.yaml
similarity index 100%
rename from .azure-pipelines/templates/ut-codecov.yaml
rename to .azure-pipelines/templates/codecov.yaml
diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index 7952e53e6..4ef8035ff 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -134,46 +134,23 @@ jobs:
       sshKeySecureFile: mscclpp.pem
       gpuArch:          '90'
 
-- job: CodeCoverageA100
+- job: UnitTestMI300X
   timeoutInMinutes: 40
   pool:
-    name: msccl-ci
-  variables:
-  - group: mscclpp
+    name: msccl-ci-mi300x
   strategy:
     matrix:
-      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+      rocm6_2:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
 
   container:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut-codecov.yaml
-    parameters:
-      subscription:     mscclpp-ci
-      vmssName:         mscclpp-ci
-      sshKeySecureFile: mscclpp.pem
-      gpuArch:          '80'
-
-- job: CodeCoverageH100
-  timeoutInMinutes: 40
-  pool:
-    name: msccl-ci-h100
-  variables:
-  - group: mscclpp
-  strategy:
-    matrix:
-      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
-
-  container:
-    image: $(containerImage)
-
-  steps:
-  - template: templates/ut-codecov.yaml
+  - template: templates/ut.yaml
     parameters:
-      subscription:     mscclpp-ci-h100
-      vmssName:         mscclpp-h100-ci
+      subscription:     mscclpp-ci-mi300x
+      vmssName:         mscclpp-mi300x-ci
       sshKeySecureFile: mscclpp.pem
-      gpuArch:          '90'
+      platform:         rocm
+      gpuArch:          gfx942

From c699b8a7840737d14b2f82d73c19480a94e4914b Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 7 Mar 2026 02:23:30 +0000
Subject: [PATCH 060/132] az pipeline refactoring

---
 .azure-pipelines/codecov.yml                  |   3 -
 .azure-pipelines/integration-test.yml         |   2 -
 .azure-pipelines/multi-nodes-test.yml         | 120 ++-------
 .azure-pipelines/nccl-api-test.yaml           |   2 -
 .azure-pipelines/rccl-api-test.yml            |   1 -
 .azure-pipelines/templates/codecov.yaml       | 122 +++------
 .azure-pipelines/templates/deploy.yaml        | 127 +++++++++
 .../templates/integration-test.yaml           | 218 +++------------
 .azure-pipelines/templates/nccl-test.yaml     | 248 ++----------------
 .azure-pipelines/templates/rccl-test.yaml     | 122 ++-------
 .azure-pipelines/templates/stop.yaml          |  20 ++
 .azure-pipelines/templates/ut-no-ib-env.yaml  | 152 +++--------
 .azure-pipelines/templates/ut-npkit.yaml      | 142 +++-------
 .azure-pipelines/templates/ut.yaml            | 123 ++-------
 .azure-pipelines/ut.yml                       |   6 -
 docs/quickstart.md                            |   3 -
 test/deploy/run-remote.sh                     |  96 +++++++
 test/deploy/run_tests.sh                      |   1 -
 18 files changed, 466 insertions(+), 1042 deletions(-)
 create mode 100644 .azure-pipelines/templates/deploy.yaml
 create mode 100644 .azure-pipelines/templates/stop.yaml
 create mode 100755 test/deploy/run-remote.sh

diff --git a/.azure-pipelines/codecov.yml b/.azure-pipelines/codecov.yml
index 64d534d9f..ea006a636 100644
--- a/.azure-pipelines/codecov.yml
+++ b/.azure-pipelines/codecov.yml
@@ -47,7 +47,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          '80'
 
 - job: CodeCoverageH100
@@ -69,7 +68,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          '90'
 
 - job: CodeCoverageMI300X
@@ -91,6 +89,5 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-mi300x
       vmssName:         mscclpp-mi300x-ci
-      sshKeySecureFile: mscclpp.pem
       platform:         rocm
       gpuArch:          gfx942
diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index f6fe3a47f..d7479b87c 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -45,7 +45,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          '80'
 
 - job: IntegrationTestH100
@@ -65,6 +64,5 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
-      sshKeySecureFile: mscclpp.pem
       perfBaselineFile: test/deploy/perf_ndmv5.jsonl
       gpuArch:          '90'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index 914c2317c..994b87ee7 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -37,33 +37,6 @@ jobs:
     image: $[ variables['containerImage'] ]
 
   steps:
-  - task: Bash@3
-    name: Build
-    displayName: Build
-    inputs:
-      targetType: 'inline'
-      script: |
-        mkdir build && cd build
-        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
-        make -j
-      workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-  - task: DownloadSecureFile@1
-    name: SshKeyFile
-    displayName: Download key file
-    inputs:
-      secureFile: mscclpp-ssh.key
-
-  - task: Bash@3
-    name: InstallPackages
-    displayName: Install Packages
-    inputs:
-      targetType: 'inline'
-      script: |
-        sudo apt-get update -y
-        sudo apt-get install pssh -y
-        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-
   - task: Bash@3
     displayName: Add HostEntry
     inputs:
@@ -77,23 +50,11 @@ jobs:
           echo "Entry already exists, nothing to do."
         fi
 
-  - task: AzureCLI@2
-    name: StartVMSS
-    displayName: Start VMSS
-    inputs:
-      azureSubscription: msccl-it
-      scriptType: bash
-      scriptLocation: inlineScript
-      inlineScript: |
-        az vmss start --name mscclit-vmss --resource-group msccl-IT
-
-  - task: Bash@3
-    name: DeployTestEnv
-    displayName: Deploy Test Env
-    inputs:
-      targetType: filePath
-      filePath: test/deploy/deploy.sh
-      workingDirectory: '$(System.DefaultWorkingDirectory)'
+  - template: templates/deploy.yaml
+    parameters:
+      subscription:  msccl-it
+      vmssName:      mscclit-vmss
+      resourceGroup: msccl-IT
 
   - task: Bash@3
     name: RunMscclppTest
@@ -101,18 +62,8 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        set -e
-        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
-        SSH_OPTION="StrictHostKeyChecking=no"
-        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-        rm -rf output/*
-        mkdir -p output
-        touch output/mscclit-000000
-        tail -f output/mscclit-000000 &
-        CHILD_PID=$!
-        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
-        kill $CHILD_PID
+        test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
+          "bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test"
 
   - task: Bash@3
     name: RunMultiNodeUnitTest
@@ -120,18 +71,8 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        set -e
-        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
-        SSH_OPTION="StrictHostKeyChecking=no"
-        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-        rm -rf output/*
-        mkdir -p output
-        touch output/mscclit-000000
-        tail -f output/mscclit-000000 &
-        CHILD_PID=$!
-        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
-        kill $CHILD_PID
+        test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
+          "bash /root/mscclpp/test/deploy/run_tests.sh mp-ut"
 
   - task: Bash@3
     name: RunMultiNodePythonTests
@@ -139,18 +80,8 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        set -e
-        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
-        SSH_OPTION="StrictHostKeyChecking=no"
-        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-        rm -rf output/*
-        mkdir -p output
-        touch output/mscclit-000000
-        tail -f output/mscclit-000000 &
-        CHILD_PID=$!
-        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
-        kill $CHILD_PID
+        test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
+          "bash /root/mscclpp/test/deploy/run_tests.sh pytests"
 
   - task: Bash@3
     name: RunMultiNodePythonBenchmark
@@ -158,26 +89,11 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        set -e
-        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
-        SSH_OPTION="StrictHostKeyChecking=no"
-        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-        rm -rf output/*
-        mkdir -p output
-        touch output/mscclit-000000
-        tail -f output/mscclit-000000 &
-        CHILD_PID=$!
-        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
-        kill $CHILD_PID
+        test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
+          "bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark"
 
-  - task: AzureCLI@2
-    name: StopVMSS
-    displayName: Deallocate VMSS
-    condition: always()
-    inputs:
-      azureSubscription: msccl-it
-      scriptType: bash
-      scriptLocation: inlineScript
-      inlineScript: |
-        az vmss deallocate  --name mscclit-vmss --resource-group msccl-IT
+  - template: templates/stop.yaml
+    parameters:
+      subscription:  msccl-it
+      vmssName:      mscclit-vmss
+      resourceGroup: msccl-IT
diff --git a/.azure-pipelines/nccl-api-test.yaml b/.azure-pipelines/nccl-api-test.yaml
index 4951c5bdd..275f45a3d 100644
--- a/.azure-pipelines/nccl-api-test.yaml
+++ b/.azure-pipelines/nccl-api-test.yaml
@@ -44,7 +44,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
-      sshKeySecureFile: mscclpp.pem
       nvccGencode:      "-gencode=arch=compute_80,code=sm_80"
 
 - job: NcclTestH100
@@ -65,5 +64,4 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
-      sshKeySecureFile: mscclpp.pem
       nvccGencode:      "-gencode=arch=compute_90,code=sm_90"
\ No newline at end of file
diff --git a/.azure-pipelines/rccl-api-test.yml b/.azure-pipelines/rccl-api-test.yml
index 92c5874f6..dda6e93a9 100644
--- a/.azure-pipelines/rccl-api-test.yml
+++ b/.azure-pipelines/rccl-api-test.yml
@@ -44,5 +44,4 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-mi300x
       vmssName:         mscclpp-mi300x-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          gfx942
diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index 21186c6b0..46e59f085 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -3,8 +3,6 @@ parameters:
   type: string
 - name: vmssName
   type: string
-- name: sshKeySecureFile
-  type: string
 - name: platform
   type: string
   default: 'cuda'
@@ -12,57 +10,17 @@ parameters:
   type: string
 
 steps:
-- task: Bash@3
-  name: BuildCoverage
-  displayName: Build with coverage
-  inputs:
-    targetType: 'inline'
-    script: |
-      mkdir build && cd build
-      if [ "${{ parameters.platform }}" == "rocm" ]; then
-        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
-      else
-        cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
-      fi
-      make -j
-      cd ..
-      pwd > build/BUILD_PREFIX
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: DownloadSecureFile@1
-  name: SshKeyFile
-  displayName: Download key file
-  inputs:
-    secureFile: ${{ parameters.sshKeySecureFile }}
-
-- task: Bash@3
-  name: InstallPackages
-  displayName: Install Packages
-  inputs:
-    targetType: 'inline'
-    script: |
-      sudo apt-get update -y
-      sudo apt-get install pssh -y
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-
-- task: AzureCLI@2
-  name: StartVMSS
-  displayName: Start VMSS
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss start --name ${{ parameters.vmssName }}  --resource-group mscclpp
-
-- task: Bash@3
-  name: DeployTestEnv
-  displayName: Deploy Test Env
-  inputs:
-    targetType: filePath
-    filePath: test/deploy/deploy.sh
-    arguments: "single-node-test true ${{ parameters.platform }}"
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: templates/deploy.yaml
+  parameters:
+    subscription:     ${{ parameters.subscription }}
+    vmssName:         ${{ parameters.vmssName }}
+    platform:         ${{ parameters.platform }}
+    gpuArch:          ${{ parameters.gpuArch }}
+    buildType:        Debug
+    cmakeArgs:        '-DMSCCLPP_ENABLE_COVERAGE=ON'
+    buildDisplayName: 'Build with coverage'
+    buildName:        BuildCoverage
+    deployArgs:       'single-node-test true ${{ parameters.platform }}'
 
 - task: Bash@3
   name: TestsCoverageNonPerf
@@ -70,30 +28,26 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        export PATH=/usr/local/mpi/bin:\$PATH;                        \
-        cd /root/mscclpp;                                             \
-        BUILD_PREFIX=\$(cat build/BUILD_PREFIX);                    \
-        STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c);      \
-        export GCOV_PREFIX=/root/mscclpp;                             \
-        export GCOV_PREFIX_STRIP=\$STRIP_COUNT;                      \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        ./build/bin/unit_tests;                                       \
-        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests;  \
-        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests;  \
-        cd build;                                                     \
-        lcov --directory . --capture --output-file coverage.info --ignore-errors inconsistent;  \
-        lcov --extract coverage.info \"\${BUILD_PREFIX}/src/*\" \"\${BUILD_PREFIX}/include/mscclpp/*\" --output-file coverage.info;  \
-        lcov --list coverage.info"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        BUILD_PREFIX=\$(cat build/BUILD_PREFIX); \
+        STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c); \
+        export GCOV_PREFIX=/root/mscclpp; \
+        export GCOV_PREFIX_STRIP=\$STRIP_COUNT; \
+        ./build/bin/unit_tests; \
+        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests; \
+        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests; \
+        lcov --version; \
+        LCOV_CAPTURE_ARGS=""; \
+        if lcov --help 2>&1 | grep -q "inconsistent"; then \
+          LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"; \
+        fi; \
+        lcov --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}; \
+        if [ ! -s coverage.info ]; then \
+          echo "ERROR: coverage.info was not generated. Tests may have failed before coverage capture or produced no gcov data."; \
+          exit 1; \
+        fi; \
+        lcov --extract coverage.info "\${BUILD_PREFIX}/src/*" "\${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info; \
+        lcov --list coverage.info'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -124,13 +78,7 @@ steps:
       ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- task: AzureCLI@2
-  name: StopVMSS
-  displayName: Deallocate VMSS
-  condition: always()
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
+- template: templates/stop.yaml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yaml
new file mode 100644
index 000000000..77a61eed3
--- /dev/null
+++ b/.azure-pipelines/templates/deploy.yaml
@@ -0,0 +1,127 @@
+parameters:
+- name: subscription
+  type: string
+- name: vmssName
+  type: string
+- name: resourceGroup
+  type: string
+  default: mscclpp
+# Build parameters
+- name: platform
+  type: string
+  default: 'cuda'
+- name: gpuArch
+  type: string
+  default: ''
+- name: buildType
+  type: string
+  default: 'Release'
+- name: buildTests
+  type: boolean
+  default: true
+- name: cmakeArgs
+  type: string
+  default: ''
+- name: buildName
+  type: string
+  default: 'Build'
+- name: buildDisplayName
+  type: string
+  default: 'Build'
+# Deploy parameters
+- name: deployArgs
+  type: string
+  default: ''
+
+steps:
+# 1. Check VMSS availability (fast, fail-fast)
+- task: AzureCLI@2
+  name: CheckVMSS
+  displayName: Check VMSS Availability
+  inputs:
+    azureSubscription: ${{ parameters.subscription }}
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      set -e
+      INSTANCES=$(az vmss list-instances --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} -o json)
+      COUNT=$(echo "$INSTANCES" | jq 'length')
+      if [ "$COUNT" -eq 0 ]; then
+        echo "##vso[task.logissue type=error]No VMSS instances found for ${{ parameters.vmssName }}"
+        exit 1
+      fi
+      FAILED=$(echo "$INSTANCES" | jq '[.[] | select(.provisioningState == "Failed")] | length')
+      if [ "$FAILED" -gt 0 ]; then
+        echo "##vso[task.logissue type=error]$FAILED VMSS instance(s) in Failed state"
+        exit 1
+      fi
+      echo "VMSS ${{ parameters.vmssName }}: $COUNT instance(s) available"
+
+# 2. Build
+- task: Bash@3
+  name: ${{ parameters.buildName }}
+  displayName: ${{ parameters.buildDisplayName }}
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      rm -rf build
+      mkdir -p build && cd build
+      ${{ if eq(parameters.platform, 'rocm') }}
+      CXX=/opt/rocm/bin/hipcc cmake \
+        -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
+        -DMSCCLPP_BYPASS_GPU_CHECK=ON \
+        -DMSCCLPP_USE_ROCM=ON \
+        ${{ if parameters.buildTests }}-DMSCCLPP_BUILD_TESTS=ON${{ endif }} \
+        ${{ if ne(parameters.gpuArch, '') }}-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}${{ endif }} \
+        ${{ parameters.cmakeArgs }} ..
+      ${{ else }}
+      cmake \
+        -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
+        -DMSCCLPP_BYPASS_GPU_CHECK=ON \
+        -DMSCCLPP_USE_CUDA=ON \
+        ${{ if parameters.buildTests }}-DMSCCLPP_BUILD_TESTS=ON${{ endif }} \
+        ${{ if ne(parameters.gpuArch, '') }}-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}${{ endif }} \
+        ${{ parameters.cmakeArgs }} ..
+      ${{ endif }}
+      make -j
+      cd ..
+      pwd > build/BUILD_PREFIX
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+# 3. Download SSH key + install packages + start VMSS
+- task: DownloadSecureFile@1
+  name: SshKeyFile
+  displayName: Download key file
+  inputs:
+    secureFile: mscclpp.pem
+
+- task: Bash@3
+  name: InstallPackages
+  displayName: Install Packages
+  inputs:
+    targetType: 'inline'
+    script: |
+      sudo apt-get update -y
+      sudo apt-get install pssh -y
+      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+
+- task: AzureCLI@2
+  name: StartVMSS
+  displayName: Start VMSS
+  inputs:
+    azureSubscription: ${{ parameters.subscription }}
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
+
+# 4. Deploy test environment
+- task: Bash@3
+  name: DeployTestEnv
+  displayName: Deploy Test Env
+  inputs:
+    targetType: filePath
+    filePath: test/deploy/deploy.sh
+    arguments: ${{ parameters.deployArgs }}
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.azure-pipelines/templates/integration-test.yaml b/.azure-pipelines/templates/integration-test.yaml
index 99ed6d04c..e9f15ac46 100644
--- a/.azure-pipelines/templates/integration-test.yaml
+++ b/.azure-pipelines/templates/integration-test.yaml
@@ -3,8 +3,6 @@ parameters:
   type: string
 - name: vmssName
   type: string
-- name: sshKeySecureFile
-  type: string
 - name: perfBaselineFile
   type: string
   default: 'test/deploy/perf_ndmv4.jsonl'
@@ -12,51 +10,12 @@ parameters:
   type: string
 
 steps:
-- task: Bash@3
-  name: Build
-  displayName: Build
-  inputs:
-    targetType: inline
-    script: |
-      mkdir build && cd build
-      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
-      make -j
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: Bash@3
-  name: InstallPackages
-  displayName: Install Packages
-  inputs:
-    targetType: inline
-    script: |
-      sudo apt-get update -y
-      sudo apt-get install pssh -y
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-
-- task: DownloadSecureFile@1
-  name: SshKeyFile
-  displayName: Download key file
-  inputs:
-    secureFile: ${{ parameters.sshKeySecureFile }}
-
-- task: AzureCLI@2
-  name: StartVMSS
-  displayName: Start VMSS
-  inputs:
-    azureSubscription: ${{ parameters.subscription }} 
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
-
-- task: Bash@3
-  name: DeployTestEnv
-  displayName: Deploy Test Env
-  inputs:
-    targetType: filePath
-    filePath: test/deploy/deploy.sh
-    arguments: "single-node-test"
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: templates/deploy.yaml
+  parameters:
+    subscription:     ${{ parameters.subscription }}
+    vmssName:         ${{ parameters.vmssName }}
+    gpuArch:          ${{ parameters.gpuArch }}
+    deployArgs:       'single-node-test'
 
 - task: Bash@3
   name: AllGatherTest
@@ -64,24 +23,12 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
-        export PATH=/usr/local/mpi/bin:\$PATH;                     \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
-        cd /root/mscclpp;                                         \
-        set -e;                                                   \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl;       \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl;  \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl;  \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        set -e; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -90,21 +37,9 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        set -e;                                                   \
-        export PATH=/usr/local/mpi/bin:\$PATH;                    \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
-        cd /root/mscclpp;                                         \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        set -e; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -113,27 +48,15 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        set -e;                                                   \
-        export PATH=/usr/local/mpi/bin:\$PATH;                     \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
-        cd /root/mscclpp;                                         \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl;                 \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl;            \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl;            \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl;            \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl;            \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl;  \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        set -e; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -142,21 +65,10 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        set -e;                                                   \
-        export PATH=/usr/local/mpi/bin:\$PATH;                    \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        cd /root/mscclpp;                                         \
+      test/deploy/run-remote.sh '\
+        set -e; \
         mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
-      kill $CHILD_PID
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -165,21 +77,9 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        set -e;                                                   \
-        cd /root/mscclpp;                                         \
-        export PATH=/usr/local/mpi/bin:\$PATH;                    \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        set -e; \
+        python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -188,55 +88,13 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
-        set -e;                                                    \
-        cd /root/mscclpp;                                          \
-        export PATH=/usr/local/mpi/bin:\$PATH;                     \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        python3 -m pip install .;                                     \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        set -e; \
+        python3 -m pip install .; \
+        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- task: Bash@3
-  name: FifoPerfBenchmark
-  displayName: FIFO Performance Benchmark
-  inputs:
-    targetType: 'inline'
-    script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        set -e;                                                   \
-        export PATH=/usr/local/mpi/bin:\$PATH;                    \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        cd /root/mscclpp;                                         \
-        ./build/bin/perf/fifo_test"'
-      kill $CHILD_PID
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-
-- task: AzureCLI@2
-  name: StopVMSS
-  displayName: Deallocate VMSS
-  condition: always()
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
\ No newline at end of file
+- template: templates/stop.yaml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName:     ${{ parameters.vmssName }}
\ No newline at end of file
diff --git a/.azure-pipelines/templates/nccl-test.yaml b/.azure-pipelines/templates/nccl-test.yaml
index 56b75d3f2..31be3fa35 100644
--- a/.azure-pipelines/templates/nccl-test.yaml
+++ b/.azure-pipelines/templates/nccl-test.yaml
@@ -4,99 +4,22 @@
 #
 # Parameters:
 #   subscription     – Azure subscription to use for VMSS start/stop
-#   sshKeySecureFile – the secureFile name for your SSH key
 
 parameters:
 - name: subscription
   type: string
 - name: vmssName
   type: string
-- name: sshKeySecureFile
-  type: string
 - name: nvccGencode
   type: string
   default: "-gencode=arch=compute_80,code=sm_80"
 
 steps:
-- checkout: self
-- checkout: git://One/msccl-users
-- task: Bash@3
-  name: Build
-  displayName: Build
-  inputs:
-    targetType: 'inline'
-    script: |
-      mkdir build && cd build
-      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
-      make -j
-    workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
-
-- task: DownloadSecureFile@1
-  name: SshKeyFile
-  displayName: Download key file
-  inputs:
-    secureFile: ${{ parameters.sshKeySecureFile }}
-
-- task: Bash@3
-  name: InstallPackages
-  displayName: Install Packages
-  inputs:
-    targetType: 'inline'
-    script: |
-      sudo apt-get update -y
-      sudo apt-get install pssh -y
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-- task: AzureCLI@2
-  name: StartVMSS
-  displayName: Start VMSS
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
-- task: Bash@3
-  name: DeployTestEnv
-  displayName: Deploy Test Env
-  inputs:
-    targetType: filePath
-    filePath: mscclpp/test/deploy/deploy.sh
-    arguments: nccltest-single-node
-    workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp
-
-- task: Bash@3
-  name: CopyMscclUsers
-  displayName: Copy msccl-users
-  inputs:
-    targetType: inline
-    script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      DST_DIR="/tmp/mscclpp/msccl-users"
-      parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-# - task: Bash@3
-#   name: GenerateExecutionFile
-#   displayName: Generate execution file
-#   inputs:
-#     targetType: 'inline'
-#     script: |
-#       set -e
-#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-#       SSH_OPTION="StrictHostKeyChecking=no"
-#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-#         cd /root/mscclpp/msccl-users;  \
-#         mkdir -p execution-files;      \
-#         cd /root/mscclpp/msccl-users;  \
-#         bash algos/mscclpp_a100/generate_execution_plan.sh"'
-#     workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: templates/deploy.yaml
+  parameters:
+    subscription:     ${{ parameters.subscription }}
+    vmssName:         ${{ parameters.vmssName }}
+    deployArgs:       'nccltest-single-node'
 
 - task: Bash@3
   name: InstallNcclTests
@@ -104,85 +27,22 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"   \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
-        cd; git clone https://github.com/NVIDIA/nccl-tests.git;    \
-        cd nccl-tests;                                             \
-        MPI=1 MPI_HOME=/usr/local/mpi make -j"'
+      test/deploy/run-remote.sh '\
+        cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
+        cd nccl-tests; \
+        MPI=1 MPI_HOME=/usr/local/mpi make -j'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-# - task: Bash@3
-#   name: RunNcclAllReduceTest
-#   displayName: Run NCCL AllReduce Test
-#   inputs:
-#     targetType: inline
-#     script: |
-#       set -e
-#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-#       SSH_OPTION="StrictHostKeyChecking=no"
-#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-#         cd /root/mscclpp;                                         \
-#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN  -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
-#     workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-# - task: Bash@3
-#   name: RunNcclAllGatherTest
-#   displayName: Run NCCL AllGather Test
-#   inputs:
-#     targetType: inline
-#     script: |
-#       set -e
-#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-#       SSH_OPTION="StrictHostKeyChecking=no"
-#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-#         cd /root/mscclpp;                                         \
-#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN  -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
-#     workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-# - task: Bash@3
-#   name: RunNcclReduceScatterTest
-#   displayName: Run NCCL Reduce Scatter Test
-#   inputs:
-#     targetType: inline
-#     script: |
-#       set -e
-#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-#       SSH_OPTION="StrictHostKeyChecking=no"
-#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-#         cd /root/mscclpp;                                         \
-#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN  -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
-#     workingDirectory: '$(System.DefaultWorkingDirectory)'
-
 - task: Bash@3
   name: InstallNccl
   displayName: Install NCCL
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"   \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
-        cd; git clone https://github.com/NVIDIA/nccl.git;          \
-        cd nccl;                                                   \
-        make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
+      test/deploy/run-remote.sh '\
+        cd; git clone https://github.com/NVIDIA/nccl.git; \
+        cd nccl; \
+        make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -191,19 +51,9 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
-        cd /root/mscclpp;                                         \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20;           \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
+      test/deploy/run-remote.sh '\
+        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
+        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -212,19 +62,9 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
-        cd /root/mscclpp;                                         \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20;           \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
+      test/deploy/run-remote.sh '\
+        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
+        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -233,48 +73,12 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
-        cd /root/mscclpp;                                         \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20;           \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
+      test/deploy/run-remote.sh '\
+        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
+        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-# - task: Bash@3
-#   name: RunNcclReduceScatterFallbaclkToNcclTest
-#   displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
-#   inputs:
-#     targetType: 'inline'
-#     script: |
-#       set -e
-#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
-#       SSH_OPTION="StrictHostKeyChecking=no"
-#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-#         cd /root/mscclpp;                                         \
-#         echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";                                                                 \
-#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20;                                                                            \
-#         echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
-#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
-#     workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: AzureCLI@2
-  name: StopVMSS
-  displayName: Deallocate VMSS
-  condition: always()
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
+- template: templates/stop.yaml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/rccl-test.yaml b/.azure-pipelines/templates/rccl-test.yaml
index 040605dfd..00ab9b443 100644
--- a/.azure-pipelines/templates/rccl-test.yaml
+++ b/.azure-pipelines/templates/rccl-test.yaml
@@ -5,7 +5,6 @@
 # Parameters:
 #   subscription     – Azure subscription to use for VMSS start/stop
 #   vmssName         – VMSS name to start/stop
-#   sshKeySecureFile – the secureFile name for your SSH key
 #   gpuArch          – GPU architecture (e.g. gfx942)
 
 parameters:
@@ -13,56 +12,19 @@ parameters:
   type: string
 - name: vmssName
   type: string
-- name: sshKeySecureFile
-  type: string
 - name: gpuArch
   type: string
   default: "gfx942"
 
 steps:
-- task: Bash@3
-  name: Build
-  displayName: Build
-  inputs:
-    targetType: 'inline'
-    script: |
-      mkdir build && cd build
-      CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
-      make -j
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: DownloadSecureFile@1
-  name: SshKeyFile
-  displayName: Download key file
-  inputs:
-    secureFile: ${{ parameters.sshKeySecureFile }}
-
-- task: Bash@3
-  name: InstallPackages
-  displayName: Install Packages
-  inputs:
-    targetType: 'inline'
-    script: |
-      sudo apt-get update -y
-      sudo apt-get install pssh -y
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-- task: AzureCLI@2
-  name: StartVMSS
-  displayName: Start VMSS
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
-- task: Bash@3
-  name: DeployTestEnv
-  displayName: Deploy Test Env
-  inputs:
-    targetType: filePath
-    filePath: test/deploy/deploy.sh
-    arguments: "single-node-test true rocm"
-    workingDirectory: $(System.DefaultWorkingDirectory)
+- template: templates/deploy.yaml
+  parameters:
+    subscription:     ${{ parameters.subscription }}
+    vmssName:         ${{ parameters.vmssName }}
+    platform:         rocm
+    gpuArch:          ${{ parameters.gpuArch }}
+    buildTests:       false
+    deployArgs:       'single-node-test true rocm'
 
 
 - task: Bash@3
@@ -71,21 +33,15 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"   \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
-        cd;                                                        \
+      test/deploy/run-remote.sh '\
+        cd; \
         git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \
-        cd rocm-systems;                                           \
-        git sparse-checkout init --cone;                           \
-        git sparse-checkout set projects/rccl-tests;               \
-        git checkout;                                              \
-        cd projects/rccl-tests;                                    \
-        MPI=1 MPI_HOME=/usr/local/mpi make -j"'
+        cd rocm-systems; \
+        git sparse-checkout init --cone; \
+        git sparse-checkout set projects/rccl-tests; \
+        git checkout; \
+        cd projects/rccl-tests; \
+        MPI=1 MPI_HOME=/usr/local/mpi make -j'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -94,19 +50,9 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        cd /root/mscclpp;                                         \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN  /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20;           \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
+      test/deploy/run-remote.sh '\
+        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
+        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -115,28 +61,12 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      ROOT_DIR=$(System.DefaultWorkingDirectory)
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"  \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        cd /root/mscclpp;                                         \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN  /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20;           \
-        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
-        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
+      test/deploy/run-remote.sh '\
+        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
+        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- task: AzureCLI@2
-  name: StopVMSS
-  displayName: Deallocate VMSS
-  condition: always()
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
+- template: templates/stop.yaml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/stop.yaml b/.azure-pipelines/templates/stop.yaml
new file mode 100644
index 000000000..40498c290
--- /dev/null
+++ b/.azure-pipelines/templates/stop.yaml
@@ -0,0 +1,20 @@
+parameters:
+- name: subscription
+  type: string
+- name: vmssName
+  type: string
+- name: resourceGroup
+  type: string
+  default: mscclpp
+
+steps:
+- task: AzureCLI@2
+  name: StopVMSS
+  displayName: Deallocate VMSS
+  condition: always()
+  inputs:
+    azureSubscription: ${{ parameters.subscription }}
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml
index 0d97f9fc9..cf1c63867 100644
--- a/.azure-pipelines/templates/ut-no-ib-env.yaml
+++ b/.azure-pipelines/templates/ut-no-ib-env.yaml
@@ -3,57 +3,17 @@ parameters:
   type: string
 - name: vmssName
   type: string
-- name: sshKeySecureFile
-  type: string
 - name: gpuArch
   type: string
 
 steps:
-- task: Bash@3
-  name: Build
-  displayName: Build
-  inputs:
-    targetType: 'inline'
-    script: |
-      mkdir build && cd build
-      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
-      make -j
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: DownloadSecureFile@1
-  name: SshKeyFile
-  displayName: Download key file
-  inputs:
-    secureFile: ${{ parameters.sshKeySecureFile }}
-
-- task: Bash@3
-  name: InstallPackages
-  displayName: Install Packages
-  inputs:
-    targetType: 'inline'
-    script: |
-      sudo apt-get update -y
-      sudo apt-get install pssh -y
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-
-- task: AzureCLI@2
-  name: StartVMSS
-  displayName: Start VMSS
-  inputs:
-    azureSubscription: ${{ parameters.subscription }} 
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss start --name ${{ parameters.vmssName }}  --resource-group mscclpp
-
-- task: Bash@3
-  name: DeployTestEnv
-  displayName: Deploy Test Env
-  inputs:
-    targetType: filePath
-    filePath: test/deploy/deploy.sh
-    arguments: single-node-test false
-    workingDirectory: $(System.DefaultWorkingDirectory)
+- template: templates/deploy.yaml
+  parameters:
+    subscription:     ${{ parameters.subscription }}
+    vmssName:         ${{ parameters.vmssName }}
+    gpuArch:          ${{ parameters.gpuArch }}
+    cmakeArgs:        '-DMSCCLPP_USE_IB=OFF'
+    deployArgs:       'single-node-test false'
 
 - task: Bash@3
   name: UnitTests
@@ -61,19 +21,8 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        cd /root/mscclpp;                                             \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        ./build/bin/unit_tests"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        ./build/bin/unit_tests'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -82,22 +31,10 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        export PATH=/usr/local/mpi/bin:\$PATH;                        \
-        cd /root/mscclpp;                                             \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests;  \
-        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests;  \
-        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
+        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
+        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -106,20 +43,8 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .     \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "     \
-        export PATH=/usr/local/mpi/bin:\$PATH                          \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
-        cd /root/mscclpp;                                              \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -128,11 +53,7 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
+      test/deploy/run-remote.sh --no-docker --no-log \
         "sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true"
       rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
     workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -143,8 +64,15 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      rm -rf build && mkdir build && cd build
-      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
+      set -e
+      rm -rf build
+      mkdir -p build && cd build
+      cmake \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DMSCCLPP_BYPASS_GPU_CHECK=ON \
+        -DMSCCLPP_USE_CUDA=ON \
+        -DMSCCLPP_BUILD_TESTS=ON \
+        -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
       make -j
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
@@ -163,29 +91,11 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .     \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "     \
-        export PATH=/usr/local/mpi/bin:\$PATH                          \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
-        cd /root/mscclpp;                                              \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- task: AzureCLI@2
-  name: StopVMSS
-  displayName: Deallocate VMSS
-  condition: always()
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
\ No newline at end of file
+- template: templates/stop.yaml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yaml
index d4456f892..86614a15b 100644
--- a/.azure-pipelines/templates/ut-npkit.yaml
+++ b/.azure-pipelines/templates/ut-npkit.yaml
@@ -3,70 +3,18 @@ parameters:
   type: string
 - name: vmssName
   type: string
-- name: sshKeySecureFile
-  type: string
 - name: gpuArch
   type: string
 
 
 steps:
-- task: DownloadSecureFile@1
-  name: SshKeyFile
-  displayName: Download key file
-  inputs:
-    secureFile: ${{ parameters.sshKeySecureFile }}
-
-- task: Bash@3
-  name: InstallPackages
-  displayName: Install Packages
-  inputs:
-    targetType: inline
-    script: |
-      sudo apt-get update -y
-      sudo apt-get install pssh -y
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-
-- task: AzureCLI@2
-  name: StartVMSS
-  displayName: Start VMSS
-  inputs:
-    azureSubscription: ${{ parameters.subscription }} 
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss start --name ${{ parameters.vmssName }}  --resource-group mscclpp
-
-- task: Bash@3
-  name: DeployTestEnv
-  displayName: Deploy Test Env
-  inputs:
-    targetType: filePath
-    filePath: test/deploy/deploy.sh
-    arguments: "single-node-test"
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: Bash@3
-  name: Build
-  displayName: Build
-  inputs:
-    targetType: 'inline'
-    script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        set -e;                                                       \
-        cd /root/mscclpp;                                             \
-        mkdir -p build && cd build;                                   \
-        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
-        make -j"'
-      kill $CHILD_PID
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: templates/deploy.yaml
+  parameters:
+    subscription:     ${{ parameters.subscription }}
+    vmssName:         ${{ parameters.vmssName }}
+    gpuArch:          ${{ parameters.gpuArch }}
+    cmakeArgs:        '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
+    deployArgs:       'single-node-test'
 
 - task: Bash@3
   name: MpUnitTests
@@ -74,27 +22,15 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        cd /root/mscclpp;                                             \
+      test/deploy/run-remote.sh '\
         rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
-        export PATH=/usr/local/mpi/bin:\$PATH; \
-        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump;    \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
+        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
         mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter=\"ExecutorTest.TwoNodesAllreduce\"; \
         python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
-        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json;    \
-        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json;  \
-        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json;    \
-        grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
-      kill $CHILD_PID
+        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
+        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
+        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
+        grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -103,43 +39,25 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      # set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        cd /root/mscclpp;                                             \
+      test/deploy/run-remote.sh '\
         rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
-        export PATH=/usr/local/mpi/bin:\$PATH; \
-        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump;    \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
+        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
+        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k '"'"'test_executor[allreduce.json'"'"'; \
         python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
-        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json;    \
-        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json;  \
-        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json;    \
+        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
+        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
+        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
         grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
-        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output;     \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json';      \
-        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output;  \
-        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json;          \
-        grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json;   \
-        grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json;    \
-        grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
-      kill $CHILD_PID
+        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
+        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k '"'"'test_executor[allreduce_packet.json'"'"'; \
+        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
+        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
+        grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
+        grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
+        grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- task: AzureCLI@2
-  name: StopVMSS
-  displayName: Deallocate VMSS
-  condition: always()
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
+- template: templates/stop.yaml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index 2086fd0ac..cf9ad6157 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -3,8 +3,6 @@ parameters:
   type: string
 - name: vmssName
   type: string
-- name: sshKeySecureFile
-  type: string
 - name: platform
   type: string
   default: 'cuda'
@@ -12,55 +10,13 @@ parameters:
   type: string
 
 steps:
-- task: Bash@3
-  name: Build
-  displayName: Build
-  inputs:
-    targetType: 'inline'
-    script: |
-      mkdir build && cd build
-      if [ "${{ parameters.platform }}" == "rocm" ]; then
-        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
-      else
-        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
-      fi
-      make -j
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
-
-- task: DownloadSecureFile@1
-  name: SshKeyFile
-  displayName: Download key file
-  inputs:
-    secureFile: ${{ parameters.sshKeySecureFile }}
-
-- task: Bash@3
-  name: InstallPackages
-  displayName: Install Packages
-  inputs:
-    targetType: 'inline'
-    script: |
-      sudo apt-get update -y
-      sudo apt-get install pssh -y
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-
-- task: AzureCLI@2
-  name: StartVMSS
-  displayName: Start VMSS
-  inputs:
-    azureSubscription: ${{ parameters.subscription }} 
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss start --name ${{ parameters.vmssName }}  --resource-group mscclpp
-
-- task: Bash@3
-  name: DeployTestEnv
-  displayName: Deploy Test Env
-  inputs:
-    targetType: filePath
-    filePath: test/deploy/deploy.sh
-    arguments: "single-node-test true ${{ parameters.platform }}"
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: templates/deploy.yaml
+  parameters:
+    subscription:     ${{ parameters.subscription }}
+    vmssName:         ${{ parameters.vmssName }}
+    platform:         ${{ parameters.platform }}
+    gpuArch:          ${{ parameters.gpuArch }}
+    deployArgs:       'single-node-test true ${{ parameters.platform }}'
 
 
 - task: Bash@3
@@ -69,19 +25,8 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        cd /root/mscclpp;                                             \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        ./build/bin/unit_tests"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        ./build/bin/unit_tests'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -90,22 +35,10 @@ steps:
   inputs:
     targetType: 'inline'
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .    \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "    \
-        export PATH=/usr/local/mpi/bin:\$PATH;                        \
-        cd /root/mscclpp;                                             \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
-        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests;  \
-        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests;  \
-        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
+        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
+        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -114,29 +47,11 @@ steps:
   inputs:
     targetType: inline
     script: |
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o .     \
-        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "     \
-        export PATH=/usr/local/mpi/bin:\$PATH                          \
-        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH;  \
-        cd /root/mscclpp;                                              \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
-      kill $CHILD_PID
+      test/deploy/run-remote.sh '\
+        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- task: AzureCLI@2
-  name: StopVMSS
-  displayName: Deallocate VMSS
-  condition: always()
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
+- template: templates/stop.yaml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index 4ef8035ff..e6590abb1 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -47,7 +47,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          '80'
 
 - job: UnitTestWithNpKitA100
@@ -69,7 +68,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          '80'
 
 - job: UnitTestH100
@@ -89,7 +87,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          '90'
 
 - job: UnitTestWithNpKitH100
@@ -109,7 +106,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          '90'
 
 - job: UnitTestNoIBEnv
@@ -131,7 +127,6 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch:          '90'
 
 - job: UnitTestMI300X
@@ -151,6 +146,5 @@ jobs:
     parameters:
       subscription:     mscclpp-ci-mi300x
       vmssName:         mscclpp-mi300x-ci
-      sshKeySecureFile: mscclpp.pem
       platform:         rocm
       gpuArch:          gfx942
diff --git a/docs/quickstart.md b/docs/quickstart.md
index fd0b75714..b7a68050e 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -171,7 +171,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
 For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
 
 ```bash
-export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
 mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
 ```
 
@@ -189,13 +188,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
 
 Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
 ```bash
-export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
 mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
 ```
 
 Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist.
 ```bash
-export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
 mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
 ```
 
diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
new file mode 100755
index 000000000..ca393cca1
--- /dev/null
+++ b/test/deploy/run-remote.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# Run a command on remote CI VMs via parallel-ssh.
+# By default, runs inside the mscclpp-test docker container.
+#
+# Usage:
+#   run-remote.sh [OPTIONS] <command>
+#
+# Options:
+#   --no-docker   Run command directly on the host, not inside docker
+#   --no-log      Don't tail the log file in the background
+#   --hostfile    Override hostfile path (default: test/deploy/hostfile_ci)
+#   --host        Run command on a single host (uses parallel-ssh -H)
+#   --user        SSH user when using --host or custom hostfile
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HOSTFILE="${SCRIPT_DIR}/hostfile_ci"
+SSH_OPTION="StrictHostKeyChecking=no"
+KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}"
+
+USE_DOCKER=true
+USE_LOG=true
+TARGET_HOST=""
+REMOTE_USER=""
+
+while [[ "$1" == --* ]]; do
+    case "$1" in
+        --no-docker) USE_DOCKER=false; shift ;;
+        --no-log)    USE_LOG=false; shift ;;
+        --hostfile)
+            if [ -z "$2" ]; then
+                echo "Missing value for --hostfile" >&2
+                exit 1
+            fi
+            HOSTFILE="$2"
+            shift 2
+            ;;
+        --host)
+            if [ -z "$2" ]; then
+                echo "Missing value for --host" >&2
+                exit 1
+            fi
+            TARGET_HOST="$2"
+            shift 2
+            ;;
+        --user)
+            if [ -z "$2" ]; then
+                echo "Missing value for --user" >&2
+                exit 1
+            fi
+            REMOTE_USER="$2"
+            shift 2
+            ;;
+        *) echo "Unknown option: $1" >&2; exit 1 ;;
+    esac
+done
+
+if [ $# -eq 0 ]; then
+    echo "Usage: $0 [--no-docker] [--no-log] <command>" >&2
+    exit 1
+fi
+CMD="$*"
+
+PSSH_TARGET_ARGS=()
+if [ -n "$TARGET_HOST" ]; then
+    PSSH_TARGET_ARGS=(-H "$TARGET_HOST")
+else
+    PSSH_TARGET_ARGS=(-h "$HOSTFILE")
+fi
+
+PSSH_USER_ARGS=()
+if [ -n "$REMOTE_USER" ]; then
+    PSSH_USER_ARGS=(-l "$REMOTE_USER")
+fi
+
+if $USE_LOG; then
+    if [ -n "$TARGET_HOST" ]; then
+        HOST="$TARGET_HOST"
+    else
+        HOST=$(head -1 "${HOSTFILE}")
+        HOST="${HOST##*@}"
+    fi
+    : > "${HOST}"
+    tail -f "${HOST}" &
+    CHILD_PID=$!
+    trap "kill $CHILD_PID 2>/dev/null" EXIT
+fi
+
+if $USE_DOCKER; then
+    parallel-ssh -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" -o . \
+        -O "$SSH_OPTION" "sudo docker exec -t mscclpp-test bash -c \"set -ex; pushd /root/mscclpp >/dev/null; trap 'popd >/dev/null' EXIT; ${CMD}\""
+else
+    parallel-ssh -i -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" \
+        -O "$SSH_OPTION" "set -ex; ${CMD}"
+fi
diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh
index 488fa81f6..0c05a090c 100644
--- a/test/deploy/run_tests.sh
+++ b/test/deploy/run_tests.sh
@@ -1,6 +1,5 @@
 set -e
 HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
-export PATH=/usr/local/mpi/bin:$PATH
 
 function run_mscclpp_test()
 {

From 75ac8be225ff9958f0653e16a0c99f7ef6d0de48 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 7 Mar 2026 02:31:51 +0000
Subject: [PATCH 061/132] fix

---
 .azure-pipelines/templates/codecov.yaml          | 4 ++--
 .azure-pipelines/templates/integration-test.yaml | 4 ++--
 .azure-pipelines/templates/nccl-test.yaml        | 4 ++--
 .azure-pipelines/templates/rccl-test.yaml        | 4 ++--
 .azure-pipelines/templates/ut-no-ib-env.yaml     | 4 ++--
 .azure-pipelines/templates/ut-npkit.yaml         | 4 ++--
 .azure-pipelines/templates/ut.yaml               | 4 ++--
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index 46e59f085..b82da6bbd 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -10,7 +10,7 @@ parameters:
   type: string
 
 steps:
-- template: templates/deploy.yaml
+- template: deploy.yaml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -78,7 +78,7 @@ steps:
       ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- template: templates/stop.yaml
+- template: stop.yaml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/integration-test.yaml b/.azure-pipelines/templates/integration-test.yaml
index e9f15ac46..acbb710ff 100644
--- a/.azure-pipelines/templates/integration-test.yaml
+++ b/.azure-pipelines/templates/integration-test.yaml
@@ -10,7 +10,7 @@ parameters:
   type: string
 
 steps:
-- template: templates/deploy.yaml
+- template: deploy.yaml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -94,7 +94,7 @@ steps:
         mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- template: templates/stop.yaml
+- template: stop.yaml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
\ No newline at end of file
diff --git a/.azure-pipelines/templates/nccl-test.yaml b/.azure-pipelines/templates/nccl-test.yaml
index b61e4aab1..c6260e761 100644
--- a/.azure-pipelines/templates/nccl-test.yaml
+++ b/.azure-pipelines/templates/nccl-test.yaml
@@ -15,7 +15,7 @@ parameters:
   default: "-gencode=arch=compute_80,code=sm_80"
 
 steps:
-- template: templates/deploy.yaml
+- template: deploy.yaml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -80,7 +80,7 @@ steps:
         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- template: templates/stop.yaml
+- template: stop.yaml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/rccl-test.yaml b/.azure-pipelines/templates/rccl-test.yaml
index 00ab9b443..7be3f9936 100644
--- a/.azure-pipelines/templates/rccl-test.yaml
+++ b/.azure-pipelines/templates/rccl-test.yaml
@@ -17,7 +17,7 @@ parameters:
   default: "gfx942"
 
 steps:
-- template: templates/deploy.yaml
+- template: deploy.yaml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -66,7 +66,7 @@ steps:
         mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- template: templates/stop.yaml
+- template: stop.yaml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml
index cf1c63867..6514fbc3c 100644
--- a/.azure-pipelines/templates/ut-no-ib-env.yaml
+++ b/.azure-pipelines/templates/ut-no-ib-env.yaml
@@ -7,7 +7,7 @@ parameters:
   type: string
 
 steps:
-- template: templates/deploy.yaml
+- template: deploy.yaml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -95,7 +95,7 @@ steps:
         mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- template: templates/stop.yaml
+- template: stop.yaml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yaml
index 86614a15b..46749db7c 100644
--- a/.azure-pipelines/templates/ut-npkit.yaml
+++ b/.azure-pipelines/templates/ut-npkit.yaml
@@ -8,7 +8,7 @@ parameters:
 
 
 steps:
-- template: templates/deploy.yaml
+- template: deploy.yaml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -57,7 +57,7 @@ steps:
         grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- template: templates/stop.yaml
+- template: stop.yaml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index cf9ad6157..4bc1a9aec 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -10,7 +10,7 @@ parameters:
   type: string
 
 steps:
-- template: templates/deploy.yaml
+- template: deploy.yaml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -51,7 +51,7 @@ steps:
         mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- template: templates/stop.yaml
+- template: stop.yaml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}

From e0c7ddb5ff3d8891b9846fec7c6986322a1fab3a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 7 Mar 2026 02:33:20 +0000
Subject: [PATCH 062/132] fix

---
 .azure-pipelines/templates/deploy.yaml | 45 ++++++++++++++++----------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yaml
index 77a61eed3..dc686fa61 100644
--- a/.azure-pipelines/templates/deploy.yaml
+++ b/.azure-pipelines/templates/deploy.yaml
@@ -67,23 +67,34 @@ steps:
       set -e
       rm -rf build
       mkdir -p build && cd build
-      ${{ if eq(parameters.platform, 'rocm') }}
-      CXX=/opt/rocm/bin/hipcc cmake \
-        -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-        -DMSCCLPP_BYPASS_GPU_CHECK=ON \
-        -DMSCCLPP_USE_ROCM=ON \
-        ${{ if parameters.buildTests }}-DMSCCLPP_BUILD_TESTS=ON${{ endif }} \
-        ${{ if ne(parameters.gpuArch, '') }}-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}${{ endif }} \
-        ${{ parameters.cmakeArgs }} ..
-      ${{ else }}
-      cmake \
-        -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-        -DMSCCLPP_BYPASS_GPU_CHECK=ON \
-        -DMSCCLPP_USE_CUDA=ON \
-        ${{ if parameters.buildTests }}-DMSCCLPP_BUILD_TESTS=ON${{ endif }} \
-        ${{ if ne(parameters.gpuArch, '') }}-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}${{ endif }} \
-        ${{ parameters.cmakeArgs }} ..
-      ${{ endif }}
+      BUILD_TESTS_ARG=""
+      if [ "${{ parameters.buildTests }}" = "true" ]; then
+        BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON"
+      fi
+
+      GPU_ARCH_ARG=""
+      if [ -n "${{ parameters.gpuArch }}" ]; then
+        GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
+      fi
+
+      CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
+      if [ "${{ parameters.platform }}" = "rocm" ]; then
+        CXX=/opt/rocm/bin/hipcc cmake \
+          -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
+          -DMSCCLPP_BYPASS_GPU_CHECK=ON \
+          -DMSCCLPP_USE_ROCM=ON \
+          ${BUILD_TESTS_ARG} \
+          ${GPU_ARCH_ARG} \
+          ${CMAKE_EXTRA_ARGS} ..
+      else
+        cmake \
+          -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
+          -DMSCCLPP_BYPASS_GPU_CHECK=ON \
+          -DMSCCLPP_USE_CUDA=ON \
+          ${BUILD_TESTS_ARG} \
+          ${GPU_ARCH_ARG} \
+          ${CMAKE_EXTRA_ARGS} ..
+      fi
       make -j
       cd ..
       pwd > build/BUILD_PREFIX

From c40a233f55ed793a6030d27ddee19ce944ecec07 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 7 Mar 2026 02:48:08 +0000
Subject: [PATCH 063/132] fix

---
 .azure-pipelines/templates/deploy.yaml | 15 ++++++++++++++-
 .azure-pipelines/templates/stop.yaml   | 13 +++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yaml
index dc686fa61..0bafa09c0 100644
--- a/.azure-pipelines/templates/deploy.yaml
+++ b/.azure-pipelines/templates/deploy.yaml
@@ -34,6 +34,20 @@ parameters:
   default: ''
 
 steps:
+# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
+- task: Bash@3
+  name: EnsureAzureCLI
+  displayName: Ensure Azure CLI Installed
+  inputs:
+    targetType: inline
+    script: |
+      set -e
+      if command -v az >/dev/null 2>&1; then
+        az version >/dev/null
+        exit 0
+      fi
+      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+
 # 1. Check VMSS availability (fast, fail-fast)
 - task: AzureCLI@2
   name: CheckVMSS
@@ -115,7 +129,6 @@ steps:
     script: |
       sudo apt-get update -y
       sudo apt-get install pssh -y
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
 
 - task: AzureCLI@2
   name: StartVMSS
diff --git a/.azure-pipelines/templates/stop.yaml b/.azure-pipelines/templates/stop.yaml
index 40498c290..777150abd 100644
--- a/.azure-pipelines/templates/stop.yaml
+++ b/.azure-pipelines/templates/stop.yaml
@@ -8,6 +8,19 @@ parameters:
   default: mscclpp
 
 steps:
+- task: Bash@3
+  name: EnsureAzureCLI
+  displayName: Ensure Azure CLI Installed
+  inputs:
+    targetType: inline
+    script: |
+      set -e
+      if command -v az >/dev/null 2>&1; then
+        az version >/dev/null
+        exit 0
+      fi
+      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+
 - task: AzureCLI@2
   name: StopVMSS
   displayName: Deallocate VMSS

From 375bc1383117f5d4b70f8b81a3d33094d65e7813 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 7 Mar 2026 02:53:54 +0000
Subject: [PATCH 064/132] fix

---
 .azure-pipelines/templates/stop.yaml | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/.azure-pipelines/templates/stop.yaml b/.azure-pipelines/templates/stop.yaml
index 777150abd..40498c290 100644
--- a/.azure-pipelines/templates/stop.yaml
+++ b/.azure-pipelines/templates/stop.yaml
@@ -8,19 +8,6 @@ parameters:
   default: mscclpp
 
 steps:
-- task: Bash@3
-  name: EnsureAzureCLI
-  displayName: Ensure Azure CLI Installed
-  inputs:
-    targetType: inline
-    script: |
-      set -e
-      if command -v az >/dev/null 2>&1; then
-        az version >/dev/null
-        exit 0
-      fi
-      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-
 - task: AzureCLI@2
   name: StopVMSS
   displayName: Deallocate VMSS

From bcb392ffdf024401a9d2cdc2503063fd7a6fe823 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 8 Mar 2026 03:33:51 +0000
Subject: [PATCH 065/132] updates

---
 .azure-pipelines/templates/deploy.yaml        | 29 +-------
 .github/workflows/integration-test-backup.yml | 69 -------------------
 .github/workflows/ut-backup.yml               | 52 --------------
 3 files changed, 3 insertions(+), 147 deletions(-)
 delete mode 100644 .github/workflows/integration-test-backup.yml
 delete mode 100644 .github/workflows/ut-backup.yml

diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yaml
index 0bafa09c0..2e6ccc512 100644
--- a/.azure-pipelines/templates/deploy.yaml
+++ b/.azure-pipelines/templates/deploy.yaml
@@ -48,30 +48,7 @@ steps:
       fi
       curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
 
-# 1. Check VMSS availability (fast, fail-fast)
-- task: AzureCLI@2
-  name: CheckVMSS
-  displayName: Check VMSS Availability
-  inputs:
-    azureSubscription: ${{ parameters.subscription }}
-    scriptType: bash
-    scriptLocation: inlineScript
-    inlineScript: |
-      set -e
-      INSTANCES=$(az vmss list-instances --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} -o json)
-      COUNT=$(echo "$INSTANCES" | jq 'length')
-      if [ "$COUNT" -eq 0 ]; then
-        echo "##vso[task.logissue type=error]No VMSS instances found for ${{ parameters.vmssName }}"
-        exit 1
-      fi
-      FAILED=$(echo "$INSTANCES" | jq '[.[] | select(.provisioningState == "Failed")] | length')
-      if [ "$FAILED" -gt 0 ]; then
-        echo "##vso[task.logissue type=error]$FAILED VMSS instance(s) in Failed state"
-        exit 1
-      fi
-      echo "VMSS ${{ parameters.vmssName }}: $COUNT instance(s) available"
-
-# 2. Build
+# 1. Build
 - task: Bash@3
   name: ${{ parameters.buildName }}
   displayName: ${{ parameters.buildDisplayName }}
@@ -114,7 +91,7 @@ steps:
       pwd > build/BUILD_PREFIX
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-# 3. Download SSH key + install packages + start VMSS
+# 2. Download SSH key + install packages + start VMSS
 - task: DownloadSecureFile@1
   name: SshKeyFile
   displayName: Download key file
@@ -140,7 +117,7 @@ steps:
     inlineScript: |
       az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
 
-# 4. Deploy test environment
+# 3. Deploy test environment
 - task: Bash@3
   name: DeployTestEnv
   displayName: Deploy Test Env
diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml
deleted file mode 100644
index 900e8aba2..000000000
--- a/.github/workflows/integration-test-backup.yml
+++ /dev/null
@@ -1,69 +0,0 @@
-name: IntegrationTest
-
-on: workflow_dispatch
-
-jobs:
-  IntegrationTest:
-    runs-on: [ self-hosted, A100 ]
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        cuda: [ cuda11.8, cuda12.2 ]
-
-    container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
-      options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Build
-        run: |
-          mkdir build && cd build
-          cmake -DCMAKE_BUILD_TYPE=Release ..
-          make -j
-
-      - name: Lock GPU clock frequency
-        run: |
-          sudo nvidia-smi -pm 1
-          for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
-            sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
-          done
-
-      - name: Run mscclpp AllGather test
-        run: |
-          set -e
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
-
-      - name: Run mscclpp SendRecv test
-        run: |
-          set -e
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
-
-      - name: Run mscclpp AllReduce test
-        run: |
-          set -e
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
-
-      - name: Run mscclpp AllToAll test
-        run: |
-          set -e
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
-          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
-
-      - name: Check collective primitives performance
-        run: |
-          set -e
-          python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml
deleted file mode 100644
index 8849c353e..000000000
--- a/.github/workflows/ut-backup.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-name: UnitTest
-
-on: workflow_dispatch
-
-jobs:
-  UnitTest:
-    runs-on: [ self-hosted, A100 ]
-    defaults:
-      run:
-        shell: bash
-    timeout-minutes: 30
-    strategy:
-      matrix:
-        cuda: [ cuda11.8, cuda12.2 ]
-
-    container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
-      options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Build
-        run: |
-          mkdir build && cd build
-          cmake -DCMAKE_BUILD_TYPE=Release ..
-          make -j
-        working-directory: ${{ github.workspace }}
-
-      - name: LockGPUClock
-        run: |
-          sudo nvidia-smi -pm 1
-          for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
-            sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
-          done
-
-      - name: UnitTests
-        run: |
-          ./build/bin/unit_tests
-
-      - name: MpUnitTests
-        run: |
-          set -e
-          mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
-          mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
-          mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
-
-      - name: PyTests
-        run: |
-          set -e
-          mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x

From ea1dd651268c59180d52f989fc71dbac1b3ca091 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 8 Mar 2026 04:05:58 +0000
Subject: [PATCH 066/132] fix

---
 test/deploy/run-remote.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
index ca393cca1..ee25b6b87 100755
--- a/test/deploy/run-remote.sh
+++ b/test/deploy/run-remote.sh
@@ -61,6 +61,7 @@ if [ $# -eq 0 ]; then
     exit 1
 fi
 CMD="$*"
+CMD_B64=$(printf '%s' "$CMD" | base64 | tr -d '\n')
 
 PSSH_TARGET_ARGS=()
 if [ -n "$TARGET_HOST" ]; then
@@ -89,8 +90,8 @@ fi
 
 if $USE_DOCKER; then
     parallel-ssh -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" -o . \
-        -O "$SSH_OPTION" "sudo docker exec -t mscclpp-test bash -c \"set -ex; pushd /root/mscclpp >/dev/null; trap 'popd >/dev/null' EXIT; ${CMD}\""
+        -O "$SSH_OPTION" "sudo docker exec -t mscclpp-test bash -c \"set -ex; pushd /root/mscclpp >/dev/null; trap 'popd >/dev/null' EXIT; CMD_B64='${CMD_B64}'; eval \\\"\\\$(printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d)\\\"\""
 else
     parallel-ssh -i -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" \
-        -O "$SSH_OPTION" "set -ex; ${CMD}"
+        -O "$SSH_OPTION" "set -ex; CMD_B64='${CMD_B64}'; eval \"\$(printf '%s' \"\$CMD_B64\" | base64 -d)\""
 fi

From d6a6fa2ffa7f11b0fb6453df399cdaf8888fde14 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 8 Mar 2026 05:31:48 +0000
Subject: [PATCH 067/132] simplified

---
 .azure-pipelines/multi-nodes-test.yml         |  60 +++++----
 .azure-pipelines/templates/codecov.yaml       |  55 ++++----
 .../templates/integration-test.yaml           | 118 +++++++-----------
 .azure-pipelines/templates/nccl-test.yaml     |  98 +++++++--------
 .azure-pipelines/templates/rccl-test.yaml     |  63 ++++------
 .../templates/run-remote-task.yaml            |  27 ++++
 .azure-pipelines/templates/ut-no-ib-env.yaml  |  78 ++++++------
 .azure-pipelines/templates/ut-npkit.yaml      |  72 +++++------
 .azure-pipelines/templates/ut.yaml            |  49 +++-----
 test/deploy/run-remote.sh                     |  53 ++++----
 10 files changed, 319 insertions(+), 354 deletions(-)
 create mode 100644 .azure-pipelines/templates/run-remote-task.yaml

diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index 994b87ee7..643b4351b 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -56,41 +56,37 @@ jobs:
       vmssName:      mscclit-vmss
       resourceGroup: msccl-IT
 
-  - task: Bash@3
-    name: RunMscclppTest
-    displayName: Run multi-nodes mscclpp-test
-    inputs:
-      targetType: 'inline'
-      script: |
-        test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
-          "bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test"
+  - template: templates/run-remote-task.yaml
+    parameters:
+      name: RunMscclppTest
+      displayName: Run multi-nodes mscclpp-test
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
 
-  - task: Bash@3
-    name: RunMultiNodeUnitTest
-    displayName: Run multi-nodes unit tests
-    inputs:
-      targetType: 'inline'
-      script: |
-        test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
-          "bash /root/mscclpp/test/deploy/run_tests.sh mp-ut"
+  - template: templates/run-remote-task.yaml
+    parameters:
+      name: RunMultiNodeUnitTest
+      displayName: Run multi-nodes unit tests
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
 
-  - task: Bash@3
-    name: RunMultiNodePythonTests
-    displayName: Run multi-nodes python tests
-    inputs:
-      targetType: 'inline'
-      script: |
-        test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
-          "bash /root/mscclpp/test/deploy/run_tests.sh pytests"
+  - template: templates/run-remote-task.yaml
+    parameters:
+      name: RunMultiNodePythonTests
+      displayName: Run multi-nodes python tests
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh pytests
 
-  - task: Bash@3
-    name: RunMultiNodePythonBenchmark
-    displayName: Run multi-nodes python benchmark
-    inputs:
-      targetType: 'inline'
-      script: |
-        test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
-          "bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark"
+  - template: templates/run-remote-task.yaml
+    parameters:
+      name: RunMultiNodePythonBenchmark
+      displayName: Run multi-nodes python benchmark
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
+      remoteScript: |
+        bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
 
   - template: templates/stop.yaml
     parameters:
diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index b82da6bbd..1392601b8 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -22,33 +22,34 @@ steps:
     buildName:        BuildCoverage
     deployArgs:       'single-node-test true ${{ parameters.platform }}'
 
-- task: Bash@3
-  name: TestsCoverageNonPerf
-  displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        BUILD_PREFIX=\$(cat build/BUILD_PREFIX); \
-        STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c); \
-        export GCOV_PREFIX=/root/mscclpp; \
-        export GCOV_PREFIX_STRIP=\$STRIP_COUNT; \
-        ./build/bin/unit_tests; \
-        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests; \
-        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests; \
-        lcov --version; \
-        LCOV_CAPTURE_ARGS=""; \
-        if lcov --help 2>&1 | grep -q "inconsistent"; then \
-          LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"; \
-        fi; \
-        lcov --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}; \
-        if [ ! -s coverage.info ]; then \
-          echo "ERROR: coverage.info was not generated. Tests may have failed before coverage capture or produced no gcov data."; \
-          exit 1; \
-        fi; \
-        lcov --extract coverage.info "\${BUILD_PREFIX}/src/*" "\${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info; \
-        lcov --list coverage.info'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: TestsCoverageNonPerf
+    displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
+    remoteScript: |
+      BUILD_PREFIX=$(cat build/BUILD_PREFIX)
+      STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
+      export GCOV_PREFIX=/root/mscclpp
+      export GCOV_PREFIX_STRIP=$STRIP_COUNT
+
+      ./build/bin/unit_tests
+      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
+      mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
+
+      lcov --version
+      LCOV_CAPTURE_ARGS=""
+      if lcov --help 2>&1 | grep -q "inconsistent"; then
+        LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"
+      fi
+
+      lcov --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
+      if [ ! -s coverage.info ]; then
+        echo "ERROR: coverage.info was not generated. Tests may have failed before coverage capture or produced no gcov data."
+        exit 1
+      fi
+
+      lcov --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
+      lcov --list coverage.info
 
 - task: Bash@3
   name: FetchCoverage
diff --git a/.azure-pipelines/templates/integration-test.yaml b/.azure-pipelines/templates/integration-test.yaml
index acbb710ff..790854669 100644
--- a/.azure-pipelines/templates/integration-test.yaml
+++ b/.azure-pipelines/templates/integration-test.yaml
@@ -17,82 +17,58 @@ steps:
     gpuArch:          ${{ parameters.gpuArch }}
     deployArgs:       'single-node-test'
 
-- task: Bash@3
-  name: AllGatherTest
-  displayName: Run mscclpp AllGather test
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        set -e; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: AllGatherTest
+    displayName: Run mscclpp AllGather test
+    remoteScript: |
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
 
-- task: Bash@3
-  name: SendRecvTest
-  displayName: Run mscclpp SendRecv test
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        set -e; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: SendRecvTest
+    displayName: Run mscclpp SendRecv test
+    remoteScript: |
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
 
-- task: Bash@3
-  name: AllReduceTest
-  displayName: Run mscclpp AllReduce test
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        set -e; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: AllReduceTest
+    displayName: Run mscclpp AllReduce test
+    remoteScript: |
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
 
-- task: Bash@3
-  name: AllToAll
-  displayName: Run mscclpp AllToAll test
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        set -e; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
-        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: AllToAll
+    displayName: Run mscclpp AllToAll test
+    remoteScript: |
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
 
-- task: Bash@3
-  name: CheckPerfNumber
-  displayName: Check collective primitives performance
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        set -e; \
-        python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: CheckPerfNumber
+    displayName: Check collective primitives performance
+    remoteScript: |
+      python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
 
-- task: Bash@3
-  name: PythonAllReduceBenchmark
-  displayName: Python Allreduce Benchmark
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        set -e; \
-        python3 -m pip install .; \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: PythonAllReduceBenchmark
+    displayName: Python Allreduce Benchmark
+    remoteScript: |
+      python3 -m pip install .
+      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
 
 - template: stop.yaml
   parameters:
diff --git a/.azure-pipelines/templates/nccl-test.yaml b/.azure-pipelines/templates/nccl-test.yaml
index c6260e761..c41d4bc19 100644
--- a/.azure-pipelines/templates/nccl-test.yaml
+++ b/.azure-pipelines/templates/nccl-test.yaml
@@ -21,64 +21,54 @@ steps:
     vmssName:         ${{ parameters.vmssName }}
     deployArgs:       'nccltest-single-node'
 
-- task: Bash@3
-  name: InstallNcclTests
-  displayName: Install NCCL Tests
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
-        cd nccl-tests; \
-        MPI=1 MPI_HOME=/usr/local/mpi make -j'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: InstallNcclTests
+    displayName: Install NCCL Tests
+    remoteScript: |
+      cd
+      git clone https://github.com/NVIDIA/nccl-tests.git
+      cd nccl-tests
+      MPI=1 MPI_HOME=/usr/local/mpi make -j
 
-- task: Bash@3
-  name: InstallNccl
-  displayName: Install NCCL
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        LATEST_TAG=\$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\\\" -f4); \
-        if [ -z \"\$LATEST_TAG\" ]; then echo \"Failed to fetch latest NCCL tag\"; exit 1; fi; \
-        cd; git clone --branch \$LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git; \
-        cd nccl; \
-        make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: InstallNccl
+    displayName: Install NCCL
+    remoteScript: |
+      LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4)
+      if [ -z "$LATEST_TAG" ]; then
+        echo "Failed to fetch latest NCCL tag"
+        exit 1
+      fi
+      cd
+      git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git
+      cd nccl
+      make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
 
-- task: Bash@3
-  name: RunNcclAllGatherFallbaclkToNcclTest
-  displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: RunNcclAllGatherFallbaclkToNcclTest
+    displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
+    remoteScript: |
+      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
+      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
-- task: Bash@3
-  name: RunNcclAllReduceFallbaclkToNcclTest
-  displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: RunNcclAllReduceFallbaclkToNcclTest
+    displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
+    remoteScript: |
+      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
+      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
-- task: Bash@3
-  name: RunNcclBroadcastFallbaclkToNcclTest
-  displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: RunNcclBroadcastFallbaclkToNcclTest
+    displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
+    remoteScript: |
+      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
+      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
 - template: stop.yaml
   parameters:
diff --git a/.azure-pipelines/templates/rccl-test.yaml b/.azure-pipelines/templates/rccl-test.yaml
index 7be3f9936..15c69066b 100644
--- a/.azure-pipelines/templates/rccl-test.yaml
+++ b/.azure-pipelines/templates/rccl-test.yaml
@@ -27,44 +27,35 @@ steps:
     deployArgs:       'single-node-test true rocm'
 
 
-- task: Bash@3
-  name: InstallRcclTests
-  displayName: Install RCCL Tests
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        cd; \
-        git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \
-        cd rocm-systems; \
-        git sparse-checkout init --cone; \
-        git sparse-checkout set projects/rccl-tests; \
-        git checkout; \
-        cd projects/rccl-tests; \
-        MPI=1 MPI_HOME=/usr/local/mpi make -j'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: InstallRcclTests
+    displayName: Install RCCL Tests
+    remoteScript: |
+      cd
+      git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git
+      cd rocm-systems
+      git sparse-checkout init --cone
+      git sparse-checkout set projects/rccl-tests
+      git checkout
+      cd projects/rccl-tests
+      MPI=1 MPI_HOME=/usr/local/mpi make -j
 
-- task: Bash@3
-  name: RunRcclAllGatherTest
-  displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
-        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: RunRcclAllGatherTest
+    displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
+    remoteScript: |
+      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
+      mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
-- task: Bash@3
-  name: RunRcclAllReduceTest
-  displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
-        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: RunRcclAllReduceTest
+    displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
+    remoteScript: |
+      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
+      mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
 - template: stop.yaml
   parameters:
diff --git a/.azure-pipelines/templates/run-remote-task.yaml b/.azure-pipelines/templates/run-remote-task.yaml
new file mode 100644
index 000000000..37b3a7d7e
--- /dev/null
+++ b/.azure-pipelines/templates/run-remote-task.yaml
@@ -0,0 +1,27 @@
+parameters:
+- name: name
+  type: string
+  default: ''
+- name: displayName
+  type: string
+- name: runRemoteArgs
+  type: string
+  default: ''
+- name: remoteScript
+  type: string
+- name: workingDirectory
+  type: string
+  default: '$(System.DefaultWorkingDirectory)'
+
+steps:
+- task: Bash@3
+  ${{ if ne(parameters.name, '') }}:
+    name: ${{ parameters.name }}
+  displayName: ${{ parameters.displayName }}
+  inputs:
+    targetType: 'inline'
+    script: |
+      test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD'
+      ${{ parameters.remoteScript }}
+      REMOTE_CMD
+    workingDirectory: ${{ parameters.workingDirectory }}
diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml
index 6514fbc3c..956436d53 100644
--- a/.azure-pipelines/templates/ut-no-ib-env.yaml
+++ b/.azure-pipelines/templates/ut-no-ib-env.yaml
@@ -15,46 +15,43 @@ steps:
     cmakeArgs:        '-DMSCCLPP_USE_IB=OFF'
     deployArgs:       'single-node-test false'
 
-- task: Bash@3
-  name: UnitTests
-  displayName: Run mscclpp unit tests
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        ./build/bin/unit_tests'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: UnitTests
+    displayName: Run mscclpp unit tests
+    remoteScript: |
+      ./build/bin/unit_tests
 
-- task: Bash@3
-  name: MpUnitTests
-  displayName: Run mscclpp multi-process unit tests
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
-        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
-        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: MpUnitTests
+    displayName: Run mscclpp multi-process unit tests
+    remoteScript: |
+      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
+      mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
+      mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
 
-- task: Bash@3
-  name: PyTests
-  displayName: Run pytests
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: PyTests
+    displayName: Run pytests
+    remoteScript: |
+      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
+
+- template: run-remote-task.yaml
+  parameters:
+    name: StopContainer
+    displayName: Stop existing container
+    runRemoteArgs: '--no-docker --no-log'
+    remoteScript: |
+      sudo docker stop mscclpp-test || true
+      sudo docker rm mscclpp-test || true
 
 - task: Bash@3
-  name: StopContainer
-  displayName: Stop existing container
+  displayName: Remove generated SSH key files
   inputs:
     targetType: 'inline'
     script: |
-      test/deploy/run-remote.sh --no-docker --no-log \
-        "sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true"
       rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
     workingDirectory: '$(System.DefaultWorkingDirectory)'
   
@@ -85,15 +82,12 @@ steps:
     arguments: single-node-test false
     workingDirectory: $(System.DefaultWorkingDirectory)
 
-- task: Bash@3
-  name: PyTestsWithIbBuildDisableIb
-  displayName: Run pytests (IB build, IB tests disabled)
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: PyTestsWithIbBuildDisableIb
+    displayName: Run pytests (IB build, IB tests disabled)
+    remoteScript: |
+      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
 
 - template: stop.yaml
   parameters:
diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yaml
index 46749db7c..2897a489c 100644
--- a/.azure-pipelines/templates/ut-npkit.yaml
+++ b/.azure-pipelines/templates/ut-npkit.yaml
@@ -16,46 +16,40 @@ steps:
     cmakeArgs:        '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
     deployArgs:       'single-node-test'
 
-- task: Bash@3
-  name: MpUnitTests
-  displayName: Run mscclpp multi-process unit tests
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
-        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
-        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter=\"ExecutorTest.TwoNodesAllreduce\"; \
-        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
-        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: MpUnitTests
+    displayName: Run mscclpp multi-process unit tests
+    remoteScript: |
+      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
+      export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
+      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce"
+      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
+      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
 
-- task: Bash@3
-  name: PyTests
-  displayName: Run pytests
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
-        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k '"'"'test_executor[allreduce.json'"'"'; \
-        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
-        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
-        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k '"'"'test_executor[allreduce_packet.json'"'"'; \
-        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
-        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
-        grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: PyTests
+    displayName: Run pytests
+    remoteScript: |
+      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
+      export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
+      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'
+      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
+      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
+      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
+      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'
+      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
+      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
 
 - template: stop.yaml
   parameters:
diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml
index 4bc1a9aec..c828783df 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yaml
@@ -19,37 +19,28 @@ steps:
     deployArgs:       'single-node-test true ${{ parameters.platform }}'
 
 
-- task: Bash@3
-  name: UnitTests
-  displayName: Run mscclpp unit tests
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        ./build/bin/unit_tests'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: UnitTests
+    displayName: Run mscclpp unit tests
+    remoteScript: |
+      ./build/bin/unit_tests
 
-- task: Bash@3
-  name: MpUnitTests
-  displayName: Run mscclpp multi-process unit tests
-  inputs:
-    targetType: 'inline'
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
-        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
-        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: MpUnitTests
+    displayName: Run mscclpp multi-process unit tests
+    remoteScript: |
+      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
+      mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
+      mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
 
-- task: Bash@3
-  name: PyTests
-  displayName: Run pytests
-  inputs:
-    targetType: inline
-    script: |
-      test/deploy/run-remote.sh '\
-        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yaml
+  parameters:
+    name: PyTests
+    displayName: Run pytests
+    remoteScript: |
+      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
 
 - template: stop.yaml
   parameters:
diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
index ee25b6b87..bdb7c0ba3 100755
--- a/test/deploy/run-remote.sh
+++ b/test/deploy/run-remote.sh
@@ -3,7 +3,7 @@
 # By default, runs inside the mscclpp-test docker container.
 #
 # Usage:
-#   run-remote.sh [OPTIONS] <command>
+#   run-remote.sh [OPTIONS] < <command_script>
 #
 # Options:
 #   --no-docker   Run command directly on the host, not inside docker
@@ -24,31 +24,35 @@ USE_LOG=true
 TARGET_HOST=""
 REMOTE_USER=""
 
+usage() {
+    echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] < <command_script>" >&2
+}
+
+require_value() {
+    local opt="$1"
+    local val="$2"
+    if [ -z "$val" ]; then
+        echo "Missing value for ${opt}" >&2
+        exit 1
+    fi
+}
+
 while [[ "$1" == --* ]]; do
     case "$1" in
         --no-docker) USE_DOCKER=false; shift ;;
         --no-log)    USE_LOG=false; shift ;;
         --hostfile)
-            if [ -z "$2" ]; then
-                echo "Missing value for --hostfile" >&2
-                exit 1
-            fi
+            require_value "--hostfile" "${2-}"
             HOSTFILE="$2"
             shift 2
             ;;
         --host)
-            if [ -z "$2" ]; then
-                echo "Missing value for --host" >&2
-                exit 1
-            fi
+            require_value "--host" "${2-}"
             TARGET_HOST="$2"
             shift 2
             ;;
         --user)
-            if [ -z "$2" ]; then
-                echo "Missing value for --user" >&2
-                exit 1
-            fi
+            require_value "--user" "${2-}"
             REMOTE_USER="$2"
             shift 2
             ;;
@@ -56,11 +60,16 @@ while [[ "$1" == --* ]]; do
     esac
 done
 
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 [--no-docker] [--no-log] <command>" >&2
+if [ $# -ne 0 ] || [ -t 0 ]; then
+    usage
+    exit 1
+fi
+
+CMD=$(cat)
+if [ -z "$CMD" ]; then
+    usage
     exit 1
 fi
-CMD="$*"
 CMD_B64=$(printf '%s' "$CMD" | base64 | tr -d '\n')
 
 PSSH_TARGET_ARGS=()
@@ -76,12 +85,8 @@ if [ -n "$REMOTE_USER" ]; then
 fi
 
 if $USE_LOG; then
-    if [ -n "$TARGET_HOST" ]; then
-        HOST="$TARGET_HOST"
-    else
-        HOST=$(head -1 "${HOSTFILE}")
-        HOST="${HOST##*@}"
-    fi
+    HOST="${TARGET_HOST:-$(head -1 "${HOSTFILE}")}"
+    HOST="${HOST##*@}"
     : > "${HOST}"
     tail -f "${HOST}" &
     CHILD_PID=$!
@@ -90,8 +95,8 @@ fi
 
 if $USE_DOCKER; then
     parallel-ssh -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" -o . \
-        -O "$SSH_OPTION" "sudo docker exec -t mscclpp-test bash -c \"set -ex; pushd /root/mscclpp >/dev/null; trap 'popd >/dev/null' EXIT; CMD_B64='${CMD_B64}'; eval \\\"\\\$(printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d)\\\"\""
+    -O "$SSH_OPTION" "sudo docker exec -t mscclpp-test bash -c \"set -euxo pipefail; pushd /root/mscclpp >/dev/null; trap 'popd >/dev/null' EXIT; CMD_B64='${CMD_B64}'; printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail\""
 else
     parallel-ssh -i -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" \
-        -O "$SSH_OPTION" "set -ex; CMD_B64='${CMD_B64}'; eval \"\$(printf '%s' \"\$CMD_B64\" | base64 -d)\""
+        -O "$SSH_OPTION" "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"
 fi

From a9cf93863f261040172e21e70f3cfce750ea7b8b Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 9 Mar 2026 23:49:54 +0000
Subject: [PATCH 068/132] fix

---
 test/deploy/run-remote.sh | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
index bdb7c0ba3..89679ed9a 100755
--- a/test/deploy/run-remote.sh
+++ b/test/deploy/run-remote.sh
@@ -93,10 +93,24 @@ if $USE_LOG; then
     trap "kill $CHILD_PID 2>/dev/null" EXIT
 fi
 
+PSSH_COMMON=(
+    -t 0
+    "${PSSH_TARGET_ARGS[@]}"
+    "${PSSH_USER_ARGS[@]}"
+    -x "-i ${KeyFilePath}"
+    -O "$SSH_OPTION"
+)
+
 if $USE_DOCKER; then
-    parallel-ssh -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" -o . \
-    -O "$SSH_OPTION" "sudo docker exec -t mscclpp-test bash -c \"set -euxo pipefail; pushd /root/mscclpp >/dev/null; trap 'popd >/dev/null' EXIT; CMD_B64='${CMD_B64}'; printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail\""
+    INNER="set -euxo pipefail;"
+    INNER+=" cd /root/mscclpp;"
+    INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;"
+    INNER+=" CMD_B64='${CMD_B64}';"
+    INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
+
+    parallel-ssh "${PSSH_COMMON[@]}" -o . \
+        "sudo docker exec -t mscclpp-test bash -c \"${INNER}\""
 else
-    parallel-ssh -i -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" \
-        -O "$SSH_OPTION" "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"
+    parallel-ssh -i "${PSSH_COMMON[@]}" \
+        "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"
 fi

From 6647338fb430e9c46c17c436a7226b0523538c84 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 10 Mar 2026 17:50:04 +0000
Subject: [PATCH 069/132] debugging

---
 test/deploy/run-remote.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
index 89679ed9a..c30634eba 100755
--- a/test/deploy/run-remote.sh
+++ b/test/deploy/run-remote.sh
@@ -108,6 +108,9 @@ if $USE_DOCKER; then
     INNER+=" CMD_B64='${CMD_B64}';"
     INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
 
+    FULL_CMD="sudo docker exec -t mscclpp-test bash -c \"${INNER}\""
+    echo "[run-remote.sh] executing: ${FULL_CMD}" >&2
+
     parallel-ssh "${PSSH_COMMON[@]}" -o . \
         "sudo docker exec -t mscclpp-test bash -c \"${INNER}\""
 else

From 7a87c2c856a3f8c9b77cef0425dc44c00b48a04d Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 10 Mar 2026 20:51:22 +0000
Subject: [PATCH 070/132] debugging

---
 test/deploy/run-remote.sh | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
index c30634eba..a631a391c 100755
--- a/test/deploy/run-remote.sh
+++ b/test/deploy/run-remote.sh
@@ -84,15 +84,6 @@ if [ -n "$REMOTE_USER" ]; then
     PSSH_USER_ARGS=(-l "$REMOTE_USER")
 fi
 
-if $USE_LOG; then
-    HOST="${TARGET_HOST:-$(head -1 "${HOSTFILE}")}"
-    HOST="${HOST##*@}"
-    : > "${HOST}"
-    tail -f "${HOST}" &
-    CHILD_PID=$!
-    trap "kill $CHILD_PID 2>/dev/null" EXIT
-fi
-
 PSSH_COMMON=(
     -t 0
     "${PSSH_TARGET_ARGS[@]}"
@@ -108,10 +99,7 @@ if $USE_DOCKER; then
     INNER+=" CMD_B64='${CMD_B64}';"
     INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
 
-    FULL_CMD="sudo docker exec -t mscclpp-test bash -c \"${INNER}\""
-    echo "[run-remote.sh] executing: ${FULL_CMD}" >&2
-
-    parallel-ssh "${PSSH_COMMON[@]}" -o . \
+    parallel-ssh -i "${PSSH_COMMON[@]}" \
         "sudo docker exec -t mscclpp-test bash -c \"${INNER}\""
 else
     parallel-ssh -i "${PSSH_COMMON[@]}" \

From cf505d777ae92175a7fa2d7789cc88682583e3e5 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 10 Mar 2026 22:18:41 +0000
Subject: [PATCH 071/132] debugging

---
 .azure-pipelines/templates/codecov.yaml |  5 +++++
 .azure-pipelines/templates/deploy.yaml  |  3 +++
 test/deploy/deploy.sh                   | 19 ++++++++++++++++---
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index 1392601b8..f912db4cb 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -27,6 +27,11 @@ steps:
     name: TestsCoverageNonPerf
     displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
     remoteScript: |
+      echo "=== build/bin/ contents ==="
+      ls -la build/bin/ 2>&1 || echo "ERROR: build/bin/ not found"
+      echo "=== build/ top-level ==="
+      ls build/ 2>&1 || echo "ERROR: build/ not found"
+
       BUILD_PREFIX=$(cat build/BUILD_PREFIX)
       STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
       export GCOV_PREFIX=/root/mscclpp
diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yaml
index 2e6ccc512..df998497b 100644
--- a/.azure-pipelines/templates/deploy.yaml
+++ b/.azure-pipelines/templates/deploy.yaml
@@ -89,6 +89,9 @@ steps:
       make -j
       cd ..
       pwd > build/BUILD_PREFIX
+      echo "=== Build artifacts ==="
+      ls -la build/bin/ || echo "ERROR: build/bin/ missing after build"
+      du -sh build/bin/* 2>/dev/null || true
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 # 2. Download SSH key + install packages + start VMSS
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index b26ff1a85..915a37ebc 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -1,4 +1,4 @@
-set -e
+set -ex
 
 TEST_NAME=$1
 IB_ENVIRONMENT="${2:-true}"
@@ -32,8 +32,21 @@ while true; do
 done
 
 set -e
-parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
-parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
+# Transfer workspace to remote hosts via tar+ssh (more reliable than parallel-scp for large files)
+while IFS= read -r HOST; do
+  HOST_ADDR="${HOST##*@}"
+  HOST_USER="${HOST%%@*}"
+  if [ "${HOST_USER}" = "${HOST_ADDR}" ]; then
+    HOST_USER=""
+  fi
+  SSH_DEST="${HOST}"
+  echo "Deploying to ${SSH_DEST}..."
+  ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${SSH_DEST} "sudo rm -rf ${DST_DIR} && mkdir -p ${DST_DIR}"
+  tar cf - -C "$(dirname "${ROOT_DIR}")" "$(basename "${ROOT_DIR}")" | \
+    ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${SSH_DEST} "tar xf - -C ${DST_DIR} --strip-components=1"
+  echo "Verifying transfer to ${SSH_DEST}..."
+  ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${SSH_DEST} "ls ${DST_DIR}/build/bin/ 2>&1 || echo 'ERROR: build/bin/ missing after transfer'"
+done < ${HOSTFILE}
 
 if [ "${PLATFORM}" == "rocm" ]; then
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"

From 757c0ecc6ac752bd98b183c20a7d7564efa21995 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 11 Mar 2026 01:00:12 +0000
Subject: [PATCH 072/132] debugging

---
 test/deploy/deploy.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index 915a37ebc..edfe7dbd8 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -33,7 +33,7 @@ done
 
 set -e
 # Transfer workspace to remote hosts via tar+ssh (more reliable than parallel-scp for large files)
-while IFS= read -r HOST; do
+while IFS= read -r HOST || [ -n "$HOST" ]; do
   HOST_ADDR="${HOST##*@}"
   HOST_USER="${HOST%%@*}"
   if [ "${HOST_USER}" = "${HOST_ADDR}" ]; then

From e2a5be467d39f8bd62b022852c70efe98ee2cbcb Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 11 Mar 2026 02:40:50 +0000
Subject: [PATCH 073/132] debugging

---
 .azure-pipelines/templates/deploy.yaml |  4 ++--
 test/deploy/deploy.sh                  | 17 ++---------------
 2 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yaml
index df998497b..1da3ce3ba 100644
--- a/.azure-pipelines/templates/deploy.yaml
+++ b/.azure-pipelines/templates/deploy.yaml
@@ -17,8 +17,8 @@ parameters:
   type: string
   default: 'Release'
 - name: buildTests
-  type: boolean
-  default: true
+  type: string
+  default: 'true'
 - name: cmakeArgs
   type: string
   default: ''
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index edfe7dbd8..1f1d0e524 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -32,21 +32,8 @@ while true; do
 done
 
 set -e
-# Transfer workspace to remote hosts via tar+ssh (more reliable than parallel-scp for large files)
-while IFS= read -r HOST || [ -n "$HOST" ]; do
-  HOST_ADDR="${HOST##*@}"
-  HOST_USER="${HOST%%@*}"
-  if [ "${HOST_USER}" = "${HOST_ADDR}" ]; then
-    HOST_USER=""
-  fi
-  SSH_DEST="${HOST}"
-  echo "Deploying to ${SSH_DEST}..."
-  ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${SSH_DEST} "sudo rm -rf ${DST_DIR} && mkdir -p ${DST_DIR}"
-  tar cf - -C "$(dirname "${ROOT_DIR}")" "$(basename "${ROOT_DIR}")" | \
-    ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${SSH_DEST} "tar xf - -C ${DST_DIR} --strip-components=1"
-  echo "Verifying transfer to ${SSH_DEST}..."
-  ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${SSH_DEST} "ls ${DST_DIR}/build/bin/ 2>&1 || echo 'ERROR: build/bin/ missing after transfer'"
-done < ${HOSTFILE}
+parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
+parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
 
 if [ "${PLATFORM}" == "rocm" ]; then
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"

From 2a705f52e11fbb503dec07e39aa20f0759512a60 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 11 Mar 2026 20:38:54 +0000
Subject: [PATCH 074/132] fix merge

---
 test/mp_unit/switch_channel_tests.cu | 63 ++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/test/mp_unit/switch_channel_tests.cu b/test/mp_unit/switch_channel_tests.cu
index 710fd84a8..6d913c649 100644
--- a/test/mp_unit/switch_channel_tests.cu
+++ b/test/mp_unit/switch_channel_tests.cu
@@ -23,6 +23,8 @@ void SwitchChannelTest::SetUp() {
 void SwitchChannelTest::TearDown() { CommunicatorTestBase::TearDown(); }
 
 __constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
+__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1;
+__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2;
 
 __global__ void kernelSwitchReduce() {
 #if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
@@ -31,6 +33,15 @@ __global__ void kernelSwitchReduce() {
 #endif  // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
 }
 
+__global__ void kernelSwitchReduceTwo() {
+#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
+  auto val1 = gConstSwitchChan1.reduce<mscclpp::f32x1>(0);
+  gConstSwitchChan1.broadcast(0, val1);
+  auto val2 = gConstSwitchChan2.reduce<mscclpp::f32x1>(0);
+  gConstSwitchChan2.broadcast(0, val2);
+#endif  // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
+}
+
 TEST(SwitchChannelTest, SimpleAllReduce) {
   if (gEnv->rank >= numRanksToUse) return;
 
@@ -71,3 +82,55 @@ TEST(SwitchChannelTest, SimpleAllReduce) {
   }
   ASSERT_EQ(result, expected);
 }
+
+TEST(SwitchChannelTest, TwoChannelsSameConnection) {
+  if (gEnv->rank >= numRanksToUse) return;
+
+  std::vector<int> ranks;
+  for (int i = 0; i < numRanksToUse; i++) {
+    ranks.push_back(i);
+  }
+
+  const size_t bufSize = 1024;
+  auto buffer1 = mscclpp::GpuBuffer<float>(bufSize / sizeof(float));
+  auto buffer2 = mscclpp::GpuBuffer<float>(bufSize / sizeof(float));
+  float data1 = (gEnv->rank + 1.0f) * 1.0f;
+  float data2 = (gEnv->rank + 1.0f) * 10.0f;
+  MSCCLPP_CUDATHROW(cudaMemcpy(buffer1.data(), &data1, sizeof(data1), cudaMemcpyHostToDevice));
+  MSCCLPP_CUDATHROW(cudaMemcpy(buffer2.data(), &data2, sizeof(data2), cudaMemcpyHostToDevice));
+
+  const size_t connSize = buffer1.bytes() + buffer2.bytes();
+  auto nvlsConnection = mscclpp::connectNvlsCollective(communicator, ranks, connSize);
+
+  auto switchChannel1 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer1.data()), bufSize);
+  auto switchChannel2 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer2.data()), bufSize);
+
+  auto deviceHandle1 = switchChannel1.deviceHandle();
+  auto deviceHandle2 = switchChannel2.deviceHandle();
+
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan1, &deviceHandle1, sizeof(deviceHandle1)));
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan2, &deviceHandle2, sizeof(deviceHandle2)));
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+
+  communicator->bootstrap()->barrier();
+
+  if (gEnv->rank == 0) {
+    kernelSwitchReduceTwo<<<1, 1>>>();
+    MSCCLPP_CUDATHROW(cudaGetLastError());
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+  }
+  communicator->bootstrap()->barrier();
+
+  float result1, result2;
+  MSCCLPP_CUDATHROW(cudaMemcpy(&result1, buffer1.data(), sizeof(result1), cudaMemcpyDeviceToHost));
+  MSCCLPP_CUDATHROW(cudaMemcpy(&result2, buffer2.data(), sizeof(result2), cudaMemcpyDeviceToHost));
+
+  float expected1 = 0.0f;
+  float expected2 = 0.0f;
+  for (int i = 0; i < numRanksToUse; i++) {
+    expected1 += (i + 1.0f) * 1.0f;
+    expected2 += (i + 1.0f) * 10.0f;
+  }
+  ASSERT_EQ(result1, expected1);
+  ASSERT_EQ(result2, expected2);
+}

From e2a96926749453359ec56fa0f420d0ce95cf1326 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 11 Mar 2026 21:04:45 +0000
Subject: [PATCH 075/132] fix merge

---
 test/mp_unit/switch_channel_tests.cu | 67 ----------------------------
 1 file changed, 67 deletions(-)

diff --git a/test/mp_unit/switch_channel_tests.cu b/test/mp_unit/switch_channel_tests.cu
index e3c31f1dd..6d913c649 100644
--- a/test/mp_unit/switch_channel_tests.cu
+++ b/test/mp_unit/switch_channel_tests.cu
@@ -134,70 +134,3 @@ TEST(SwitchChannelTest, TwoChannelsSameConnection) {
   ASSERT_EQ(result1, expected1);
   ASSERT_EQ(result2, expected2);
 }
-
-__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1;
-__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2;
-
-__global__ void kernelSwitchReduceTwo() {
-#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
-  auto val1 = gConstSwitchChan1.reduce<mscclpp::f32x1>(0);
-  gConstSwitchChan1.broadcast(0, val1);
-  auto val2 = gConstSwitchChan2.reduce<mscclpp::f32x1>(0);
-  gConstSwitchChan2.broadcast(0, val2);
-#endif  // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
-}
-
-TEST_F(SwitchChannelTest, TwoChannelsSameConnection) {
-  if (gEnv->rank >= numRanksToUse) return;
-
-  std::vector<int> ranks;
-  for (int i = 0; i < numRanksToUse; i++) {
-    ranks.push_back(i);
-  }
-
-  const size_t bufSize = 1024;
-  auto buffer1 = mscclpp::GpuBuffer<float>(bufSize / sizeof(float));
-  auto buffer2 = mscclpp::GpuBuffer<float>(bufSize / sizeof(float));
-  float data1 = (gEnv->rank + 1.0f) * 1.0f;
-  float data2 = (gEnv->rank + 1.0f) * 10.0f;
-  MSCCLPP_CUDATHROW(cudaMemcpy(buffer1.data(), &data1, sizeof(data1), cudaMemcpyHostToDevice));
-  MSCCLPP_CUDATHROW(cudaMemcpy(buffer2.data(), &data2, sizeof(data2), cudaMemcpyHostToDevice));
-
-  // Connection size must be large enough for two granularity-aligned buffers.
-  // The multicast granularity is typically 2MB, so we need at least 2 * 2MB.
-  const size_t connSize = buffer1.bytes() + buffer2.bytes();
-  auto nvlsConnection = mscclpp::connectNvlsCollective(communicator, ranks, connSize);
-
-  // Bind two separate buffers to the same connection
-  auto switchChannel1 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer1.data()), bufSize);
-  auto switchChannel2 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer2.data()), bufSize);
-
-  auto deviceHandle1 = switchChannel1.deviceHandle();
-  auto deviceHandle2 = switchChannel2.deviceHandle();
-
-  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan1, &deviceHandle1, sizeof(deviceHandle1)));
-  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan2, &deviceHandle2, sizeof(deviceHandle2)));
-  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-
-  communicator->bootstrap()->barrier();
-
-  if (gEnv->rank == 0) {
-    kernelSwitchReduceTwo<<<1, 1>>>();
-    MSCCLPP_CUDATHROW(cudaGetLastError());
-    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
-  }
-  communicator->bootstrap()->barrier();
-
-  float result1, result2;
-  MSCCLPP_CUDATHROW(cudaMemcpy(&result1, buffer1.data(), sizeof(result1), cudaMemcpyDeviceToHost));
-  MSCCLPP_CUDATHROW(cudaMemcpy(&result2, buffer2.data(), sizeof(result2), cudaMemcpyDeviceToHost));
-
-  float expected1 = 0.0f;
-  float expected2 = 0.0f;
-  for (int i = 0; i < numRanksToUse; i++) {
-    expected1 += (i + 1.0f) * 1.0f;
-    expected2 += (i + 1.0f) * 10.0f;
-  }
-  ASSERT_EQ(result1, expected1) << "Channel1: expected " << expected1 << " but got " << result1;
-  ASSERT_EQ(result2, expected2) << "Channel2: expected " << expected2 << " but got " << result2;
-}

From 2c4bab8359ac48bb675e2ba349525bb2442afb73 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 16 Mar 2026 18:37:57 +0000
Subject: [PATCH 076/132] fix

---
 .azure-pipelines/templates/codecov.yaml | 2 +-
 test/CMakeLists.txt                     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index f912db4cb..83845d9d5 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -68,7 +68,7 @@ steps:
       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
       HOST=$(head -1 ${HOSTFILE})
       ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
-        'sudo docker cp mscclpp-test:/root/mscclpp/build/coverage.info /tmp/coverage.info'
+        'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
       scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a7c1417c9..82b799dca 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -16,6 +16,7 @@ if(MSCCLPP_USE_ROCM)
     foreach(arch ${MSCCLPP_GPU_ARCHS})
         add_compile_options(--offload-arch=${arch})
     endforeach()
+    add_compile_definitions(__HIP_PLATFORM_AMD__)
 endif()
 
 function(add_test_executable name sources)

From a937ce4a8dc25f658274566de32c99a1f423aaf9 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 16 Mar 2026 20:35:46 +0000
Subject: [PATCH 077/132] debugging

---
 .azure-pipelines/templates/codecov.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index 83845d9d5..abffca6e7 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -55,6 +55,8 @@ steps:
 
       lcov --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
       lcov --list coverage.info
+      echo "=== coverage.info location ==="
+      ls -la $(pwd)/coverage.info
 
 - task: Bash@3
   name: FetchCoverage
@@ -67,6 +69,8 @@ steps:
       SSH_OPTION="StrictHostKeyChecking=no"
       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
       HOST=$(head -1 ${HOSTFILE})
+      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
+        'sudo docker exec mscclpp-test ls -la /root/mscclpp/coverage.info 2>&1 || echo "NOT FOUND in container"; sudo docker exec mscclpp-test find /root/mscclpp -name coverage.info 2>/dev/null || true'
       ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
         'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
       scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info

From d66d7e47436eabf5d83303e5ff567461e574aab4 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 17 Mar 2026 01:41:40 +0000
Subject: [PATCH 078/132] debugging

---
 .azure-pipelines/templates/codecov.yaml | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index abffca6e7..5075b7259 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -27,11 +27,6 @@ steps:
     name: TestsCoverageNonPerf
     displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
     remoteScript: |
-      echo "=== build/bin/ contents ==="
-      ls -la build/bin/ 2>&1 || echo "ERROR: build/bin/ not found"
-      echo "=== build/ top-level ==="
-      ls build/ 2>&1 || echo "ERROR: build/ not found"
-
       BUILD_PREFIX=$(cat build/BUILD_PREFIX)
       STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
       export GCOV_PREFIX=/root/mscclpp
@@ -41,6 +36,13 @@ steps:
       mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
       mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
 
+- template: run-remote-task.yaml
+  parameters:
+    name: CaptureCoverage
+    displayName: Capture coverage data with lcov
+    remoteScript: |
+      BUILD_PREFIX=$(cat build/BUILD_PREFIX)
+
       lcov --version
       LCOV_CAPTURE_ARGS=""
       if lcov --help 2>&1 | grep -q "inconsistent"; then
@@ -49,14 +51,13 @@ steps:
 
       lcov --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
       if [ ! -s coverage.info ]; then
-        echo "ERROR: coverage.info was not generated. Tests may have failed before coverage capture or produced no gcov data."
+        echo "ERROR: coverage.info was not generated."
         exit 1
       fi
 
       lcov --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
       lcov --list coverage.info
-      echo "=== coverage.info location ==="
-      ls -la $(pwd)/coverage.info
+      ls -la coverage.info
 
 - task: Bash@3
   name: FetchCoverage
@@ -69,8 +70,6 @@ steps:
       SSH_OPTION="StrictHostKeyChecking=no"
       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
       HOST=$(head -1 ${HOSTFILE})
-      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
-        'sudo docker exec mscclpp-test ls -la /root/mscclpp/coverage.info 2>&1 || echo "NOT FOUND in container"; sudo docker exec mscclpp-test find /root/mscclpp -name coverage.info 2>/dev/null || true'
       ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
         'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
       scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info

From 5a65cc7aba6c8bcd68b531fb67f314f36b599a87 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 17 Mar 2026 20:00:34 +0000
Subject: [PATCH 079/132] debugging

---
 .azure-pipelines/templates/codecov.yaml | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index 5075b7259..973e6d072 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -35,6 +35,7 @@ steps:
       ./build/bin/unit_tests
       mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
       mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
+      echo "Done"
 
 - template: run-remote-task.yaml
   parameters:
@@ -43,19 +44,29 @@ steps:
     remoteScript: |
       BUILD_PREFIX=$(cat build/BUILD_PREFIX)
 
+      # On ROCm, hipcc (Clang) generates coverage data incompatible with GCC's gcov.
+      # Use llvm-cov gcov via a wrapper so lcov can read the data.
+      GCOV_TOOL_ARG=""
+      if command -v llvm-cov >/dev/null 2>&1; then
+        GCOV_WRAPPER=$(mktemp)
+        printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER"
+        chmod +x "$GCOV_WRAPPER"
+        GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}"
+      fi
+
       lcov --version
       LCOV_CAPTURE_ARGS=""
       if lcov --help 2>&1 | grep -q "inconsistent"; then
         LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"
       fi
 
-      lcov --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
+      lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
       if [ ! -s coverage.info ]; then
         echo "ERROR: coverage.info was not generated."
         exit 1
       fi
 
-      lcov --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
+      lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
       lcov --list coverage.info
       ls -la coverage.info
 

From 2297a3deda9849c6ed85b8b10b29deb3c3831c48 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Mar 2026 00:58:08 +0000
Subject: [PATCH 080/132] updates

---
 .azure-pipelines/templates/codecov.yaml | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yaml
index 973e6d072..03d392e36 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yaml
@@ -32,10 +32,17 @@ steps:
       export GCOV_PREFIX=/root/mscclpp
       export GCOV_PREFIX_STRIP=$STRIP_COUNT
 
+      echo "Running unit_tests..."
       ./build/bin/unit_tests
+      echo "unit_tests: PASSED"
+
+      echo "Running mp_unit_tests -np 2..."
       mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
+      echo "mp_unit_tests -np 2: PASSED"
+
+      echo "Running mp_unit_tests -np 4..."
       mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
-      echo "Done"
+      echo "mp_unit_tests -np 4: PASSED"
 
 - template: run-remote-task.yaml
   parameters:
@@ -44,10 +51,9 @@ steps:
     remoteScript: |
       BUILD_PREFIX=$(cat build/BUILD_PREFIX)
 
-      # On ROCm, hipcc (Clang) generates coverage data incompatible with GCC's gcov.
-      # Use llvm-cov gcov via a wrapper so lcov can read the data.
       GCOV_TOOL_ARG=""
-      if command -v llvm-cov >/dev/null 2>&1; then
+      if [ "${{ parameters.platform }}" = "rocm" ]; then
+        apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1
         GCOV_WRAPPER=$(mktemp)
         printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER"
         chmod +x "$GCOV_WRAPPER"

From 275622159c5d5097c8d220c96b6dfe4825e7a0ae Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Mar 2026 02:32:21 +0000
Subject: [PATCH 081/132] update

---
 test/deploy/run-remote.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
index a631a391c..b646ea92e 100755
--- a/test/deploy/run-remote.sh
+++ b/test/deploy/run-remote.sh
@@ -100,7 +100,7 @@ if $USE_DOCKER; then
     INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
 
     parallel-ssh -i "${PSSH_COMMON[@]}" \
-        "sudo docker exec -t mscclpp-test bash -c \"${INNER}\""
+        "sudo docker exec mscclpp-test bash -c \"${INNER}\""
 else
     parallel-ssh -i "${PSSH_COMMON[@]}" \
         "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"

From bff76d5b85516211f518d5d71a86d226c2a8608c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 18 Mar 2026 19:44:11 +0000
Subject: [PATCH 082/132] Fix TearDown() handling and replace assert() in perf
 tests

Address review comments:
1. Ensure TearDown() is always called if SetUp() succeeds, even when
   TestBody() throws. This prevents resource leaks and maintains MPI
   synchronization between tests.
2. Replace assert() in fifo_perf_tests.cu with proper return false
   on validation failure, ensuring consistent test failure reporting.

Fixes:
- test/framework.cc: Track SetUp success and call TearDown in finally-style
- test/unit/fifo_perf_tests.cu: Replace assert with explicit check

Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
---
 test/framework.cc            | 21 ++++++++++++++++++++-
 test/unit/fifo_perf_tests.cu |  4 +++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/test/framework.cc b/test/framework.cc
index 392bc770f..73cf1272e 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -220,11 +220,12 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
 
     TestCase* testCase = nullptr;
     bool testSkipped = false;
+    bool setUpSucceeded = false;
     try {
       testCase = entry.factory();
       testCase->SetUp();
+      setUpSucceeded = true;
       testCase->TestBody();
-      testCase->TearDown();
     } catch (const SkipException& e) {
       gCurrentTestPassed = true;
       testSkipped = true;
@@ -243,6 +244,24 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       }
     }
 
+    // Always call TearDown() if SetUp() succeeded, even if TestBody() threw
+    if (setUpSucceeded && testCase != nullptr) {
+      try {
+        testCase->TearDown();
+      } catch (const std::exception& e) {
+        // If test already failed, keep original failure message
+        if (gCurrentTestPassed) {
+          gCurrentTestPassed = false;
+          gCurrentTestFailureMessage = std::string("TearDown() failed: ") + e.what();
+        }
+      } catch (...) {
+        if (gCurrentTestPassed) {
+          gCurrentTestPassed = false;
+          gCurrentTestFailureMessage = "TearDown() failed with unknown exception";
+        }
+      }
+    }
+
     delete testCase;
     gCurrentTestName.clear();
 
diff --git a/test/unit/fifo_perf_tests.cu b/test/unit/fifo_perf_tests.cu
index 9a28591b3..34b5d6bc6 100644
--- a/test/unit/fifo_perf_tests.cu
+++ b/test/unit/fifo_perf_tests.cu
@@ -45,7 +45,9 @@ static bool consumePerfTriggers(std::unique_ptr<mscclpp::Fifo>& hostFifo, int nu
 
     trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
     trigger.snd = trigger.snd ^ trigger.fst;
-    assert(triggerCounts[trigger.snd] + 1 == trigger.fst);
+    if (triggerCounts[trigger.snd] + 1 != trigger.fst) {
+      return false;  // Validation failed
+    }
     triggerCounts[trigger.snd]++;
     hostFifo->pop();
   }

From 6082648f80d083d945d2665ecc797a6a30616f47 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Mar 2026 20:06:37 +0000
Subject: [PATCH 083/132] fix for npkit

---
 .azure-pipelines/templates/deploy.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yaml
index 1da3ce3ba..fc116acfb 100644
--- a/.azure-pipelines/templates/deploy.yaml
+++ b/.azure-pipelines/templates/deploy.yaml
@@ -70,7 +70,7 @@ steps:
 
       CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
       if [ "${{ parameters.platform }}" = "rocm" ]; then
-        CXX=/opt/rocm/bin/hipcc cmake \
+        eval CXX=/opt/rocm/bin/hipcc cmake \
           -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
           -DMSCCLPP_BYPASS_GPU_CHECK=ON \
           -DMSCCLPP_USE_ROCM=ON \
@@ -78,7 +78,7 @@ steps:
           ${GPU_ARCH_ARG} \
           ${CMAKE_EXTRA_ARGS} ..
       else
-        cmake \
+        eval cmake \
           -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
           -DMSCCLPP_BYPASS_GPU_CHECK=ON \
           -DMSCCLPP_USE_CUDA=ON \

From 79a014976da2551d5b130250342f3583353d003c Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 18 Mar 2026 20:30:18 +0000
Subject: [PATCH 084/132] updates

---
 .azure-pipelines/codecov.yml                     |  6 +++---
 .azure-pipelines/integration-test.yml            |  4 ++--
 .azure-pipelines/multi-nodes-test.yml            | 12 ++++++------
 .../{nccl-api-test.yaml => nccl-api-test.yml}    |  4 ++--
 .azure-pipelines/rccl-api-test.yml               |  2 +-
 .../templates/{codecov.yaml => codecov.yml}      |  8 ++++----
 .../templates/{deploy.yaml => deploy.yml}        |  0
 ...ntegration-test.yaml => integration-test.yml} | 16 ++++++++--------
 .../templates/{nccl-test.yaml => nccl-test.yml}  | 16 ++++++++--------
 .../templates/{rccl-test.yaml => rccl-test.yml}  | 12 ++++++------
 ...{run-remote-task.yaml => run-remote-task.yml} |  0
 .../templates/{stop.yaml => stop.yml}            |  0
 .../{ut-no-ib-env.yaml => ut-no-ib-env.yml}      | 14 +++++++-------
 .../templates/{ut-npkit.yaml => ut-npkit.yml}    |  8 ++++----
 .azure-pipelines/templates/{ut.yaml => ut.yml}   | 10 +++++-----
 .azure-pipelines/ut.yml                          | 12 ++++++------
 .../workflows/{doc-build.yaml => doc-build.yml}  |  0
 README.md                                        | 12 ++++++------
 18 files changed, 68 insertions(+), 68 deletions(-)
 rename .azure-pipelines/{nccl-api-test.yaml => nccl-api-test.yml} (93%)
 rename .azure-pipelines/templates/{codecov.yaml => codecov.yml} (97%)
 rename .azure-pipelines/templates/{deploy.yaml => deploy.yml} (100%)
 rename .azure-pipelines/templates/{integration-test.yaml => integration-test.yml} (93%)
 rename .azure-pipelines/templates/{nccl-test.yaml => nccl-test.yml} (94%)
 rename .azure-pipelines/templates/{rccl-test.yaml => rccl-test.yml} (92%)
 rename .azure-pipelines/templates/{run-remote-task.yaml => run-remote-task.yml} (100%)
 rename .azure-pipelines/templates/{stop.yaml => stop.yml} (100%)
 rename .azure-pipelines/templates/{ut-no-ib-env.yaml => ut-no-ib-env.yml} (92%)
 rename .azure-pipelines/templates/{ut-npkit.yaml => ut-npkit.yml} (96%)
 rename .azure-pipelines/templates/{ut.yaml => ut.yml} (89%)
 rename .github/workflows/{doc-build.yaml => doc-build.yml} (100%)

diff --git a/.azure-pipelines/codecov.yml b/.azure-pipelines/codecov.yml
index ea006a636..c4abeaa78 100644
--- a/.azure-pipelines/codecov.yml
+++ b/.azure-pipelines/codecov.yml
@@ -43,7 +43,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/codecov.yaml
+  - template: templates/codecov.yml
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
@@ -64,7 +64,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/codecov.yaml
+  - template: templates/codecov.yml
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
@@ -85,7 +85,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/codecov.yaml
+  - template: templates/codecov.yml
     parameters:
       subscription:     mscclpp-ci-mi300x
       vmssName:         mscclpp-mi300x-ci
diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index d7479b87c..d5d5f9bde 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -41,7 +41,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/integration-test.yaml
+  - template: templates/integration-test.yml
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
@@ -60,7 +60,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/integration-test.yaml
+  - template: templates/integration-test.yml
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index 643b4351b..d49248791 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -50,13 +50,13 @@ jobs:
           echo "Entry already exists, nothing to do."
         fi
 
-  - template: templates/deploy.yaml
+  - template: templates/deploy.yml
     parameters:
       subscription:  msccl-it
       vmssName:      mscclit-vmss
       resourceGroup: msccl-IT
 
-  - template: templates/run-remote-task.yaml
+  - template: templates/run-remote-task.yml
     parameters:
       name: RunMscclppTest
       displayName: Run multi-nodes mscclpp-test
@@ -64,7 +64,7 @@ jobs:
       remoteScript: |
         bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
 
-  - template: templates/run-remote-task.yaml
+  - template: templates/run-remote-task.yml
     parameters:
       name: RunMultiNodeUnitTest
       displayName: Run multi-nodes unit tests
@@ -72,7 +72,7 @@ jobs:
       remoteScript: |
         bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
 
-  - template: templates/run-remote-task.yaml
+  - template: templates/run-remote-task.yml
     parameters:
       name: RunMultiNodePythonTests
       displayName: Run multi-nodes python tests
@@ -80,7 +80,7 @@ jobs:
       remoteScript: |
         bash /root/mscclpp/test/deploy/run_tests.sh pytests
 
-  - template: templates/run-remote-task.yaml
+  - template: templates/run-remote-task.yml
     parameters:
       name: RunMultiNodePythonBenchmark
       displayName: Run multi-nodes python benchmark
@@ -88,7 +88,7 @@ jobs:
       remoteScript: |
         bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
 
-  - template: templates/stop.yaml
+  - template: templates/stop.yml
     parameters:
       subscription:  msccl-it
       vmssName:      mscclit-vmss
diff --git a/.azure-pipelines/nccl-api-test.yaml b/.azure-pipelines/nccl-api-test.yml
similarity index 93%
rename from .azure-pipelines/nccl-api-test.yaml
rename to .azure-pipelines/nccl-api-test.yml
index 275f45a3d..cc0174120 100644
--- a/.azure-pipelines/nccl-api-test.yaml
+++ b/.azure-pipelines/nccl-api-test.yml
@@ -40,7 +40,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/nccl-test.yaml
+  - template: templates/nccl-test.yml
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
@@ -60,7 +60,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/nccl-test.yaml
+  - template: templates/nccl-test.yml
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
diff --git a/.azure-pipelines/rccl-api-test.yml b/.azure-pipelines/rccl-api-test.yml
index dda6e93a9..43841079e 100644
--- a/.azure-pipelines/rccl-api-test.yml
+++ b/.azure-pipelines/rccl-api-test.yml
@@ -40,7 +40,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/rccl-test.yaml
+  - template: templates/rccl-test.yml
     parameters:
       subscription:     mscclpp-ci-mi300x
       vmssName:         mscclpp-mi300x-ci
diff --git a/.azure-pipelines/templates/codecov.yaml b/.azure-pipelines/templates/codecov.yml
similarity index 97%
rename from .azure-pipelines/templates/codecov.yaml
rename to .azure-pipelines/templates/codecov.yml
index 03d392e36..08797351a 100644
--- a/.azure-pipelines/templates/codecov.yaml
+++ b/.azure-pipelines/templates/codecov.yml
@@ -10,7 +10,7 @@ parameters:
   type: string
 
 steps:
-- template: deploy.yaml
+- template: deploy.yml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -22,7 +22,7 @@ steps:
     buildName:        BuildCoverage
     deployArgs:       'single-node-test true ${{ parameters.platform }}'
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: TestsCoverageNonPerf
     displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
@@ -44,7 +44,7 @@ steps:
       mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
       echo "mp_unit_tests -np 4: PASSED"
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: CaptureCoverage
     displayName: Capture coverage data with lcov
@@ -104,7 +104,7 @@ steps:
       ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-- template: stop.yaml
+- template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/deploy.yaml b/.azure-pipelines/templates/deploy.yml
similarity index 100%
rename from .azure-pipelines/templates/deploy.yaml
rename to .azure-pipelines/templates/deploy.yml
diff --git a/.azure-pipelines/templates/integration-test.yaml b/.azure-pipelines/templates/integration-test.yml
similarity index 93%
rename from .azure-pipelines/templates/integration-test.yaml
rename to .azure-pipelines/templates/integration-test.yml
index 790854669..b686e4f21 100644
--- a/.azure-pipelines/templates/integration-test.yaml
+++ b/.azure-pipelines/templates/integration-test.yml
@@ -10,14 +10,14 @@ parameters:
   type: string
 
 steps:
-- template: deploy.yaml
+- template: deploy.yml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
     gpuArch:          ${{ parameters.gpuArch }}
     deployArgs:       'single-node-test'
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: AllGatherTest
     displayName: Run mscclpp AllGather test
@@ -27,14 +27,14 @@ steps:
       mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
       mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: SendRecvTest
     displayName: Run mscclpp SendRecv test
     remoteScript: |
       mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: AllReduceTest
     displayName: Run mscclpp AllReduce test
@@ -47,7 +47,7 @@ steps:
       mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
       mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: AllToAll
     displayName: Run mscclpp AllToAll test
@@ -55,14 +55,14 @@ steps:
       mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
       mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: CheckPerfNumber
     displayName: Check collective primitives performance
     remoteScript: |
       python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: PythonAllReduceBenchmark
     displayName: Python Allreduce Benchmark
@@ -70,7 +70,7 @@ steps:
       python3 -m pip install .
       mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
 
-- template: stop.yaml
+- template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
\ No newline at end of file
diff --git a/.azure-pipelines/templates/nccl-test.yaml b/.azure-pipelines/templates/nccl-test.yml
similarity index 94%
rename from .azure-pipelines/templates/nccl-test.yaml
rename to .azure-pipelines/templates/nccl-test.yml
index c41d4bc19..211e2393a 100644
--- a/.azure-pipelines/templates/nccl-test.yaml
+++ b/.azure-pipelines/templates/nccl-test.yml
@@ -1,4 +1,4 @@
-# .azure-pipelines/templates/nccl-test.yaml
+# .azure-pipelines/templates/nccl-test.yml
 # ----------------------------------------
 # A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
 #
@@ -15,13 +15,13 @@ parameters:
   default: "-gencode=arch=compute_80,code=sm_80"
 
 steps:
-- template: deploy.yaml
+- template: deploy.yml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
     deployArgs:       'nccltest-single-node'
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: InstallNcclTests
     displayName: Install NCCL Tests
@@ -31,7 +31,7 @@ steps:
       cd nccl-tests
       MPI=1 MPI_HOME=/usr/local/mpi make -j
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: InstallNccl
     displayName: Install NCCL
@@ -46,7 +46,7 @@ steps:
       cd nccl
       make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: RunNcclAllGatherFallbaclkToNcclTest
     displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
@@ -54,7 +54,7 @@ steps:
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: RunNcclAllReduceFallbaclkToNcclTest
     displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
@@ -62,7 +62,7 @@ steps:
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: RunNcclBroadcastFallbaclkToNcclTest
     displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
@@ -70,7 +70,7 @@ steps:
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
-- template: stop.yaml
+- template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/rccl-test.yaml b/.azure-pipelines/templates/rccl-test.yml
similarity index 92%
rename from .azure-pipelines/templates/rccl-test.yaml
rename to .azure-pipelines/templates/rccl-test.yml
index 15c69066b..8e2471614 100644
--- a/.azure-pipelines/templates/rccl-test.yaml
+++ b/.azure-pipelines/templates/rccl-test.yml
@@ -1,4 +1,4 @@
-# .azure-pipelines/templates/rccl-test.yaml
+# .azure-pipelines/templates/rccl-test.yml
 # ------------------------------------------------
 # A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
 #
@@ -17,7 +17,7 @@ parameters:
   default: "gfx942"
 
 steps:
-- template: deploy.yaml
+- template: deploy.yml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -27,7 +27,7 @@ steps:
     deployArgs:       'single-node-test true rocm'
 
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: InstallRcclTests
     displayName: Install RCCL Tests
@@ -41,7 +41,7 @@ steps:
       cd projects/rccl-tests
       MPI=1 MPI_HOME=/usr/local/mpi make -j
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: RunRcclAllGatherTest
     displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
@@ -49,7 +49,7 @@ steps:
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
       mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: RunRcclAllReduceTest
     displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
@@ -57,7 +57,7 @@ steps:
       mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
       mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
 
-- template: stop.yaml
+- template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/run-remote-task.yaml b/.azure-pipelines/templates/run-remote-task.yml
similarity index 100%
rename from .azure-pipelines/templates/run-remote-task.yaml
rename to .azure-pipelines/templates/run-remote-task.yml
diff --git a/.azure-pipelines/templates/stop.yaml b/.azure-pipelines/templates/stop.yml
similarity index 100%
rename from .azure-pipelines/templates/stop.yaml
rename to .azure-pipelines/templates/stop.yml
diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yml
similarity index 92%
rename from .azure-pipelines/templates/ut-no-ib-env.yaml
rename to .azure-pipelines/templates/ut-no-ib-env.yml
index 956436d53..a62f1a77a 100644
--- a/.azure-pipelines/templates/ut-no-ib-env.yaml
+++ b/.azure-pipelines/templates/ut-no-ib-env.yml
@@ -7,7 +7,7 @@ parameters:
   type: string
 
 steps:
-- template: deploy.yaml
+- template: deploy.yml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -15,14 +15,14 @@ steps:
     cmakeArgs:        '-DMSCCLPP_USE_IB=OFF'
     deployArgs:       'single-node-test false'
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: UnitTests
     displayName: Run mscclpp unit tests
     remoteScript: |
       ./build/bin/unit_tests
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: MpUnitTests
     displayName: Run mscclpp multi-process unit tests
@@ -31,14 +31,14 @@ steps:
       mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
       mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: PyTests
     displayName: Run pytests
     remoteScript: |
       mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: StopContainer
     displayName: Stop existing container
@@ -82,14 +82,14 @@ steps:
     arguments: single-node-test false
     workingDirectory: $(System.DefaultWorkingDirectory)
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: PyTestsWithIbBuildDisableIb
     displayName: Run pytests (IB build, IB tests disabled)
     remoteScript: |
       mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
 
-- template: stop.yaml
+- template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yml
similarity index 96%
rename from .azure-pipelines/templates/ut-npkit.yaml
rename to .azure-pipelines/templates/ut-npkit.yml
index 2897a489c..e53b5cf59 100644
--- a/.azure-pipelines/templates/ut-npkit.yaml
+++ b/.azure-pipelines/templates/ut-npkit.yml
@@ -8,7 +8,7 @@ parameters:
 
 
 steps:
-- template: deploy.yaml
+- template: deploy.yml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -16,7 +16,7 @@ steps:
     cmakeArgs:        '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
     deployArgs:       'single-node-test'
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: MpUnitTests
     displayName: Run mscclpp multi-process unit tests
@@ -30,7 +30,7 @@ steps:
       grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
       grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: PyTests
     displayName: Run pytests
@@ -51,7 +51,7 @@ steps:
       grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
       grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
 
-- template: stop.yaml
+- template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yml
similarity index 89%
rename from .azure-pipelines/templates/ut.yaml
rename to .azure-pipelines/templates/ut.yml
index c828783df..9d17e9235 100644
--- a/.azure-pipelines/templates/ut.yaml
+++ b/.azure-pipelines/templates/ut.yml
@@ -10,7 +10,7 @@ parameters:
   type: string
 
 steps:
-- template: deploy.yaml
+- template: deploy.yml
   parameters:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
@@ -19,14 +19,14 @@ steps:
     deployArgs:       'single-node-test true ${{ parameters.platform }}'
 
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: UnitTests
     displayName: Run mscclpp unit tests
     remoteScript: |
       ./build/bin/unit_tests
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: MpUnitTests
     displayName: Run mscclpp multi-process unit tests
@@ -35,14 +35,14 @@ steps:
       mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
       mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
 
-- template: run-remote-task.yaml
+- template: run-remote-task.yml
   parameters:
     name: PyTests
     displayName: Run pytests
     remoteScript: |
       mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
 
-- template: stop.yaml
+- template: stop.yml
   parameters:
     subscription: ${{ parameters.subscription }}
     vmssName:     ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index e6590abb1..4e6f96b1c 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -43,7 +43,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut.yaml
+  - template: templates/ut.yml
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
@@ -64,7 +64,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut-npkit.yaml
+  - template: templates/ut-npkit.yml
     parameters:
       subscription:     mscclpp-ci
       vmssName:         mscclpp-ci
@@ -83,7 +83,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut.yaml
+  - template: templates/ut.yml
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
@@ -102,7 +102,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut-npkit.yaml
+  - template: templates/ut-npkit.yml
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
@@ -123,7 +123,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut-no-ib-env.yaml
+  - template: templates/ut-no-ib-env.yml
     parameters:
       subscription:     mscclpp-ci-h100
       vmssName:         mscclpp-h100-ci
@@ -142,7 +142,7 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/ut.yaml
+  - template: templates/ut.yml
     parameters:
       subscription:     mscclpp-ci-mi300x
       vmssName:         mscclpp-mi300x-ci
diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yml
similarity index 100%
rename from .github/workflows/doc-build.yaml
rename to .github/workflows/doc-build.yml
diff --git a/README.md b/README.md
index c7dd91c69..58586a309 100644
--- a/README.md
+++ b/README.md
@@ -3,16 +3,16 @@
 [![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest)
 [![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
 [![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
-[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/)
+[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/)
 [![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp)
 
 | Testing Pipelines        | Build Status      |
 |--------------------------|-------------------|
-| Unit Tests (CUDA)        | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
-| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
-| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
-| NCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=320665&branchName=main) |
-| RCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=448013&branchName=main) |
+| Unit Tests (CUDA)        | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
+| Unit Tests (ROCm)        | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
+| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
+| NCCL Tests               | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) |
+| RCCL Tests               | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) |
 
 A GPU-driven communication stack for scalable AI applications.
 

From 67f9933ba13da5baea941bab92a114398bafa69e Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 10:20:43 +0000
Subject: [PATCH 085/132] fix data direct

---
 .github/copilot-instructions.md     |   2 +-
 cmake/FindGDRCopy.cmake             |   2 +
 cmake/FindMLX5.cmake                |   1 +
 include/mscclpp/atomic_device.hpp   |   5 +-
 src/core/connection.cc              | 196 +++++++++-------------
 src/core/endpoint.cc                |  21 ---
 src/core/gdr.cc                     | 114 ++++++++++---
 src/core/ib.cc                      | 117 +++++++------
 src/core/ibverbs_wrapper.cc         |  43 ++++-
 src/core/include/connection.hpp     |  56 +++----
 src/core/include/endpoint.hpp       |   8 -
 src/core/include/gdr.hpp            |  57 ++-----
 src/core/include/ib.hpp             |  16 +-
 src/core/include/mlx5dv_wrapper.hpp |  12 +-
 src/core/mlx5dv_wrapper.cc          |  39 +++--
 src/core/semaphore.cc               |  21 ++-
 test/framework.cc                   |   3 +
 test/mp_unit/ib_tests.cu            | 120 ++++++++++---
 test/mp_unit/port_channel_tests.cu  |  19 +++
 test/unit/CMakeLists.txt            |   1 +
 test/unit/gdr_tests.cu              | 251 ++++++++++++++++++++++++++++
 21 files changed, 737 insertions(+), 367 deletions(-)
 create mode 100644 test/unit/gdr_tests.cu

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 4f13c557d..9d7e7798c 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -43,7 +43,7 @@ For testing after successful build:
 # To run tests with two GPUs - two is enough for most tests
 mpirun -np 2 ./build/bin/mp_unit_tests
 # To run tests excluding IB-related ones (when IB is not available)
-mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
+mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib*
 ```
 
 For building a Python package:
diff --git a/cmake/FindGDRCopy.cmake b/cmake/FindGDRCopy.cmake
index e62f32f2b..54e0ba1c6 100644
--- a/cmake/FindGDRCopy.cmake
+++ b/cmake/FindGDRCopy.cmake
@@ -35,7 +35,9 @@ find_library(GDRCOPY_LIBRARIES
 if(GDRCOPY_INCLUDE_DIRS)
     include(CheckSymbolExists)
     set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
+    set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
     check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
+    unset(CMAKE_REQUIRED_LIBRARIES)
     unset(CMAKE_REQUIRED_INCLUDES)
     if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
         message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
diff --git a/cmake/FindMLX5.cmake b/cmake/FindMLX5.cmake
index 592984501..9fd591275 100644
--- a/cmake/FindMLX5.cmake
+++ b/cmake/FindMLX5.cmake
@@ -33,5 +33,6 @@ find_library(MLX5_LIBRARIES
   /usr/lib/x86_64-linux-gnu)
 
 include(FindPackageHandleStandardArgs)
+
 find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
 mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
diff --git a/include/mscclpp/atomic_device.hpp b/include/mscclpp/atomic_device.hpp
index 74f6122f8..d00bb50cf 100644
--- a/include/mscclpp/atomic_device.hpp
+++ b/include/mscclpp/atomic_device.hpp
@@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o
   return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
 }
 
-#elif defined(MSCCLPP_DEVICE_HIP)
+#else  // !defined(MSCCLPP_DEVICE_CUDA)
 
 constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
 constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
@@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
 constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
 constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;
 
-// HIP does not have thread scope enums like CUDA
 constexpr auto scopeSystem = 0;
 constexpr auto scopeDevice = 0;
 
@@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde
   return __atomic_fetch_add(ptr, val, memoryOrder);
 }
 
-#endif  // defined(MSCCLPP_DEVICE_HIP)
+#endif  // !defined(MSCCLPP_DEVICE_CUDA)
 
 }  // namespace mscclpp
 
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 7ce9b37dd..172bca390 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -7,6 +7,7 @@
 #include <mscclpp/npkit/npkit.hpp>
 #endif
 
+#include <mscclpp/atomic_device.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/utils.hpp>
 #include <sstream>
@@ -219,29 +220,18 @@ void IBConnection::recvThreadFunc() {
         continue;
       }
 
-      // Read the token value from the incoming write-with-imm completion.
-      if (dataDirectEnabled_) {
-        // Data Direct path: the signal GPU buffer MR was registered with
-        // MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT, and the semaphore token is also written
-        // through Data Direct (via GDRCopy). Both writes go through the same path, so
-        // all data is visible in GPU memory when the CQE is polled. Read from imm_data.
-        newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
-      } else {
-        // Slow path: read the 64-bit token from the local signal GPU buffer via volatile load.
-        // localSignalGpuPtr_ points to either a GDRCopy BAR1 mapping (CUDA) or the
-        // GPU buffer directly (ROCm system-coherent/uncached memory).
-        newValueHost = *static_cast<volatile uint64_t*>(localSignalGpuPtr_);
-      }
-
-      // Read token address from the local stored address (set by setSignalForwardingDst)
-      if (remoteUpdateDstAddr_ != 0) {
-        uint64_t* dstPtr = reinterpret_cast<uint64_t*>(remoteUpdateDstAddr_);
+      // Read the token from imm_data (always available and correct in the CQE).
+      newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
 
-        if (remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid()) {
-          // Direct host-side write to GPU memory via GDRCopy BAR1 mapping
-          remoteUpdateDstAddrMap_->copyTo(&newValueHost, sizeof(uint64_t));
+      // Forward the token to the semaphore's inbound token address via atomicStore
+      // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
+      if (signalAddr_ != 0) {
+        if (signalGdrMap_ && signalGdrMap_->valid()) {
+          atomicStore(signalGdrMap_->hostPtr(), newValueHost, memoryOrderRelaxed);
         } else {
-          *dstPtr = newValueHost;
+          // For HIP/ROCm.
+          // NOTE: may need a fix in the future to ensure BAR1 mapping.
+          *reinterpret_cast<volatile uint64_t*>(signalAddr_) = newValueHost;
         }
       }
 
@@ -259,12 +249,10 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
       remoteTransport_(remoteEndpoint.transport()),
       atomicSrc_(std::make_unique<uint64_t>(0)),
       ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
+      gdrSignalForwarding_(false),
       stopRecvThread_(false),
       localGpuDeviceId_(localEndpoint.device().id),
-      remoteUpdateDstAddr_(0),
-      remoteSignalGpuMrInfo_{0, 0},
-      localSignalGpuPtr_(nullptr),
-      dataDirectEnabled_(false) {
+      signalAddr_(0) {
   qp_ = getImpl(localEndpoint).ibQp_;
   qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
   qp_.lock()->rts();
@@ -274,105 +262,89 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
 
   if (ibNoAtomic_) {
 #if defined(MSCCLPP_USE_CUDA)
+    // On CUDA, HostNoAtomic requires GDRCopy for CPU→GPU signal forwarding through BAR1.
     if (!gdrEnabled()) {
-      std::string reason = "unknown";
-      switch (gdrStatus()) {
-        case GdrStatus::NotBuilt:
-          reason = "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
-          break;
-        case GdrStatus::Disabled:
-          reason = "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
-          break;
-        case GdrStatus::DriverMissing:
-          reason = "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
-          break;
-        case GdrStatus::OpenFailed:
-          reason = "gdr_open() failed; GDRCopy driver may be misconfigured";
-          break;
-        default:
-          break;
-      }
-      THROW(CONN, Error, ErrorCode::InvalidUsage, "IB host-no-atomic mode on CUDA requires GDRCopy: ", reason);
-    }
-#endif
-
-    // Extract remote endpoint's signal GPU buffer MR info for write-with-imm destination
-    const auto& remoteImpl = getImpl(remoteEndpoint);
-    remoteSignalGpuMrInfo_ = remoteImpl.ibSignalGpuMrInfo_;
-
-    // Create a GDR mapping of the local signal GPU buffer. recvThreadFunc reads the
-    // 64-bit token via localSignalGpuPtr_, which points to the BAR1-mapped host address
-    // (CUDA/GDRCopy) or the GPU buffer directly (ROCm system-coherent memory).
-    const auto& localImpl = getImpl(localEndpoint);
-    if (gdrEnabled() && localImpl.ibSignalGpuBuffer_) {
-      localSignalGpuMap_ =
-          std::make_unique<GdrMap>(std::static_pointer_cast<void>(localImpl.ibSignalGpuBuffer_), localGpuDeviceId_);
+      THROW(CONN, Error, ErrorCode::InvalidUsage,
+            "IB host-no-atomic mode on CUDA requires GDRCopy: ", gdrStatusMessage());
     }
-    if (localSignalGpuMap_ && localSignalGpuMap_->valid()) {
-      // Use the BAR1-mapped host pointer; uncacheable MMIO ensures ordered volatile reads.
-      localSignalGpuPtr_ = localSignalGpuMap_->hostPtr();
-    } else if (localImpl.ibSignalGpuBuffer_) {
-      // ROCm: GPU memory is system-coherent, so direct volatile read is safe.
-      localSignalGpuPtr_ = reinterpret_cast<uint64_t*>(localImpl.ibSignalGpuBuffer_.get());
+    gdrSignalForwarding_ = true;
+#endif  // defined(MSCCLPP_USE_CUDA)
+
+    // On platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200
+    // NVLink-C2C), HostNoAtomic requires Data Direct for correct memory ordering. Data Direct
+    // routes NIC DMA through the PCIe Data Direct engine, bypassing the bridge. It is available
+    // on Virtual Function (VF) devices. On platforms without such a bridge (x86, non-Grace
+    // aarch64), HostNoAtomic works without Data Direct.
+    //
+    // We cannot reliably detect the bridge at compile time or runtime, so we emit a warning
+    // when the device is not a VF. If data corruption occurs, switching to VF devices with
+    // Data Direct or using IbMode::Host with RDMA atomics will resolve it.
+    {
+      IbCtx* ibCtx = getImpl(*context).getIbContext(transport_);
+      if (!ibCtx->isVirtualFunction()) {
+        WARN(CONN,
+             "IB HostNoAtomic mode without a Virtual Function (VF) device may cause data corruption "
+             "on platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200). "
+             "Device ",
+             ibCtx->getDevName(),
+             " is not a VF. "
+             "If you experience data corruption, use VF devices with Data Direct or IbMode::Host.");
+      }
     }
 
-    // Data Direct requires all three conditions:
-    // 1. Signal GPU buffer MR registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
-    // 2. Local signal GPU GDRCopy mapping pinned with GDR_PIN_FLAG_FORCE_PCIE
-    // 3. (signal forwarding dst GDRCopy mapping checked at setSignalForwardingDst time)
-    // When all conditions are met, RDMA data writes and GDRCopy token writes both go
-    // through the Data Direct engine, guaranteeing GPU memory visibility at CQE poll time.
+    // Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
+    // The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
     auto qp = qp_.lock();
-    dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
-                          localSignalGpuMap_ && localSignalGpuMap_->valid();
-    if (dataDirectEnabled_) {
-      INFO(CONN, "IBConnection: Data Direct enabled");
-    }
-
-    // Pre-post receive requests for incoming write-with-imm
     int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
     for (int i = 0; i < maxRecvWr; ++i) {
       qp->stageRecv(/*wrId=*/0);
     }
     qp->postRecv();
-    // Start the background thread to poll recv CQ
-    recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
-    INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with no-atomic mode");
+    // The recv thread is started later in startSignalForwarding() when the semaphore
+    // provides the signal forwarding destination. This ensures the thread lifetime is
+    // bounded by the GdrMap lifetime (created before start, destroyed after stop).
+    INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with signal forwarding (HostNoAtomic) mode");
   } else {
     INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode");
   }
 }
 
-IBConnection::~IBConnection() {
-  if (ibNoAtomic_) {
-    stopRecvThread_.store(true, std::memory_order_relaxed);
-    if (recvThread_.joinable()) {
-      recvThread_.join();
-    }
-  }
-}
+IBConnection::~IBConnection() { stopSignalForwarding(); }
 
 Transport IBConnection::transport() const { return transport_; }
 
 Transport IBConnection::remoteTransport() const { return remoteTransport_; }
 
-bool IBConnection::usesSignalForwarding() const { return ibNoAtomic_; }
-
-void IBConnection::setSignalForwardingDst(std::shared_ptr<uint64_t> mem) {
-  remoteUpdateDstAddr_ = reinterpret_cast<uint64_t>(mem.get());
-  if (gdrEnabled()) {
-    if (mem) {
-      remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
-      // Data Direct requires the token write mapping to also use FORCE_PCIE
-      if (dataDirectEnabled_ && !(remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid())) {
-        dataDirectEnabled_ = false;
-        INFO(CONN, "IBConnection: Data Direct disabled (signal forwarding dst GDRCopy mapping not available)");
-      }
-    } else {
-      remoteUpdateDstAddrMap_.reset();
+bool IBConnection::isSignalForwarding() const { return ibNoAtomic_; }
+
+void IBConnection::startSignalForwarding(std::shared_ptr<uint64_t> mem) {
+  // Set up the forwarding destination and GdrMap, then start the recv thread.
+  // Order: set address → create GdrMap → start thread.
+  signalAddr_ = reinterpret_cast<uint64_t>(mem.get());
+  if (gdrSignalForwarding_) {
+    signalGdrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
+  }
+  if (ibNoAtomic_) {
+    stopRecvThread_.store(false, std::memory_order_relaxed);
+    recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
+  }
+  INFO(CONN, "IBConnection startSignalForwarding: ", (void*)signalAddr_);
+}
+
+void IBConnection::stopSignalForwarding() {
+  // Stop the recv thread, then tear down GdrMap and address.
+  // Order: stop thread → destroy GdrMap → clear address.
+  if (ibNoAtomic_) {
+    stopRecvThread_.store(true, std::memory_order_relaxed);
+    if (recvThread_.joinable()) {
+      recvThread_.join();
     }
   }
-  INFO(CONN, "IBConnection setSignalForwardingDst: ", (void*)remoteUpdateDstAddr_);
+  if (gdrSignalForwarding_) {
+    signalGdrMap_.reset();
+  }
+  signalAddr_ = 0;
+  INFO(CONN, "IBConnection stopSignalForwarding");
 }
 
 void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
@@ -425,27 +397,23 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
   *src = newValue;
 
   if (ibNoAtomic_) {
-    // Use RDMA write-with-imm instead of atomic operation.
-    // Write the token value (8 bytes) from the local host buffer to the remote signal GPU buffer,
-    // with newValue also in imm_data (32-bit). The remote's recvThreadFunc reads the token from
-    // the signal GPU buffer and forwards it to the semaphore's inbound token address.
-
-    // Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit)
+    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the token in imm_data.
+    // The receiver's recv thread polls the CQE, which guarantees the preceding data WRITE
+    // has been committed to GPU memory. The recv thread then forwards the token to the
+    // semaphore's inbound token via GDRCopy atomicStore.
     unsigned int immData = static_cast<unsigned int>(newValue);
-
-    // Write the real token value into the host buffer, then RDMA write host->remote GPU
     *atomicSrc_ = newValue;
-    qp_.lock()->stageSendWriteWithImm(atomicSrcTransportInfo_.ibMr, remoteSignalGpuMrInfo_,
-                                      /*size=*/sizeof(uint64_t), /*wrId=*/0,
+    qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
+                                      /*size=*/0, /*wrId=*/0,
                                       /*srcOffset=*/0, /*dstOffset=*/0,
                                       /*signaled=*/true, /*immData=*/immData);
     qp_.lock()->postSend();
-    INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue);
+    INFO(CONN, "IBConnection signal forwarding: value ", oldValue, " -> ", newValue);
   } else {
     qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
                                    /*signaled=*/true);
     qp_.lock()->postSend();
-    INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
+    INFO(CONN, "IBConnection atomic write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
          " -> ", newValue);
   }
 
diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc
index 6569a31e0..5ab4bad0a 100644
--- a/src/core/endpoint.cc
+++ b/src/core/endpoint.cc
@@ -53,21 +53,6 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
                 ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
                            config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
     ibQpInfo_ = ibQp_->getInfo();
-
-    // Allocate a 64-bit signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
-    if (ibNoAtomic_ && config_.device.type == DeviceType::GPU && config_.device.id >= 0) {
-      CudaDeviceGuard deviceGuard(config_.device.id);
-#if defined(MSCCLPP_DEVICE_HIP)
-      ibSignalGpuBuffer_ = detail::gpuCallocUncachedShared<uint64_t>();
-#else
-      ibSignalGpuBuffer_ = detail::gpuCallocShared<uint64_t>();
-#endif
-      ibSignalGpuMr_ =
-          contextImpl.getIbContext(config_.transport)->registerMr(ibSignalGpuBuffer_.get(), sizeof(uint64_t));
-      ibSignalGpuMrInfo_ = ibSignalGpuMr_->getInfo();
-    } else {
-      ibSignalGpuMrInfo_ = {0, 0};
-    }
   } else if (config_.transport == Transport::Ethernet) {
     // Configuring Ethernet Interfaces
     abortFlag_ = 0;
@@ -90,9 +75,6 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
     ibLocal_ = false;
     it = detail::deserialize(it, ibQpInfo_);
     it = detail::deserialize(it, ibNoAtomic_);
-    if (ibNoAtomic_) {
-      it = detail::deserialize(it, ibSignalGpuMrInfo_);
-    }
   } else if (config_.transport == Transport::Ethernet) {
     it = detail::deserialize(it, socketAddress_);
   }
@@ -123,9 +105,6 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
   if (AllIBTransports.has(pimpl_->config_.transport)) {
     detail::serialize(data, pimpl_->ibQpInfo_);
     detail::serialize(data, pimpl_->ibNoAtomic_);
-    if (pimpl_->ibNoAtomic_) {
-      detail::serialize(data, pimpl_->ibSignalGpuMrInfo_);
-    }
   } else if (pimpl_->config_.transport == Transport::Ethernet) {
     detail::serialize(data, pimpl_->socketAddress_);
   }
diff --git a/src/core/gdr.cc b/src/core/gdr.cc
index 341002ed6..22ac15c92 100644
--- a/src/core/gdr.cc
+++ b/src/core/gdr.cc
@@ -5,6 +5,7 @@
 
 #if defined(MSCCLPP_USE_GDRCOPY)
 
+#include <gdrapi.h>
 #include <unistd.h>
 
 #include <mscclpp/env.hpp>
@@ -12,9 +13,11 @@
 
 #include "logger.hpp"
 
+#ifndef GPU_PAGE_SHIFT
 #define GPU_PAGE_SHIFT 16
 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
 #define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
+#endif
 
 namespace mscclpp {
 
@@ -45,6 +48,23 @@ GdrStatus gdrStatus() { return gdrContext()->status(); }
 
 bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
 
+const char* gdrStatusMessage() {
+  switch (gdrStatus()) {
+    case GdrStatus::Ok:
+      return "GDRCopy initialized successfully";
+    case GdrStatus::NotBuilt:
+      return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
+    case GdrStatus::Disabled:
+      return "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
+    case GdrStatus::DriverMissing:
+      return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
+    case GdrStatus::OpenFailed:
+      return "gdr_open() failed; GDRCopy driver may be misconfigured";
+    default:
+      return "unknown GDRCopy status";
+  }
+}
+
 GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
   if (env()->forceDisableGdr) {
     INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
@@ -77,53 +97,79 @@ GdrContext::~GdrContext() {
   }
 }
 
-// GdrMap
+// GdrMap::Impl — real implementation with GDRCopy
+
+struct GdrMap::Impl {
+  std::shared_ptr<GdrContext> ctx;
+  std::shared_ptr<void> gpuMem;
+  gdr_mh_t mh;
+  void* barPtr;
+  uint64_t* hostDstPtr;
+  size_t mappedSize;
+};
+
+GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : pimpl_(std::make_unique<Impl>()) {
+  pimpl_->ctx = gdrContext();
+  pimpl_->gpuMem = std::move(gpuMem);
+  pimpl_->mh = {};
+  pimpl_->barPtr = nullptr;
+  pimpl_->hostDstPtr = nullptr;
+  pimpl_->mappedSize = 0;
 
-GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId)
-    : ctx_(gdrContext()),
-      gpuMem_(std::move(gpuMem)),
-      mh_{},
-      barPtr_(nullptr),
-      hostDstPtr_(nullptr),
-      mappedSize_(0) {
   // Ensure CUDA device context is active for gdr_pin_buffer
   CudaDeviceGuard deviceGuard(deviceId);
 
-  uint64_t gpuAddr = reinterpret_cast<uint64_t>(gpuMem_.get());
+  uint64_t gpuAddr = reinterpret_cast<uint64_t>(pimpl_->gpuMem.get());
   // Align to GPU page boundary and pin one page around the target address
   unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
   unsigned long pageOffset = gpuAddr - alignedAddr;
-  mappedSize_ = GPU_PAGE_SIZE;
-
-  int ret = gdr_pin_buffer_v2(ctx_->handle(), alignedAddr, mappedSize_, GDR_PIN_FLAG_FORCE_PCIE, &mh_);
+  pimpl_->mappedSize = GPU_PAGE_SIZE;
+
+  // Pin the GPU memory for GDRCopy BAR1 mapping. Try GDR_PIN_FLAG_FORCE_PCIE first for optimal
+  // ordering on platforms that support it (e.g., GB200). Fall back to flags=0 if FORCE_PCIE is
+  // not supported. Both paths work correctly: CPU writes via atomicStore, GPU reads via
+  // system-scope acquire.
+  int ret =
+      gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, GDR_PIN_FLAG_FORCE_PCIE, &pimpl_->mh);
   if (ret != 0) {
-    THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
-          ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
+    ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh);
+    if (ret != 0) {
+      THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
+            ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
+    }
   }
 
-  ret = gdr_map(ctx_->handle(), mh_, &barPtr_, mappedSize_);
+  ret = gdr_map(pimpl_->ctx->handle(), pimpl_->mh, &pimpl_->barPtr, pimpl_->mappedSize);
   if (ret != 0) {
-    (void)gdr_unpin_buffer(ctx_->handle(), mh_);
+    (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
     THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr);
   }
 
-  hostDstPtr_ = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(barPtr_) + pageOffset);
+  pimpl_->hostDstPtr = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(pimpl_->barPtr) + pageOffset);
 
-  INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)hostDstPtr_);
+  INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)pimpl_->hostDstPtr);
 }
 
 GdrMap::~GdrMap() {
-  if (barPtr_ != nullptr) {
-    (void)gdr_unmap(ctx_->handle(), mh_, barPtr_, mappedSize_);
-  }
-  if (hostDstPtr_ != nullptr) {
-    (void)gdr_unpin_buffer(ctx_->handle(), mh_);
+  if (pimpl_) {
+    if (pimpl_->barPtr != nullptr) {
+      (void)gdr_unmap(pimpl_->ctx->handle(), pimpl_->mh, pimpl_->barPtr, pimpl_->mappedSize);
+    }
+    if (pimpl_->hostDstPtr != nullptr) {
+      (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
+    }
   }
 }
 
-void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(mh_, hostDstPtr_, src, size); }
+bool GdrMap::valid() const { return pimpl_ && pimpl_->hostDstPtr != nullptr; }
+
+uint64_t* GdrMap::hostPtr() const { return pimpl_ ? pimpl_->hostDstPtr : nullptr; }
+
+void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(pimpl_->mh, pimpl_->hostDstPtr, src, size); }
 
-void GdrMap::copyFrom(void* dst, size_t size) const { gdr_copy_from_mapping(mh_, dst, hostDstPtr_, size); }
+void GdrMap::copyFrom(void* dst, size_t size) const {
+  gdr_copy_from_mapping(pimpl_->mh, dst, pimpl_->hostDstPtr, size);
+}
 
 }  // namespace mscclpp
 
@@ -135,6 +181,24 @@ GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }
 
 bool gdrEnabled() { return false; }
 
+const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
+
+// GdrMap::Impl — stub (no GDRCopy)
+
+struct GdrMap::Impl {};
+
+GdrMap::GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
+
+GdrMap::~GdrMap() = default;
+
+bool GdrMap::valid() const { return false; }
+
+uint64_t* GdrMap::hostPtr() const { return nullptr; }
+
+void GdrMap::copyTo(const void* /*src*/, size_t /*size*/) {}
+
+void GdrMap::copyFrom(void* /*dst*/, size_t /*size*/) const {}
+
 }  // namespace mscclpp
 
 #endif  // !defined(MSCCLPP_USE_GDRCOPY)
diff --git a/src/core/ib.cc b/src/core/ib.cc
index c82b147a8..f783daa9f 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -67,8 +67,7 @@ static inline bool isDmabufSupportedByGpu(int gpuId) {
   return ret;
 }
 
-IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
-    : mr_(nullptr), buff_(buff), size_(0), isDmabuf_(false), isDataDirect_(false) {
+IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nullptr), buff_(buff), size_(0) {
   if (size == 0) {
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0");
   }
@@ -91,11 +90,8 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
     int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
                       IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
 #if defined(MSCCLPP_USE_MLX5DV)
-    if (isMlx5 && MLX5DV::isAvailable()) {
+    if (isDataDirect && MLX5DV::isAvailable()) {
       mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
-      if (mr_ != nullptr) {
-        isDataDirect_ = true;
-      }
     }
 #endif
     if (mr_ == nullptr) {
@@ -105,7 +101,6 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
     if (mr_ == nullptr) {
       THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
     }
-    isDmabuf_ = true;
 #else   // defined(MSCCLPP_USE_ROCM)
     THROW(NET, Error, ErrorCode::InvalidUsage, "We don't support DMABUF on HIP platforms yet");
 #endif  // defined(MSCCLPP_USE_ROCM)
@@ -145,12 +140,8 @@ const void* IbMr::getBuff() const { return buff_; }
 
 uint32_t IbMr::getLkey() const { return mr_->lkey; }
 
-bool IbMr::isDmabuf() const { return isDmabuf_; }
-
-bool IbMr::isDataDirect() const { return isDataDirect_; }
-
 IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
-           int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic, bool isMlx5)
+           int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic)
     : portNum_(portNum),
       gidIndex_(gidIndex),
       info_(),
@@ -171,8 +162,7 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
       maxSendWr_(maxSendWr),
       maxWrPerSend_(maxWrPerSend),
       maxRecvWr_(maxRecvWr),
-      noAtomic_(noAtomic),
-      isMlx5_(isMlx5) {
+      noAtomic_(noAtomic) {
   sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0);
   if (sendCq_ == nullptr) {
     THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")");
@@ -186,47 +176,21 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
     }
   }
 
-  struct ibv_qp* qp = nullptr;
-#if defined(MSCCLPP_USE_MLX5DV)
-  if (isMlx5_) {
-    struct ibv_qp_init_attr_ex qpInitAttrEx = {};
-    qpInitAttrEx.sq_sig_all = 0;
-    qpInitAttrEx.send_cq = sendCq_;
-    qpInitAttrEx.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
-    qpInitAttrEx.qp_type = IBV_QPT_RC;
-    qpInitAttrEx.cap.max_send_wr = maxSendWr;
-    qpInitAttrEx.cap.max_recv_wr = maxRecvWr;
-    qpInitAttrEx.cap.max_send_sge = 1;
-    qpInitAttrEx.cap.max_recv_sge = 1;
-    qpInitAttrEx.cap.max_inline_data = 0;
-    qpInitAttrEx.pd = pd;
-    qpInitAttrEx.comp_mask = IBV_QP_INIT_ATTR_PD;
-
-    struct mlx5dv_qp_init_attr mlx5QpAttr = {};
-
-    qp = MLX5DV::mlx5dv_create_qp(ctx, &qpInitAttrEx, &mlx5QpAttr);
-    if (qp == nullptr) {
-      THROW(NET, IbError, errno, "mlx5dv_create_qp failed (errno ", errno, ")");
-    }
-  } else
-#endif  // defined(MSCCLPP_USE_MLX5DV)
-  {
-    struct ibv_qp_init_attr qpInitAttr = {};
-    qpInitAttr.sq_sig_all = 0;
-    qpInitAttr.send_cq = sendCq_;
-    // Use separate recv CQ if created, otherwise use the send CQ
-    qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
-    qpInitAttr.qp_type = IBV_QPT_RC;
-    qpInitAttr.cap.max_send_wr = maxSendWr;
-    qpInitAttr.cap.max_recv_wr = maxRecvWr;
-    qpInitAttr.cap.max_send_sge = 1;
-    qpInitAttr.cap.max_recv_sge = 1;
-    qpInitAttr.cap.max_inline_data = 0;
-
-    qp = IBVerbs::ibv_create_qp(pd, &qpInitAttr);
-    if (qp == nullptr) {
-      THROW(NET, IbError, errno, "ibv_create_qp failed (errno ", errno, ")");
-    }
+  struct ibv_qp_init_attr qpInitAttr = {};
+  qpInitAttr.sq_sig_all = 0;
+  qpInitAttr.send_cq = sendCq_;
+  // Use separate recv CQ if created, otherwise use the send CQ
+  qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
+  qpInitAttr.qp_type = IBV_QPT_RC;
+  qpInitAttr.cap.max_send_wr = maxSendWr;
+  qpInitAttr.cap.max_recv_wr = maxRecvWr;
+  qpInitAttr.cap.max_send_sge = 1;
+  qpInitAttr.cap.max_recv_sge = 1;
+  qpInitAttr.cap.max_inline_data = 0;
+
+  struct ibv_qp* qp = IBVerbs::ibv_create_qp(pd, &qpInitAttr);
+  if (qp == nullptr) {
+    THROW(NET, IbError, errno, "ibv_create_qp failed (errno ", errno, ")");
   }
 
   struct ibv_port_attr portAttr;
@@ -483,12 +447,29 @@ std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_
 unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); }
 
 IbCtx::IbCtx(const std::string& devName)
-    : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false), isMlx5_(false) {
+    : devName_(devName),
+      ctx_(nullptr),
+      pd_(nullptr),
+      supportsRdmaAtomics_(false),
+      isMlx5_(false),
+      dataDirect_(false),
+      isVF_(false) {
   int num;
   struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
   for (int i = 0; i < num; ++i) {
     if (std::string(devices[i]->name) == devName_) {
       ctx_ = IBVerbs::ibv_open_device(devices[i]);
+
+      // Detect if this IB device is a Virtual Function (VF).
+      // VFs have a 'physfn' sysfs symlink pointing to their parent PF; PFs do not.
+      {
+        std::string physfnPath = "/sys/class/infiniband/" + devName_ + "/device/physfn";
+        isVF_ = (access(physfnPath.c_str(), F_OK) == 0);
+        if (isVF_) {
+          INFO(NET, "IB device ", devName_, " is a Virtual Function (Data Direct ordering available)");
+        }
+      }
+
 #if defined(MSCCLPP_USE_MLX5DV)
       if (MLX5DV::isAvailable()) {
         isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]);
@@ -509,6 +490,20 @@ IbCtx::IbCtx(const std::string& devName)
     THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")");
   }
 
+  // Detect Data Direct support via mlx5dv_get_data_direct_sysfs_path
+#if defined(MSCCLPP_USE_MLX5DV)
+  if (isMlx5_ && MLX5DV::isAvailable()) {
+    char sysfsPath[256];
+    int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath));
+    if (ret == 0) {
+      dataDirect_ = true;
+      INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")");
+    } else {
+      INFO(NET, "IB device ", devName_, " does not support Data Direct");
+    }
+  }
+#endif  // defined(MSCCLPP_USE_MLX5DV)
+
   // Query and cache RDMA atomics capability
   struct ibv_device_attr attr = {};
   if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) {
@@ -579,17 +574,21 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port);
   }
   return std::shared_ptr<IbQp>(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr,
-                                        maxRecvWr, maxWrPerSend, noAtomic, isMlx5_));
+                                        maxRecvWr, maxWrPerSend, noAtomic));
 }
 
 std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
-  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, isMlx5_));
+  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, dataDirect_));
 }
 
 bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; }
 
 bool IbCtx::isMlx5() const { return isMlx5_; }
 
+bool IbCtx::supportsDataDirect() const { return dataDirect_; }
+
+bool IbCtx::isVirtualFunction() const { return isVF_; }
+
 MSCCLPP_API_CPP int getIBDeviceCount() {
   int num;
   IBVerbs::ibv_get_device_list(&num);
@@ -699,8 +698,6 @@ IbMr::~IbMr() {}
 IbMrInfo IbMr::getInfo() const { return IbMrInfo(); }
 const void* IbMr::getBuff() const { return nullptr; }
 uint32_t IbMr::getLkey() const { return 0; }
-bool IbMr::isDmabuf() const { return false; }
-bool IbMr::isDataDirect() const { return false; }
 
 IbQp::~IbQp() {}
 void IbQp::rtr(const IbQpInfo& /*info*/) {}
diff --git a/src/core/ibverbs_wrapper.cc b/src/core/ibverbs_wrapper.cc
index 51f3f29c6..4fdf1b1e1 100644
--- a/src/core/ibverbs_wrapper.cc
+++ b/src/core/ibverbs_wrapper.cc
@@ -10,19 +10,37 @@
 
 #include "logger.hpp"
 
+// NOTE: MRC_SUPPORT is a temporal macro that makes the current MRC implementation work.
+// MRC_SUPPORT is needed because the current libibverbs implmentation of MRC does not provide
+// all symbols that we need, so we need to load some symbols from the original libibverbs.
+// This macro will be removed (set 0) once MRC provides all necessary symbols.
+// Non-MRC environments will not be affected by this macro as long as VMRC_LIBIBVERBS_SO
+// environment variable is not set.
+#define MRC_SUPPORT 1
+#if (MRC_SUPPORT)
+#include <cstdlib>
+#include <set>
+#endif  // (MRC_SUPPORT)
+
 namespace mscclpp {
 
 static std::unique_ptr<void, int (*)(void*)> globalIBVerbsHandle(nullptr, &::dlclose);
+#if (MRC_SUPPORT)
+static std::unique_ptr<void, int (*)(void*)> globalOrigIBVerbsHandle(nullptr, &::dlclose);
+#endif  // (MRC_SUPPORT)
 
 void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
+#if (MRC_SUPPORT)
+  static std::set<std::string> mrcSymbols = {
+      "ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp",
+      "ibv_create_cq",       "ibv_destroy_cq",      "ibv_create_qp",   "ibv_modify_qp",    "ibv_destroy_qp",
+  };
+#endif  // (MRC_SUPPORT)
   if (!globalIBVerbsHandle) {
     if (mscclpp::env()->ibvSo != "") {
       void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
       if (handle) {
         globalIBVerbsHandle.reset(handle);
-      } else {
-        THROW(NET, SysError, errno, "Failed to load libibverbs library specified by MSCCLPP_IBV_SO ('",
-              mscclpp::env()->ibvSo, "'): ", std::string(::dlerror()));
       }
     } else {
       const char* possibleLibNames[] = {"libibverbs.so", "libibverbs.so.1", nullptr};
@@ -38,7 +56,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
       THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror()));
     }
   }
+#if (MRC_SUPPORT)
+  // In MRC mode, `VMRC_LIBIBVERBS_SO` should be set.
+  char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO");
+  void* ptr;
+  if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) {
+    // If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs.
+    if (!globalOrigIBVerbsHandle) {
+      void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW);
+      if (!handle) {
+        THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo));
+      }
+      globalOrigIBVerbsHandle.reset(handle);
+    }
+    ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str());
+  } else {
+    ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
+  }
+#else   // !(MRC_SUPPORT)
   void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
+#endif  // !(MRC_SUPPORT)
   if (!ptr && !allowReturnNull) {
     THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol);
   }
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index f2ed2c8b8..47b03d6c4 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -37,16 +37,18 @@ class BaseConnection {
 
   virtual void flush(int64_t timeoutUsec = -1) = 0;
 
-  /// Set the local address where forwarded signals should be written.
-  /// This is called by the receiver to specify where incoming signals should be forwarded.
-  /// Default implementation is a no-op for connections that don't need it.
-  /// @param mem Shared pointer to the memory for incoming writes (nullptr to clear).
-  virtual void setSignalForwardingDst(std::shared_ptr<uint64_t> /*mem*/) {}
+  /// Start signal forwarding to the given memory address.
+  /// Called by the semaphore to specify where incoming signals should be written.
+  /// @param mem Shared pointer to the GPU memory for the signal token.
+  virtual void startSignalForwarding(std::shared_ptr<uint64_t> /*mem*/) {}
+
+  /// Stop signal forwarding and release associated resources.
+  virtual void stopSignalForwarding() {}
 
   /// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode).
   /// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to.
   /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
-  virtual bool usesSignalForwarding() const { return false; }
+  virtual bool isSignalForwarding() const { return false; }
 
   virtual Transport transport() const = 0;
 
@@ -105,31 +107,20 @@ class IBConnection : public BaseConnection {
   // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal
   // instead of atomic operations, with a host thread forwarding to GPU for memory consistency.
   bool ibNoAtomic_;
+  bool gdrSignalForwarding_;  // ibNoAtomic_ && gdrEnabled() — decided once at construction
   std::thread recvThread_;
   std::atomic<bool> stopRecvThread_;
   int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping
 
-  // Write-with-imm design:
-  // - Sender: 8-byte RDMA write-with-imm from local host buffer to remote signal GPU buffer,
-  //   carrying the token value both as RDMA payload and in imm_data (32-bit).
-  // - Receiver: reads the full 64-bit token from the local signal GPU buffer (via BAR1 or
-  //   volatile read), then writes it to remoteUpdateDstAddr_ (the semaphore's inbound token).
-  uint64_t remoteUpdateDstAddr_;
-
-  // Remote endpoint's signal GPU buffer MR info (destination for RDMA write-with-imm).
-  // The local host buffer (atomicSrc_ / atomicSrcTransportInfo_.ibMr) serves as the source.
-  IbMrInfo remoteSignalGpuMrInfo_;
+  // Signal forwarding design (HostNoAtomic mode):
+  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the token value in imm_data (32-bit).
+  // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
+  //   the token from imm_data, then writes it to signalAddr_ (the semaphore's
+  //   inbound token) via atomicStore through the GDRCopy BAR1 mapping. The GPU reads
+  //   inboundToken with system-scope acquire ordering.
+  uint64_t signalAddr_;
 
-  std::unique_ptr<GdrMap> remoteUpdateDstAddrMap_;
-  std::unique_ptr<GdrMap> localSignalGpuMap_;
-  uint64_t* localSignalGpuPtr_;
-
-  // When true, recvThreadFunc reads the token from imm_data (from CQE) instead of the
-  // signal GPU buffer via GDRCopy. Enabled only when all Data Direct conditions are met:
-  // the signal GPU buffer MR is registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT,
-  // and all GDRCopy mappings (local signal buffer and remoteUpdateDstAddr) are valid,
-  // so both RDMA data writes and GDRCopy token writes go through the Data Direct engine.
-  bool dataDirectEnabled_;
+  std::unique_ptr<GdrMap> signalGdrMap_;
 
   void recvThreadFunc();
 
@@ -137,12 +128,15 @@ class IBConnection : public BaseConnection {
   IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint);
   ~IBConnection();
 
-  /// Set the local address where forwarded signals should be written.
-  /// Must be called before the remote sends any updateAndSync in host-no-atomic mode.
-  /// @param mem Shared pointer to the memory for incoming writes (nullptr to clear).
-  void setSignalForwardingDst(std::shared_ptr<uint64_t> mem) override;
+  /// Start signal forwarding to the given memory address.
+  /// Must be called before the remote sends any updateAndSync in HostNoAtomic mode.
+  /// @param mem Shared pointer to the GPU memory for the signal token.
+  void startSignalForwarding(std::shared_ptr<uint64_t> mem) override;
+
+  /// Stop signal forwarding and release associated resources.
+  void stopSignalForwarding() override;
 
-  bool usesSignalForwarding() const override;
+  bool isSignalForwarding() const override;
 
   Transport transport() const override;
 
diff --git a/src/core/include/endpoint.hpp b/src/core/include/endpoint.hpp
index 1548d527c..363faab19 100644
--- a/src/core/include/endpoint.hpp
+++ b/src/core/include/endpoint.hpp
@@ -6,7 +6,6 @@
 
 #include <memory>
 #include <mscclpp/core.hpp>
-#include <mscclpp/gpu_utils.hpp>
 #include <vector>
 
 #include "ib.hpp"
@@ -30,13 +29,6 @@ struct Endpoint::Impl {
   std::shared_ptr<IbQp> ibQp_;
   IbQpInfo ibQpInfo_;
 
-  // Signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
-  // Each endpoint allocates a 64-bit GPU buffer and registers it as an IB MR.
-  // The MR info is serialized/exchanged so the remote can RDMA-write to it.
-  std::shared_ptr<uint64_t> ibSignalGpuBuffer_;
-  std::unique_ptr<const IbMr> ibSignalGpuMr_;
-  IbMrInfo ibSignalGpuMrInfo_;
-
   // The following are only used for Ethernet and are undefined for other transports.
   std::unique_ptr<Socket> socket_;
   SocketAddress socketAddress_;
diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp
index bde2986ab..e0c7f006f 100644
--- a/src/core/include/gdr.hpp
+++ b/src/core/include/gdr.hpp
@@ -4,6 +4,10 @@
 #ifndef MSCCLPP_GDR_HPP_
 #define MSCCLPP_GDR_HPP_
 
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
 namespace mscclpp {
 
 enum class GdrStatus {
@@ -20,25 +24,14 @@ GdrStatus gdrStatus();
 /// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok).
 bool gdrEnabled();
 
-}  // namespace mscclpp
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-#if defined(MSCCLPP_USE_GDRCOPY)
-
-#include <gdrapi.h>
-
-namespace mscclpp {
-
-class GdrContext;
+/// Return a human-readable error message for the current GDRCopy status.
+const char* gdrStatusMessage();
 
-/// RAII wrapper for a per-connection GDRCopy BAR1 mapping of a GPU address.
+/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address.
+/// When GDRCopy is not available, all operations are no-ops and valid() returns false.
 class GdrMap {
  public:
   /// Pin and map a GPU address for direct host-side access.
-  /// Holds a shared reference to the GPU memory to keep it alive.
   /// @param gpuMem   Shared pointer to the GPU memory (e.g. from gpuCallocShared).
   /// @param deviceId The CUDA device ID for setting context.
   GdrMap(std::shared_ptr<void> gpuMem, int deviceId);
@@ -48,10 +41,10 @@ class GdrMap {
   GdrMap& operator=(const GdrMap&) = delete;
 
   /// Whether the mapping was established successfully.
-  bool valid() const { return hostDstPtr_ != nullptr; }
+  bool valid() const;
 
   /// Return the BAR1-mapped host pointer to the GPU location.
-  uint64_t* hostPtr() const { return hostDstPtr_; }
+  uint64_t* hostPtr() const;
 
   /// Copy data from host memory to the mapped GPU location.
   void copyTo(const void* src, size_t size);
@@ -60,36 +53,10 @@ class GdrMap {
   void copyFrom(void* dst, size_t size) const;
 
  private:
-  std::shared_ptr<GdrContext> ctx_;
-  std::shared_ptr<void> gpuMem_;
-  gdr_mh_t mh_;
-  void* barPtr_;
-  uint64_t* hostDstPtr_;
-  size_t mappedSize_;
-};
-
-}  // namespace mscclpp
-
-#else  // !defined(MSCCLPP_USE_GDRCOPY)
-
-namespace mscclpp {
-
-/// Stub GdrMap when GDRCopy is not available.
-class GdrMap {
- public:
-  GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
-  ~GdrMap() = default;
-
-  GdrMap(const GdrMap&) = delete;
-  GdrMap& operator=(const GdrMap&) = delete;
-
-  bool valid() const { return false; }
-  void copyTo(const void* /*src*/, size_t /*size*/) {}
-  void copyFrom(void* /*dst*/, size_t /*size*/) const {}
-  uint64_t* hostPtr() const { return nullptr; }
+  struct Impl;
+  std::unique_ptr<Impl> pimpl_;
 };
 
 }  // namespace mscclpp
 
-#endif  // !defined(MSCCLPP_USE_GDRCOPY)
 #endif  // MSCCLPP_GDR_HPP_
diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp
index 9e5a454cb..923a7ca08 100644
--- a/src/core/include/ib.hpp
+++ b/src/core/include/ib.hpp
@@ -34,17 +34,13 @@ class IbMr {
   IbMrInfo getInfo() const;
   const void* getBuff() const;
   uint32_t getLkey() const;
-  bool isDmabuf() const;
-  bool isDataDirect() const;
 
  private:
-  IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5);
+  IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect);
 
   ibv_mr* mr_;
   void* buff_;
   std::size_t size_;
-  bool isDmabuf_;
-  bool isDataDirect_;
 
   friend class IbCtx;
 };
@@ -92,7 +88,6 @@ class IbQp {
   int getRecvWcStatus(int idx) const;
   std::string getRecvWcStatusString(int idx) const;
   unsigned int getRecvWcImmData(int idx) const;
-  bool isMlx5() const { return isMlx5_; }
 
  private:
   struct SendWrInfo {
@@ -106,7 +101,7 @@ class IbQp {
   };
 
   IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-       int maxRecvWr, int maxWrPerSend, bool noAtomic, bool isMlx5);
+       int maxRecvWr, int maxWrPerSend, bool noAtomic);
   SendWrInfo getNewSendWrInfo();
   RecvWrInfo getNewRecvWrInfo();
 
@@ -134,7 +129,6 @@ class IbQp {
   const int maxWrPerSend_;
   const int maxRecvWr_;
   const bool noAtomic_;
-  const bool isMlx5_;
 
   friend class IbCtx;
 };
@@ -150,6 +144,8 @@ class IbCtx {
   std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
   bool supportsRdmaAtomics() const;
   bool isMlx5() const;
+  bool supportsDataDirect() const;
+  bool isVirtualFunction() const;
 #else
   IbCtx([[maybe_unused]] const std::string& devName) {}
   ~IbCtx() {}
@@ -160,6 +156,8 @@ class IbCtx {
   }
   bool supportsRdmaAtomics() const { return false; }
   bool isMlx5() const { return false; }
+  bool supportsDataDirect() const { return false; }
+  bool isVirtualFunction() const { return false; }
 #endif
 
   const std::string& getDevName() const { return devName_; };
@@ -173,6 +171,8 @@ class IbCtx {
   ibv_pd* pd_;
   bool supportsRdmaAtomics_;
   bool isMlx5_;
+  bool dataDirect_;
+  bool isVF_;
 };
 
 }  // namespace mscclpp
diff --git a/src/core/include/mlx5dv_wrapper.hpp b/src/core/include/mlx5dv_wrapper.hpp
index 654b086c9..79403a368 100644
--- a/src/core/include/mlx5dv_wrapper.hpp
+++ b/src/core/include/mlx5dv_wrapper.hpp
@@ -6,7 +6,7 @@
 
 #if defined(MSCCLPP_USE_MLX5DV)
 
-#include <infiniband/mlx5dv.h>
+#include <infiniband/verbs.h>
 
 #include <string>
 
@@ -19,14 +19,14 @@ struct MLX5DV {
   /// Check if the given IB device supports mlx5 Direct Verbs.
   static bool mlx5dv_is_supported(struct ibv_device* device);
 
-  /// Create a QP using mlx5dv extensions.
-  static struct ibv_qp* mlx5dv_create_qp(struct ibv_context* ctx, struct ibv_qp_init_attr_ex* qpAttr,
-                                          struct mlx5dv_qp_init_attr* mlx5QpAttr);
-
   /// Register a DMABUF memory region using mlx5dv extensions.
   /// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version.
   static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
-                                              int access);
+                                             int access);
+
+  /// Query the Data Direct sysfs path for the given IB context.
+  /// Returns 0 on success (device supports Data Direct), non-zero otherwise.
+  static int mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len);
 
  private:
   static void* dlsym(const std::string& symbol, bool allowReturnNull = false);
diff --git a/src/core/mlx5dv_wrapper.cc b/src/core/mlx5dv_wrapper.cc
index b1c398ee7..5d13d9c81 100644
--- a/src/core/mlx5dv_wrapper.cc
+++ b/src/core/mlx5dv_wrapper.cc
@@ -3,9 +3,19 @@
 
 #if defined(MSCCLPP_USE_MLX5DV)
 
+// _GNU_SOURCE is required for dlvsym()
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "mlx5dv_wrapper.hpp"
 
 #include <dlfcn.h>
+#include <infiniband/mlx5dv.h>
+
+#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
+#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
+#endif
 
 #include <memory>
 
@@ -72,14 +82,6 @@ bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) {
   return impl(device);
 }
 
-struct ibv_qp* MLX5DV::mlx5dv_create_qp(struct ibv_context* ctx, struct ibv_qp_init_attr_ex* qpAttr,
-                                        struct mlx5dv_qp_init_attr* mlx5QpAttr) {
-  using FuncType = struct ibv_qp* (*)(struct ibv_context*, struct ibv_qp_init_attr_ex*, struct mlx5dv_qp_init_attr*);
-  static FuncType impl = nullptr;
-  if (!impl) impl = reinterpret_cast<FuncType>(MLX5DV::dlsym("mlx5dv_create_qp"));
-  return impl(ctx, qpAttr, mlx5QpAttr);
-}
-
 struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
                                             int access) {
   // mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags.
@@ -92,12 +94,27 @@ struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset,
     resolved = true;
   }
   if (!impl) return nullptr;
-#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
-#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
-#endif
   return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
 }
 
+int MLX5DV::mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) {
+  using FuncType = int (*)(struct ibv_context*, char*, size_t);
+  static FuncType impl = nullptr;
+  static bool resolved = false;
+  if (!resolved) {
+    if (globalMLX5Handle) {
+      void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25");
+      if (!ptr) {
+        ptr = MLX5DV::dlsym("mlx5dv_get_data_direct_sysfs_path", /*allowReturnNull=*/true);
+      }
+      impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
+    }
+    resolved = true;
+  }
+  if (!impl) return -1;
+  return impl(context, buf, buf_len);
+}
+
 }  // namespace mscclpp
 
 #endif  // defined(MSCCLPP_USE_MLX5DV)
diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc
index c6299dec8..53635a0ba 100644
--- a/src/core/semaphore.cc
+++ b/src/core/semaphore.cc
@@ -123,19 +123,18 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
     THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
   }
   auto connImpl = BaseConnection::getImpl(connection());
-  if (connImpl->usesSignalForwarding()) {
-    // Signal forwarding mode: the recv thread writes the token to GPU memory.
-    // Allocate a separate inbound token via plain cudaMalloc (not TokenPool/VMM)
-    // so that it is always compatible with GDRCopy pinning (VMM memory cannot be pinned by gdr_pin_buffer).
+  if (connImpl->isSignalForwarding()) {
+    // Signal forwarding (HostNoAtomic): the receiver's recv thread polls the recv CQ for
+    // WRITE_WITH_IMM completions, then forwards the token to inboundToken_ via GDRCopy.
     CudaDeviceGuard deviceGuard(connection().localDevice().id);
 #if defined(MSCCLPP_USE_ROCM)
     inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
 #else
     inboundToken_ = detail::gpuCallocShared<uint64_t>();
 #endif
-    connImpl->setSignalForwardingDst(inboundToken_);
+    connImpl->startSignalForwarding(inboundToken_);
   }
-  // When usesSignalForwarding() is false (e.g., atomic mode), inboundToken_ stays null
+  // When isSignalForwarding() is false (atomic mode), inboundToken_ stays null
   // and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
 }
 
@@ -144,9 +143,9 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communi
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
   if (inboundToken_) {
-    // Clear the connection's signal forwarding destination (and any associated GdrMap)
+    // Clear the connection's signal forwarding destination (and GdrMap)
     // before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
-    BaseConnection::getImpl(connection())->setSignalForwardingDst(nullptr);
+    BaseConnection::getImpl(connection())->stopSignalForwarding();
   }
 }
 
@@ -158,7 +157,7 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
   Host2DeviceSemaphore::DeviceHandle device;
-  // If inboundToken_ is allocated (host-no-atomic mode), the GPU polls it.
+  // If inboundToken_ is allocated (signal forwarding mode), the GPU polls it.
   // Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
   // which is the same address targeted by the NIC's atomic operation.
   device.inboundToken =
@@ -178,12 +177,12 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
     THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
   }
   auto connImpl = BaseConnection::getImpl(connection());
-  if (connImpl->usesSignalForwarding()) {
+  if (connImpl->isSignalForwarding()) {
     // Signal forwarding mode: tell the recv thread where to write the incoming token.
     // Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
     auto token =
         std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), [](uint64_t*) {});
-    connImpl->setSignalForwardingDst(std::move(token));
+    connImpl->startSignalForwarding(std::move(token));
   }
 }
 
diff --git a/test/framework.cc b/test/framework.cc
index 73cf1272e..f5bf55aa4 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -285,6 +285,9 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
         passed++;
       } else {
         std::cout << "[  FAILED  ] " << fullName << std::endl;
+        if (!gCurrentTestFailureMessage.empty()) {
+          std::cout << "            Reason: " << gCurrentTestFailureMessage << std::endl;
+        }
         failed++;
       }
     }
diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu
index 5809dd2fd..8c91db669 100644
--- a/test/mp_unit/ib_tests.cu
+++ b/test/mp_unit/ib_tests.cu
@@ -3,8 +3,12 @@
 
 #include <mpi.h>
 
+#include <atomic>
+#include <mscclpp/atomic_device.hpp>
 #include <mscclpp/gpu_utils.hpp>
+#include <thread>
 
+#include "gdr.hpp"
 #include "mp_unit_tests.hpp"
 #include "utils_internal.hpp"
 
@@ -41,7 +45,10 @@ void IbPeerToPeerTest::SetUp() {
 
   ibCtx = std::make_shared<mscclpp::IbCtx>(ibDevName);
   bool noAtomic = !ibCtx->supportsRdmaAtomics();
-  qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, 0, 64, noAtomic);
+  // When atomics are not supported, the MemoryConsistency test uses
+  // write-with-imm which requires recv WRs on the receiver side.
+  int maxRecvWr = noAtomic ? 64 : 0;
+  qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, maxRecvWr, 64, noAtomic);
 
   qpInfo[gEnv->rank] = qp->getInfo();
   bootstrap->allGather(qpInfo.data(), sizeof(mscclpp::IbQpInfo));
@@ -199,15 +206,34 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
     // This test needs only two ranks
     return;
   }
-  if (!ibCtx->supportsRdmaAtomics()) {
-    GTEST_SKIP() << "This test requires RDMA atomics support.";
-  }
+
+  // Use atomic path if supported by the IB device.
+  bool useAtomic = ibCtx->supportsRdmaAtomics();
 
   const uint64_t signalPeriod = 1024;
   const uint64_t maxIter = 10000;
   const uint64_t nelem = 65536 + 1;
   auto data = mscclpp::detail::gpuCallocUnique<uint64_t>(nelem);
 
+  // For no-atomic mode: allocate a separate signal buffer for write-with-imm destination.
+  // The sender writes-with-imm to this buffer; the receiver's CPU thread reads the imm_data
+  // from the recv CQ and writes the iteration value to data[0] via GDRCopy atomicStore.
+  std::shared_ptr<uint64_t> signalBuf;
+  std::unique_ptr<const mscclpp::IbMr> signalMr;
+  std::array<mscclpp::IbMrInfo, 2> signalMrInfo{};
+  if (!useAtomic) {
+    signalBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+    signalMr = ibCtx->registerMr(signalBuf.get(), sizeof(uint64_t));
+    signalMrInfo[gEnv->rank] = signalMr->getInfo();
+    bootstrap->allGather(signalMrInfo.data(), sizeof(mscclpp::IbMrInfo));
+
+    // Pre-post recv WRs for write-with-imm on both ranks
+    for (int i = 0; i < 64; ++i) {
+      qp->stageRecv(0);
+    }
+    qp->postRecv();
+  }
+
   registerBufferAndConnect(data.get(), sizeof(uint64_t) * nelem);
 
   uint64_t res = 0;
@@ -226,6 +252,40 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
     ASSERT_EQ(*ptrCurIter, 0);
     ASSERT_EQ(*ptrResult, 0);
 
+    // For no-atomic mode: create a GDRCopy mapping for data[0] and start a CPU thread that
+    // polls recv CQ and forwards the signal via GDRCopy BAR1 write — the same mechanism
+    // used by IBConnection::recvThreadFunc for port channels.
+    std::atomic<bool> stopRecvThread(false);
+    std::thread recvThread;
+    std::unique_ptr<mscclpp::GdrMap> dataGdrMap;
+    if (!useAtomic) {
+      if (!mscclpp::gdrEnabled()) {
+        SKIP_TEST() << "No-atomic mode requires GDRCopy but it is not available.";
+      }
+      // Create GDRCopy BAR1 mapping for data[0] — same as how connection.cc maps inboundToken_
+      dataGdrMap =
+          std::make_unique<mscclpp::GdrMap>(std::shared_ptr<void>(data.get(), [](void*) {}),  // non-owning shared_ptr
+                                            cudaDevId);
+
+      recvThread = std::thread([&]() {
+        while (!stopRecvThread.load(std::memory_order_relaxed)) {
+          int wcNum = qp->pollRecvCq();
+          if (wcNum <= 0) continue;
+          for (int i = 0; i < wcNum; ++i) {
+            int status = qp->getRecvWcStatus(i);
+            if (status != static_cast<int>(mscclpp::WsStatus::Success)) continue;
+            uint64_t val = static_cast<uint64_t>(qp->getRecvWcImmData(i));
+            // Write the iteration value to data[0] via GDRCopy BAR1 atomicStore —
+            // same pattern as IBConnection::recvThreadFunc.
+            mscclpp::atomicStore(dataGdrMap->hostPtr(), val, mscclpp::memoryOrderRelaxed);
+            // Re-post recv
+            qp->stageRecv(0);
+            qp->postRecv();
+          }
+        }
+      });
+    }
+
     kernelMemoryConsistency<<<1, 1024>>>(data.get(), ptrCurIter, ptrResult, nelem, maxIter);
     MSCCLPP_CUDATHROW(cudaGetLastError());
 
@@ -247,6 +307,11 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
     }
 
     MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+
+    if (!useAtomic) {
+      stopRecvThread.store(true, std::memory_order_relaxed);
+      if (recvThread.joinable()) recvThread.join();
+    }
   } else if (gEnv->rank == 1) {
     // Sender
     std::vector<uint64_t> hostBuffer(nelem, 0);
@@ -267,15 +332,20 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
       stageSendWrite(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled);
       qp->postSend();
 
-#if 0
-      // For reference: send the first element using a normal send. This should occasionally see a wrong result.
-      stageSendWrite(sizeof(uint64_t), 0, 0, 0, false);
-      qp->postSend();
-#else
-      // Send the first element using AtomicAdd. This should see the correct result.
-      stageSendAtomicAdd(0, 0, 1, false);
-      qp->postSend();
-#endif
+      if (useAtomic) {
+        // Send the first element using AtomicAdd. The non-posted PCIe atomic operation
+        // provides end-to-end ordering: data[1..N] are guaranteed visible when data[0] updates.
+        stageSendAtomicAdd(0, 0, 1, false);
+        qp->postSend();
+      } else {
+        // No-atomic mode: send a 0-byte WRITE_WITH_IMM carrying the iteration in imm_data.
+        // The receiver's CPU thread polls the recv CQ and writes the value to data[0]
+        // via GDRCopy atomicStore.
+        // QP ordering guarantees data[1..N] WRITE completes before this write-with-imm.
+        const mscclpp::IbMrInfo& remoteSignalMrInfo = signalMrInfo[(gEnv->rank == 1) ? 0 : 1];
+        qp->stageSendWriteWithImm(nullptr, remoteSignalMrInfo, 0, 0, 0, 0, false, static_cast<unsigned int>(iter));
+        qp->postSend();
+      }
 
       if (signaled) {
         int wcNum = qp->pollSendCq();
@@ -296,13 +366,23 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
     }
   }
 
-  if (res & 2) {
-    FAIL() << "The receiver is stuck at iteration " << iter << ".";
-  } else if (res != 0 && res != 1) {
-    FAIL() << "Unknown error is detected at iteration " << iter << ". res =" << res;
+  if (useAtomic) {
+    // With RDMA atomics, memory consistency must be guaranteed.
+    if (res & 2) {
+      FAIL() << "The receiver is stuck at iteration " << iter << ".";
+    }
+    EXPECT_EQ(res, 0);
+  } else {
+    if (res == 0) {
+      // No-atomic path works correctly here.
+    } else if (res & 2) {
+      SKIP_TEST() << "No-atomic signal forwarding: receiver stuck at iteration " << iter
+                  << ". NIC DMA and CPU writes are not ordered on this platform.";
+    } else {
+      SKIP_TEST() << "No-atomic signal forwarding: memory inconsistency detected at iteration " << iter
+                  << ". NIC DMA and CPU writes are not ordered on this platform.";
+    }
   }
-
-  EXPECT_EQ(res, 0);
 }
 
 TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
@@ -311,7 +391,7 @@ TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
     return;
   }
   if (!ibCtx->supportsRdmaAtomics()) {
-    GTEST_SKIP() << "This test requires RDMA atomics support.";
+    SKIP_TEST() << "This test requires RDMA atomics support.";
   }
 
   mscclpp::Timer timeout(3);
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index 764c32999..b69f388a8 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -4,9 +4,24 @@
 #include <cstdint>
 #include <mscclpp/concurrency_device.hpp>
 
+#include "gdr.hpp"
 #include "mp_unit_tests.hpp"
 #include "utils_internal.hpp"
 
+// Skip the current test if HostNoAtomic mode is not supported.
+// On CUDA, HostNoAtomic requires GDRCopy for BAR1 signal forwarding.
+// On ROCm, HostNoAtomic uses direct volatile writes and does not need GDRCopy.
+#if defined(MSCCLPP_USE_CUDA)
+#define REQUIRE_HOST_NO_ATOMIC                                                         \
+  do {                                                                                 \
+    if (!mscclpp::gdrEnabled()) {                                                      \
+      SKIP_TEST() << "HostNoAtomic requires GDRCopy: " << mscclpp::gdrStatusMessage(); \
+    }                                                                                  \
+  } while (0)
+#else
+#define REQUIRE_HOST_NO_ATOMIC  // No extra requirements on non-CUDA platforms.
+#endif
+
 void PortChannelOneToOneTest::SetUp() {
   // Use only two ranks
   setNumRanksToUse(2);
@@ -272,6 +287,7 @@ TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
 
 TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_HOST_NO_ATOMIC;
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
 }
@@ -465,16 +481,19 @@ TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
 
 TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_HOST_NO_ATOMIC;
   testPacketPingPongPerf(true, IbMode::HostNoAtomic);
 }
 
 TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_HOST_NO_ATOMIC;
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
 }
 
 TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_HOST_NO_ATOMIC;
   testPacketPingPong(true, IbMode::HostNoAtomic);
 }
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 7836e0632..a345effcb 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -4,6 +4,7 @@
 target_sources(unit_tests PRIVATE
     unit_tests_main.cc
     core_tests.cc
+    gdr_tests.cu
     gpu_utils_tests.cc
     errors_tests.cc
     fifo_tests.cu
diff --git a/test/unit/gdr_tests.cu b/test/unit/gdr_tests.cu
new file mode 100644
index 000000000..78bb2e1ad
--- /dev/null
+++ b/test/unit/gdr_tests.cu
@@ -0,0 +1,251 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <mscclpp/atomic_device.hpp>
+#include <mscclpp/errors.hpp>
+#include <mscclpp/gpu_utils.hpp>
+
+#include "../framework.hpp"
+#include "gdr.hpp"
+
+// GdrStatus and gdrEnabled
+
+class GdrStatusTest : public ::mscclpp::test::TestCase {};
+
+TEST(GdrStatusTest, StatusIsValid) {
+  // gdrStatus() should return one of the defined enum values
+  auto status = mscclpp::gdrStatus();
+  ASSERT_TRUE(status == mscclpp::GdrStatus::Ok || status == mscclpp::GdrStatus::NotBuilt ||
+              status == mscclpp::GdrStatus::Disabled || status == mscclpp::GdrStatus::DriverMissing ||
+              status == mscclpp::GdrStatus::OpenFailed);
+}
+
+TEST(GdrStatusTest, EnabledConsistentWithStatus) {
+  // gdrEnabled() should be true iff gdrStatus() == Ok
+  EXPECT_EQ(mscclpp::gdrEnabled(), mscclpp::gdrStatus() == mscclpp::GdrStatus::Ok);
+}
+
+// GdrMap tests — only run when GDRCopy is available
+
+class GdrMapTest : public ::mscclpp::test::TestCase {
+ protected:
+  void SetUp() override {
+    if (!mscclpp::gdrEnabled()) {
+      SKIP_TEST() << "GDRCopy not enabled on this platform.";
+    }
+    MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
+    // Try creating a GDRCopy mapping to check if pin+map works on this platform.
+    try {
+      auto testMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+      mscclpp::GdrMap testMap(std::static_pointer_cast<void>(testMem), deviceId_);
+    } catch (const std::exception&) {
+      SKIP_TEST() << "GDRCopy mapping not supported on this platform.";
+    }
+  }
+
+  int deviceId_ = 0;
+};
+
+TEST(GdrMapTest, BasicMapping) {
+  // Allocate GPU memory via cudaMalloc (not VMM) and create a GDRCopy mapping
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+
+  ASSERT_TRUE(map.valid());
+  EXPECT_NE(map.hostPtr(), nullptr);
+}
+
+TEST(GdrMapTest, CopyToAndFrom) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write a value to GPU via GDRCopy
+  uint64_t writeVal = 0xDEADBEEFCAFE0123ULL;
+  map.copyTo(&writeVal, sizeof(uint64_t));
+
+  // Read it back via GDRCopy
+  uint64_t readVal = 0;
+  map.copyFrom(&readVal, sizeof(uint64_t));
+  EXPECT_EQ(readVal, writeVal);
+
+  // Also verify via cudaMemcpy
+  uint64_t cudaVal = 0;
+  MSCCLPP_CUDATHROW(cudaMemcpy(&cudaVal, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
+  EXPECT_EQ(cudaVal, writeVal);
+}
+
+TEST(GdrMapTest, CopyToVisibleFromGpu) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write via GDRCopy, verify GPU sees it via cudaMemcpy
+  uint64_t val = 42;
+  map.copyTo(&val, sizeof(uint64_t));
+
+  uint64_t result = 0;
+  MSCCLPP_CUDATHROW(cudaMemcpy(&result, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
+  EXPECT_EQ(result, 42);
+}
+
+TEST(GdrMapTest, MultipleWritesReadBack) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write multiple values sequentially and verify each
+  for (uint64_t i = 1; i <= 100; ++i) {
+    map.copyTo(&i, sizeof(uint64_t));
+    uint64_t readback = 0;
+    map.copyFrom(&readback, sizeof(uint64_t));
+    EXPECT_EQ(readback, i);
+    if (readback != i) break;
+  }
+}
+
+TEST(GdrMapTest, HostPtrIsWritable) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write directly through the hostPtr (volatile store)
+  volatile uint64_t* ptr = reinterpret_cast<volatile uint64_t*>(map.hostPtr());
+  *ptr = 12345;
+
+  // Read back via GDRCopy
+  uint64_t readback = 0;
+  map.copyFrom(&readback, sizeof(uint64_t));
+  EXPECT_EQ(readback, 12345);
+}
+
+TEST(GdrMapTest, HostPtrIsReadable) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write via GDRCopy copyTo (same BAR1 path as the read)
+  uint64_t val = 99999;
+  map.copyTo(&val, sizeof(uint64_t));
+
+  // Read through the hostPtr (volatile load via BAR1)
+  volatile uint64_t* ptr = reinterpret_cast<volatile uint64_t*>(map.hostPtr());
+  EXPECT_EQ(*ptr, 99999);
+}
+
+TEST(GdrMapTest, DestroyDoesNotCrash) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  {
+    mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+    ASSERT_TRUE(map.valid());
+    uint64_t val = 1;
+    map.copyTo(&val, sizeof(uint64_t));
+  }
+  // After GdrMap is destroyed, gpuMem should still be valid
+  uint64_t result = 0;
+  MSCCLPP_CUDATHROW(cudaMemcpy(&result, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
+  EXPECT_EQ(result, 1);
+}
+
+// GPU kernel: polls signalFromCpu until it reaches expectedIter, then writes expectedIter to ackToHost.
+// Repeats for maxIter iterations. The GPU uses system-scope acquire loads on signalFromCpu
+// and plain stores to ackToHost (which is host-pinned memory visible to CPU).
+__global__ void kernelGdrVisibilityPingPong(volatile uint64_t* signalFromCpu, volatile uint64_t* ackToHost,
+                                            uint64_t maxIter) {
+  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
+    // Poll until CPU writes the expected iteration value via GDRCopy BAR1
+    while (*signalFromCpu < iter) {
+    }
+    // Ack back to CPU via host-pinned memory
+    *ackToHost = iter;
+  }
+}
+
+TEST(GdrMapTest, CpuGpuVisibilityPingPong) {
+  const uint64_t maxIter = 10000;
+
+  // signalBuf: GPU memory mapped via GDRCopy BAR1. CPU writes here, GPU polls.
+  auto signalBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap signalMap(std::static_pointer_cast<void>(signalBuf), deviceId_);
+  ASSERT_TRUE(signalMap.valid());
+
+  // ackBuf: host-pinned memory (gpuCallocHostShared). GPU writes here, CPU polls.
+  auto ackBuf = mscclpp::detail::gpuCallocHostShared<uint64_t>(1);
+  volatile uint64_t* ackPtr = reinterpret_cast<volatile uint64_t*>(ackBuf.get());
+  *ackPtr = 0;
+
+  // Launch kernel — it will poll signalBuf and write ackBuf for each iteration
+  kernelGdrVisibilityPingPong<<<1, 1>>>(signalBuf.get(), ackBuf.get(), maxIter);
+  MSCCLPP_CUDATHROW(cudaGetLastError());
+
+  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
+    // CPU writes iteration value to GPU via GDRCopy BAR1
+    uint64_t val = iter;
+    signalMap.copyTo(&val, sizeof(uint64_t));
+
+    // CPU polls host-pinned ack until GPU confirms it saw the value
+    int spin = 0;
+    while (*ackPtr < iter) {
+      if (++spin > 100000000) {
+        FAIL() << "GPU did not ack iteration " << iter << " (ack=" << *ackPtr << ")";
+      }
+    }
+  }
+
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+  EXPECT_EQ(*ackPtr, maxIter);
+}
+
+// GPU kernel that polls a counter using system-scope acquire load.
+// When counter >= expectedIter, writes ack.
+__global__ void kernelCounterWait(uint64_t* counter, volatile uint64_t* ackToHost, uint64_t maxIter) {
+  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
+    // System-scope acquire load — matches the atomicStore(relaxed) on the CPU side
+    uint64_t got;
+    do {
+      got = mscclpp::atomicLoad(counter, mscclpp::memoryOrderAcquire);
+    } while (got < iter);
+    // Ack back
+    *ackToHost = iter;
+  }
+}
+
+// Test the GDRCopy counter pattern used by HostNoAtomic mode:
+// - GPU memory allocated via gpuCallocShared (cudaMalloc)
+// - GdrMap for BAR1 mapping
+// - CPU writes via atomicStore(relaxed) through GDRCopy BAR1 mapping
+// - GPU reads via atomicLoad with memory_order_acquire
+TEST(GdrMapTest, AtomicStoreCounterPingPong) {
+  const uint64_t maxIter = 10000;
+
+  // Allocate GPU memory via gpuCallocShared
+  auto counterBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap counterMap(std::static_pointer_cast<void>(counterBuf), deviceId_);
+  ASSERT_TRUE(counterMap.valid());
+
+  // Ack buffer: host-pinned memory
+  auto ackBuf = mscclpp::detail::gpuCallocHostShared<uint64_t>(1);
+  volatile uint64_t* ackPtr = reinterpret_cast<volatile uint64_t*>(ackBuf.get());
+  *ackPtr = 0;
+
+  // Launch kernel — polls counterBuf with system-scope acquire load
+  kernelCounterWait<<<1, 1>>>(counterBuf.get(), ackBuf.get(), maxIter);
+  MSCCLPP_CUDATHROW(cudaGetLastError());
+
+  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
+    // CPU writes counter via atomicStore (relaxed — GPU uses acquire on read)
+    mscclpp::atomicStore(counterMap.hostPtr(), iter, mscclpp::memoryOrderRelaxed);
+
+    // Wait for GPU ack
+    int spin = 0;
+    while (*ackPtr < iter) {
+      if (++spin > 100000000) {
+        MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+        FAIL() << "GPU did not ack iteration " << iter;
+      }
+    }
+  }
+
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+  EXPECT_EQ(*ackPtr, maxIter);
+}

From d1124fba29d9da302b131d737dcf59564270ed07 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 18:20:29 +0000
Subject: [PATCH 086/132] revert

---
 src/core/ibverbs_wrapper.cc | 40 -------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/src/core/ibverbs_wrapper.cc b/src/core/ibverbs_wrapper.cc
index 4fdf1b1e1..a147e4582 100644
--- a/src/core/ibverbs_wrapper.cc
+++ b/src/core/ibverbs_wrapper.cc
@@ -10,32 +10,11 @@
 
 #include "logger.hpp"
 
-// NOTE: MRC_SUPPORT is a temporal macro that makes the current MRC implementation work.
-// MRC_SUPPORT is needed because the current libibverbs implmentation of MRC does not provide
-// all symbols that we need, so we need to load some symbols from the original libibverbs.
-// This macro will be removed (set 0) once MRC provides all necessary symbols.
-// Non-MRC environments will not be affected by this macro as long as VMRC_LIBIBVERBS_SO
-// environment variable is not set.
-#define MRC_SUPPORT 1
-#if (MRC_SUPPORT)
-#include <cstdlib>
-#include <set>
-#endif  // (MRC_SUPPORT)
-
 namespace mscclpp {
 
 static std::unique_ptr<void, int (*)(void*)> globalIBVerbsHandle(nullptr, &::dlclose);
-#if (MRC_SUPPORT)
-static std::unique_ptr<void, int (*)(void*)> globalOrigIBVerbsHandle(nullptr, &::dlclose);
-#endif  // (MRC_SUPPORT)
 
 void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
-#if (MRC_SUPPORT)
-  static std::set<std::string> mrcSymbols = {
-      "ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp",
-      "ibv_create_cq",       "ibv_destroy_cq",      "ibv_create_qp",   "ibv_modify_qp",    "ibv_destroy_qp",
-  };
-#endif  // (MRC_SUPPORT)
   if (!globalIBVerbsHandle) {
     if (mscclpp::env()->ibvSo != "") {
       void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
@@ -56,26 +35,7 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
       THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror()));
     }
   }
-#if (MRC_SUPPORT)
-  // In MRC mode, `VMRC_LIBIBVERBS_SO` should be set.
-  char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO");
-  void* ptr;
-  if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) {
-    // If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs.
-    if (!globalOrigIBVerbsHandle) {
-      void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW);
-      if (!handle) {
-        THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo));
-      }
-      globalOrigIBVerbsHandle.reset(handle);
-    }
-    ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str());
-  } else {
-    ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
-  }
-#else   // !(MRC_SUPPORT)
   void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
-#endif  // !(MRC_SUPPORT)
   if (!ptr && !allowReturnNull) {
     THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol);
   }

From 144046b8187ad67f7f81eee3290e281c61aba496 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 18:22:16 +0000
Subject: [PATCH 087/132] revert

---
 src/core/ibverbs_wrapper.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/core/ibverbs_wrapper.cc b/src/core/ibverbs_wrapper.cc
index a147e4582..51f3f29c6 100644
--- a/src/core/ibverbs_wrapper.cc
+++ b/src/core/ibverbs_wrapper.cc
@@ -20,6 +20,9 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
       void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
       if (handle) {
         globalIBVerbsHandle.reset(handle);
+      } else {
+        THROW(NET, SysError, errno, "Failed to load libibverbs library specified by MSCCLPP_IBV_SO ('",
+              mscclpp::env()->ibvSo, "'): ", std::string(::dlerror()));
       }
     } else {
       const char* possibleLibNames[] = {"libibverbs.so", "libibverbs.so.1", nullptr};

From f8e94d99719e9b7dea5977e1a8f980f3ca87bd12 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 19:00:03 +0000
Subject: [PATCH 088/132] disable mlx5dv_reg_dmabuf_mr

---
 src/core/ib.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/ib.cc b/src/core/ib.cc
index f783daa9f..390e0a5c7 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -91,7 +91,7 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu
                       IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
 #if defined(MSCCLPP_USE_MLX5DV)
     if (isDataDirect && MLX5DV::isAvailable()) {
-      mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+      // mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
     }
 #endif
     if (mr_ == nullptr) {

From 4cf53328ad8ac744fe65d7dd52c0e7bc65360180 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 19:36:52 +0000
Subject: [PATCH 089/132] updates

---
 src/core/connection.cc  |  1 -
 src/core/ib.cc          | 10 +++++-----
 src/core/include/ib.hpp |  6 +++---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index 172bca390..9767a3152 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -402,7 +402,6 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
     // has been committed to GPU memory. The recv thread then forwards the token to the
     // semaphore's inbound token via GDRCopy atomicStore.
     unsigned int immData = static_cast<unsigned int>(newValue);
-    *atomicSrc_ = newValue;
     qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
                                       /*size=*/0, /*wrId=*/0,
                                       /*srcOffset=*/0, /*dstOffset=*/0,
diff --git a/src/core/ib.cc b/src/core/ib.cc
index 390e0a5c7..0b37ea5c1 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -90,7 +90,7 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu
     int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
                       IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
 #if defined(MSCCLPP_USE_MLX5DV)
-    if (isDataDirect && MLX5DV::isAvailable()) {
+    if (isDataDirect) {
       // mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
     }
 #endif
@@ -452,7 +452,7 @@ IbCtx::IbCtx(const std::string& devName)
       pd_(nullptr),
       supportsRdmaAtomics_(false),
       isMlx5_(false),
-      dataDirect_(false),
+      isDataDirect_(false),
       isVF_(false) {
   int num;
   struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
@@ -496,7 +496,7 @@ IbCtx::IbCtx(const std::string& devName)
     char sysfsPath[256];
     int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath));
     if (ret == 0) {
-      dataDirect_ = true;
+      isDataDirect_ = true;
       INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")");
     } else {
       INFO(NET, "IB device ", devName_, " does not support Data Direct");
@@ -578,14 +578,14 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
 }
 
 std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
-  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, dataDirect_));
+  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, isDataDirect_));
 }
 
 bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; }
 
 bool IbCtx::isMlx5() const { return isMlx5_; }
 
-bool IbCtx::supportsDataDirect() const { return dataDirect_; }
+bool IbCtx::isDataDirect() const { return isDataDirect_; }
 
 bool IbCtx::isVirtualFunction() const { return isVF_; }
 
diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp
index 923a7ca08..36c5a2373 100644
--- a/src/core/include/ib.hpp
+++ b/src/core/include/ib.hpp
@@ -144,7 +144,7 @@ class IbCtx {
   std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
   bool supportsRdmaAtomics() const;
   bool isMlx5() const;
-  bool supportsDataDirect() const;
+  bool isDataDirect() const;
   bool isVirtualFunction() const;
 #else
   IbCtx([[maybe_unused]] const std::string& devName) {}
@@ -156,7 +156,7 @@ class IbCtx {
   }
   bool supportsRdmaAtomics() const { return false; }
   bool isMlx5() const { return false; }
-  bool supportsDataDirect() const { return false; }
+  bool isDataDirect() const { return false; }
   bool isVirtualFunction() const { return false; }
 #endif
 
@@ -171,7 +171,7 @@ class IbCtx {
   ibv_pd* pd_;
   bool supportsRdmaAtomics_;
   bool isMlx5_;
-  bool dataDirect_;
+  bool isDataDirect_;
   bool isVF_;
 };
 

From 848b89b59c2f61b1834e6aaf32e4bdabc857a1ef Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 21:00:54 +0000
Subject: [PATCH 090/132] 64-bit token reconstruction

---
 src/core/connection.cc          | 28 ++++++++++++++++++++++------
 src/core/include/connection.hpp |  9 +++++----
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index 9767a3152..db978943b 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -198,6 +198,8 @@ void IBConnection::recvThreadFunc() {
     }
   }
 
+  uint32_t lastImmData = 0;
+  uint64_t immHighBits = 0;
   uint64_t newValueHost = 0;
 
   auto qp = qp_.lock();
@@ -220,8 +222,15 @@ void IBConnection::recvThreadFunc() {
         continue;
       }
 
-      // Read the token from imm_data (always available and correct in the CQE).
-      newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
+      // Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value
+      // using wrap-around detection: tokens increase monotonically, so if the new lower 32 bits
+      // are less than the previous value, the upper 32 bits must have incremented by 1.
+      uint32_t immData = qp->getRecvWcImmData(i);
+      if (immData < lastImmData) {
+        immHighBits += (1ULL << 32);
+      }
+      lastImmData = immData;
+      newValueHost = immHighBits | static_cast<uint64_t>(immData);
 
       // Forward the token to the semaphore's inbound token address via atomicStore
       // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
@@ -397,10 +406,17 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
   *src = newValue;
 
   if (ibNoAtomic_) {
-    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the token in imm_data.
-    // The receiver's recv thread polls the CQE, which guarantees the preceding data WRITE
-    // has been committed to GPU memory. The recv thread then forwards the token to the
-    // semaphore's inbound token via GDRCopy atomicStore.
+    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the
+    // token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around
+    // detection (tokens are monotonically increasing, so a decrease in the lower 32 bits
+    // indicates the upper 32 bits incremented by 1).
+    if (newValue <= oldValue) {
+      WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ",
+           newValue);
+    } else if (newValue - oldValue >= (1ULL << 32)) {
+      WARN(CONN, "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ",
+           oldValue, " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
+    }
     unsigned int immData = static_cast<unsigned int>(newValue);
     qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
                                       /*size=*/0, /*wrId=*/0,
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 47b03d6c4..432ce9ab0 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -113,13 +113,14 @@ class IBConnection : public BaseConnection {
   int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping
 
   // Signal forwarding design (HostNoAtomic mode):
-  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the token value in imm_data (32-bit).
+  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data.
   // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
-  //   the token from imm_data, then writes it to signalAddr_ (the semaphore's
-  //   inbound token) via atomicStore through the GDRCopy BAR1 mapping. The GPU reads
-  //   inboundToken with system-scope acquire ordering.
+  //   the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around
+  //   detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half
+  //   incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1.
   uint64_t signalAddr_;
 
+
   std::unique_ptr<GdrMap> signalGdrMap_;
 
   void recvThreadFunc();

From 94d0508ec248e57c632b1686f2cd03ebcd21d8b8 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 21:18:47 +0000
Subject: [PATCH 091/132] prerequisites update

---
 docs/quickstart.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/quickstart.md b/docs/quickstart.md
index b7a68050e..e0a383b71 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -31,6 +31,9 @@
         ```
         If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
     * (Optional, for benchmarks) MPI
+    * (Optional, for NVIDIA platforms) [GDRCopy](https://github.com/NVIDIA/gdrcopy) >= 2.5.0
+        * GDRCopy is required for IB `HostNoAtomic` mode, which uses CPU-side signal forwarding to GPU memory via BAR1 mappings. This mode is used on platforms where RDMA atomics are not available (e.g., when using Data Direct Virtual Functions).
+        * Install GDRCopy from source or via packages. See the [GDRCopy installation guide](https://github.com/NVIDIA/gdrcopy#installation).
 * Others
     * For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites).
     * For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above.

From 553fd3b2d8e3524f8b587777e7ca934822cd9e0a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 21:20:55 +0000
Subject: [PATCH 092/132] lint

---
 src/core/connection.cc          | 8 ++++----
 src/core/include/connection.hpp | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index db978943b..26d9e169a 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -411,11 +411,11 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
     // detection (tokens are monotonically increasing, so a decrease in the lower 32 bits
     // indicates the upper 32 bits incremented by 1).
     if (newValue <= oldValue) {
-      WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ",
-           newValue);
+      WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ", newValue);
     } else if (newValue - oldValue >= (1ULL << 32)) {
-      WARN(CONN, "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ",
-           oldValue, " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
+      WARN(CONN,
+           "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ", oldValue,
+           " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
     }
     unsigned int immData = static_cast<unsigned int>(newValue);
     qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 432ce9ab0..c744b168f 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -120,7 +120,6 @@ class IBConnection : public BaseConnection {
   //   incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1.
   uint64_t signalAddr_;
 
-
   std::unique_ptr<GdrMap> signalGdrMap_;
 
   void recvThreadFunc();

From f62633ad4152fe39d1a09b5a674baa6f44f0c90c Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 4 Apr 2026 06:18:44 +0000
Subject: [PATCH 093/132] mlx5dv bug fixes & enhanced unit tests perf reporting

---
 src/core/connection.cc               |  22 ++++--
 src/core/ib.cc                       |  36 +++++++++-
 src/core/include/connection.hpp      |   2 +
 src/core/mlx5dv_wrapper.cc           |  10 ++-
 test/framework.cc                    |  45 +++++++++++-
 test/framework.hpp                   |   7 ++
 test/mp_unit/ib_tests.cu             |   8 +--
 test/mp_unit/memory_channel_tests.cu |   6 +-
 test/mp_unit/mp_unit_tests.hpp       |   1 +
 test/mp_unit/port_channel_tests.cu   | 101 ++++++++++++++++++++++++---
 test/mp_unit/semaphore_perf_tests.cu |   2 +-
 11 files changed, 211 insertions(+), 29 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index 26d9e169a..8b6c0afbf 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -208,18 +208,22 @@ void IBConnection::recvThreadFunc() {
   while (!stopRecvThread_.load(std::memory_order_relaxed)) {
     int wcNum = qp->pollRecvCq();
     if (wcNum < 0) {
-      WARN(NET, "IBConnection recvThreadFunc: pollRecvCq failed");
+      recvThreadErrorMsg_ = "pollRecvCq failed";
+      recvThreadError_.store(true, std::memory_order_release);
+      WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
       break;
     }
 
     for (int i = 0; i < wcNum; ++i) {
       int status = qp->getRecvWcStatus(i);
       if (status != static_cast<int>(WsStatus::Success)) {
-        WARN(NET, "IBConnection recvThreadFunc: recv work completion failed: ", qp->getRecvWcStatusString(i));
-        // Post another recv to replace the failed one
-        qp->stageRecv(/*wrId=*/0);
-        qp->postRecv();
-        continue;
+        // A failed recv WC typically means the QP entered error state (e.g., WR Flushed Error).
+        // All remaining WRs will also fail — no recovery without QP recreation. Exit the thread
+        // and set the error flag so the main thread can detect it.
+        recvThreadErrorMsg_ = std::string("recv work completion failed: ") + qp->getRecvWcStatusString(i);
+        recvThreadError_.store(true, std::memory_order_release);
+        WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
+        return;
       }
 
       // Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value
@@ -260,6 +264,7 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
       ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
       gdrSignalForwarding_(false),
       stopRecvThread_(false),
+      recvThreadError_(false),
       localGpuDeviceId_(localEndpoint.device().id),
       signalAddr_(0) {
   qp_ = getImpl(localEndpoint).ibQp_;
@@ -442,6 +447,11 @@ void IBConnection::flush(int64_t timeoutUsec) {
   NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_FLUSH_ENTRY, 0, 0, *NpKit::GetCpuTimestamp(), 0);
 #endif
 
+  // Check if the recv thread has already reported an error (e.g., QP entered error state).
+  if (recvThreadError_.load(std::memory_order_acquire)) {
+    THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_);
+  }
+
   Timer timer;
   while (qp_.lock()->getNumSendCqItems()) {
     int wcNum = qp_.lock()->pollSendCq();
diff --git a/src/core/ib.cc b/src/core/ib.cc
index 0b37ea5c1..290511e40 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -83,20 +83,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu
   bool isGpuBuff = (gpuId != -1);
   if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
 #if !defined(MSCCLPP_USE_ROCM)
-    int fd;
-    MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+    int fd = -1;
+    size_t rangeSize = pages * pageSize;
+
+    // Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU
+    // bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag
+    // routes DMA through the Data Direct engine for correct ordering and higher throughput.
+    // Fall back to the default (non-PCIe) mapping if the flag is unsupported.
+#if (CUDA_VERSION >= 12030)
+    CUresult cuRes = cuMemGetHandleForAddressRange(
+        &fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
+    if (cuRes != CUDA_SUCCESS || fd < 0) {
+      if (fd >= 0) ::close(fd);
+      fd = -1;
+    }
+    bool usedPcieFlag = (fd >= 0);
+#endif  // CUDA_VERSION >= 12030
+    if (fd < 0) {
+      MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+    }
 
+    // Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API
+    // which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs.
     size_t offsetInDmaBuf = buffIntPtr % pageSize;
     int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
                       IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
+
 #if defined(MSCCLPP_USE_MLX5DV)
     if (isDataDirect) {
-      // mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+      mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
     }
 #endif
     if (mr_ == nullptr) {
       mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
     }
+
+    // If MR registration failed with a PCIe-mapped fd, retry with the default mapping.
+#if (CUDA_VERSION >= 12030)
+    if (mr_ == nullptr && usedPcieFlag) {
+      ::close(fd);
+      MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+      mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+    }
+#endif  // CUDA_VERSION >= 12030
+
     ::close(fd);
     if (mr_ == nullptr) {
       THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index c744b168f..077a6c6af 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -110,6 +110,8 @@ class IBConnection : public BaseConnection {
   bool gdrSignalForwarding_;  // ibNoAtomic_ && gdrEnabled() — decided once at construction
   std::thread recvThread_;
   std::atomic<bool> stopRecvThread_;
+  std::atomic<bool> recvThreadError_;    // Set by recv thread on fatal error
+  std::string recvThreadErrorMsg_;       // Error message from recv thread (written before recvThreadError_ is set)
   int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping
 
   // Signal forwarding design (HostNoAtomic mode):
diff --git a/src/core/mlx5dv_wrapper.cc b/src/core/mlx5dv_wrapper.cc
index 5d13d9c81..a56fad96b 100644
--- a/src/core/mlx5dv_wrapper.cc
+++ b/src/core/mlx5dv_wrapper.cc
@@ -85,12 +85,18 @@ bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) {
 struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
                                             int access) {
   // mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags.
+  // Must use dlvsym with "MLX5_1.25" version to get the Data Direct-capable symbol.
   using FuncType = struct ibv_mr* (*)(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int, int);
   static FuncType impl = nullptr;
   static bool resolved = false;
   if (!resolved) {
-    void* ptr = MLX5DV::dlsym("mlx5dv_reg_dmabuf_mr", /*allowReturnNull=*/true);
-    impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
+    if (globalMLX5Handle) {
+      void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_reg_dmabuf_mr", "MLX5_1.25");
+      if (!ptr) {
+        ptr = MLX5DV::dlsym("mlx5dv_reg_dmabuf_mr", /*allowReturnNull=*/true);
+      }
+      impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
+    }
     resolved = true;
   }
   if (!impl) return nullptr;
diff --git a/test/framework.cc b/test/framework.cc
index f5bf55aa4..f62d8bbd8 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -20,8 +20,30 @@ static bool gCurrentTestPassed = true;
 static std::string gCurrentTestFailureMessage;
 static std::string gCurrentTestName;
 
+// Performance result collection
+struct PerfResult {
+  std::string label;
+  double value;
+  std::string unit;
+};
+struct PerfTestResults {
+  std::string testName;
+  std::vector<PerfResult> results;
+};
+static std::vector<PerfTestResults> gPerfResults;
+
 std::string currentTestName() { return gCurrentTestName; }
 
+void reportPerfResult(const std::string& label, double value, const std::string& unit) {
+  if (gMpiRank != 0) return;
+  if (gCurrentTestName.empty()) return;
+  // Find or create entry for the current test
+  if (gPerfResults.empty() || gPerfResults.back().testName != gCurrentTestName) {
+    gPerfResults.push_back({gCurrentTestName, {}});
+  }
+  gPerfResults.back().results.push_back({label, value, unit});
+}
+
 namespace utils {
 
 void initializeMPI(int argc, char* argv[]) {
@@ -151,6 +173,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   // Parse command line arguments
   std::string filter;
   bool excludePerfTests = false;
+  bool onlyPerfTests = false;
 
   for (int i = 1; i < argc; ++i) {
     std::string arg = argv[i];
@@ -161,6 +184,8 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       ++i;
     } else if (arg == "--exclude-perf-tests") {
       excludePerfTests = true;
+    } else if (arg == "--only-perf-tests") {
+      onlyPerfTests = true;
     }
   }
 
@@ -189,11 +214,15 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       skippedByFilter++;
       continue;
     }
+    if (onlyPerfTests && !entry.isPerfTest) {
+      skippedByFilter++;
+      continue;
+    }
     if (!matchesFilter(fullName, filter)) {
       skippedByFilter++;
       continue;
     }
-    totalToRun++;
+    totalToRun++;;
   }
 
   if (gMpiRank == 0) {
@@ -208,6 +237,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     std::string fullName = entry.suiteName + "." + entry.testName;
 
     if (excludePerfTests && entry.isPerfTest) continue;
+    if (onlyPerfTests && !entry.isPerfTest) continue;
     if (!matchesFilter(fullName, filter)) continue;
 
     gCurrentTestPassed = true;
@@ -304,6 +334,19 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     if (failed > 0) {
       std::cout << "[  FAILED  ] " << failed << " tests.\n";
     }
+
+    // Print collected performance results
+    if (!gPerfResults.empty()) {
+      std::cout << "\n[   PERF   ] Performance results:\n";
+      for (const auto& testResult : gPerfResults) {
+        std::cout << "[   PERF   ] " << testResult.testName << "\n";
+        for (const auto& r : testResult.results) {
+          std::cout << "[   PERF   ]   " << std::setw(12) << r.label << ": " << std::setprecision(4) << r.value << " "
+                    << r.unit << "\n";
+        }
+      }
+      gPerfResults.clear();
+    }
   }
 
   // Tear down global test environments (in reverse order)
diff --git a/test/framework.hpp b/test/framework.hpp
index 26a32d5bc..b2431ed9c 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -63,6 +63,13 @@ class TestRegistry {
 // Returns "Suite.Name" for the currently running test, or "" if none.
 std::string currentTestName();
 
+/// Collect a performance result for the current test. Results are printed together
+/// after all tests complete. Only rank 0 should call this (results are ignored on other ranks).
+/// @param label A label for this measurement (e.g., "128 MB" or "latency").
+/// @param value The numeric result.
+/// @param unit The unit string (e.g., "GB/s", "us/iter").
+void reportPerfResult(const std::string& label, double value, const std::string& unit);
+
 // Utility functions
 namespace utils {
 
diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu
index 8c91db669..e5945563e 100644
--- a/test/mp_unit/ib_tests.cu
+++ b/test/mp_unit/ib_tests.cu
@@ -86,7 +86,7 @@ void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint6
   qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData);
 }
 
-TEST(IbPeerToPeerTest, SimpleSendRecv) {
+PERF_TEST(IbPeerToPeerTest, SimpleSendRecv) {
   if (gEnv->rank >= 2) {
     // This test needs only two ranks
     return;
@@ -122,7 +122,7 @@ TEST(IbPeerToPeerTest, SimpleSendRecv) {
       }
     }
     float us = (float)timer.elapsed();
-    std::cout << "IbPeerToPeerTest.SimpleSendRecv: " << us / maxIter << " us/iter" << std::endl;
+    ::mscclpp::test::reportPerfResult("latency", us / maxIter, "us/iter");
   }
   bootstrap->barrier();
 }
@@ -385,7 +385,7 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
   }
 }
 
-TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
+PERF_TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
   if (gEnv->rank >= 2) {
     // This test needs only two ranks
     return;
@@ -426,7 +426,7 @@ TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
       }
     }
     float us = (float)timer.elapsed();
-    std::cout << "IbPeerToPeerTest.SimpleAtomicAdd: " << us / maxIter << " us/iter" << std::endl;
+    ::mscclpp::test::reportPerfResult("latency", us / maxIter, "us/iter");
   }
   bootstrap->barrier();
 }
diff --git a/test/mp_unit/memory_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu
index 318d301af..1ce9eb0bd 100644
--- a/test/mp_unit/memory_channel_tests.cu
+++ b/test/mp_unit/memory_channel_tests.cu
@@ -103,7 +103,7 @@ void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName,
   communicator->bootstrap()->barrier();
 
   if (gEnv->rank == 0) {
-    std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)(nTries) << " us/iter\n";
+    ::mscclpp::test::reportPerfResult("latency", (float)timer.elapsed() / (float)(nTries), "us/iter");
   }
 }
 
@@ -324,14 +324,14 @@ __global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int*
   }
 }
 
-TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) {
+PERF_TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) {
   auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
     kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
   };
   packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper);
 }
 
-TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) {
+PERF_TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) {
   auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
     kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
   };
diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp
index 5f95d660a..f4a26cf99 100644
--- a/test/mp_unit/mp_unit_tests.hpp
+++ b/test/mp_unit/mp_unit_tests.hpp
@@ -159,6 +159,7 @@ class PortChannelOneToOneTest : public CommunicatorTestBase {
   void testPingPongPerf(PingPongTestParams params);
   void testPacketPingPong(bool useIbOnly, IbMode ibMode = IbMode::Default);
   void testPacketPingPongPerf(bool useIbOnly, IbMode ibMode = IbMode::Default);
+  void testBandwidth(PingPongTestParams params);
 
   std::shared_ptr<mscclpp::ProxyService> proxyService;
 };
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index b69f388a8..4a9c8f3cc 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -241,7 +241,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
   communicator->bootstrap()->barrier();
 
   if (gEnv->rank == 0) {
-    std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)nTries << " us/iter\n";
+    ::mscclpp::test::reportPerfResult("latency", (float)timer.elapsed() / (float)nTries, "us/iter");
   }
 
   proxyService->stopProxy();
@@ -274,25 +274,25 @@ TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Host});
 }
 
-TEST(PortChannelOneToOneTest, PingPongPerf) {
+PERF_TEST(PortChannelOneToOneTest, PingPongPerf) {
   testPingPongPerf(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
 
-TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
+PERF_TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
   REQUIRE_IBVERBS;
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
 }
 
-TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
+PERF_TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
   REQUIRE_HOST_NO_ATOMIC;
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
 }
 
-TEST(PortChannelOneToOneTest, PingPongPerfEthernet) {
+PERF_TEST(PortChannelOneToOneTest, PingPongPerfEthernet) {
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
@@ -459,7 +459,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
   communicator->bootstrap()->barrier();
 
   if (gEnv->rank == 0) {
-    std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)nTries << " us/iter\n";
+    ::mscclpp::test::reportPerfResult("latency", (float)timer.elapsed() / (float)nTries, "us/iter");
   }
 
   proxyService->stopProxy();
@@ -472,14 +472,14 @@ TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
   testPacketPingPong(true, IbMode::Host);
 }
 
-TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
+PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
 
-TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
+PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
   REQUIRE_IBVERBS;
   testPacketPingPongPerf(true, IbMode::Host);
 }
 
-TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
+PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
   REQUIRE_HOST_NO_ATOMIC;
   testPacketPingPongPerf(true, IbMode::HostNoAtomic);
@@ -497,3 +497,86 @@ TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
   REQUIRE_HOST_NO_ATOMIC;
   testPacketPingPong(true, IbMode::HostNoAtomic);
 }
+
+// Bandwidth test: bidirectional bulk transfer matching the tutorial pattern.
+// Both ranks do signal+wait+putWithSignal+wait per iteration.
+__global__ void kernelBandwidthBidir(int* buff, int nElem, int nIters, int rank) {
+  DeviceHandle<mscclpp::PortChannel>& portChan = gChannelOneToOneTestConstPortChans;
+  if (threadIdx.x != 0) return;
+  const uint64_t srcOffset = rank * nElem * sizeof(int);
+  const uint64_t dstOffset = srcOffset;
+  for (int i = 0; i < nIters; i++) {
+    portChan.signal();
+    portChan.wait();
+    portChan.putWithSignal(dstOffset, srcOffset, nElem * sizeof(int));
+    portChan.wait();
+  }
+}
+
+void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) {
+  if (gEnv->rank >= numRanksToUse) return;
+
+  const int maxElem = 32 * 1024 * 1024;  // 128 MB per direction
+  const int bufElem = maxElem * 2;        // 2x for bidirectional
+
+  std::vector<mscclpp::PortChannel> portChannels;
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(bufElem).memory();
+  setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(),
+                       bufElem * sizeof(int), nullptr, 0, params.ibMode);
+
+  std::vector<DeviceHandle<mscclpp::PortChannel>> portChannelHandles;
+  for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle());
+
+  ASSERT_EQ(portChannels.size(), 1);
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(),
+                                       sizeof(DeviceHandle<mscclpp::PortChannel>)));
+
+  proxyService->startProxy();
+
+  const std::string testName = ::mscclpp::test::currentTestName();
+  const int nIters = 1000;
+
+  for (int nElem : {256, 16 * 1024, 256 * 1024, 1024 * 1024, 4 * 1024 * 1024, 16 * 1024 * 1024, 32 * 1024 * 1024}) {
+    // Warm-up
+    kernelBandwidthBidir<<<1, 1024>>>(buff.get(), nElem, 10, gEnv->rank);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    communicator->bootstrap()->barrier();
+
+    // Measure
+    mscclpp::Timer timer;
+    kernelBandwidthBidir<<<1, 1024>>>(buff.get(), nElem, nIters, gEnv->rank);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    double elapsedUs = timer.elapsed();
+    communicator->bootstrap()->barrier();
+
+    if (gEnv->rank == 0) {
+      double copyBytes = (double)nElem * sizeof(int);
+      double elapsedMsPerIter = elapsedUs / 1e3 / nIters;
+      double gbps = copyBytes / elapsedMsPerIter * 1e-6;
+      double sizeKB = copyBytes / 1024.0;
+      std::string label = (sizeKB >= 1024.0) ? (std::to_string((int)(sizeKB / 1024.0)) + " MB")
+                                             : (std::to_string((int)sizeKB) + " KB");
+      ::mscclpp::test::reportPerfResult(label, gbps, "GB/s");
+    }
+  }
+
+  proxyService->stopProxy();
+}
+
+PERF_TEST(PortChannelOneToOneTest, Bandwidth) {
+  testBandwidth(PingPongTestParams{
+      .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
+}
+
+PERF_TEST(PortChannelOneToOneTest, BandwidthIbHostMode) {
+  REQUIRE_IBVERBS;
+  testBandwidth(PingPongTestParams{
+      .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
+}
+
+PERF_TEST(PortChannelOneToOneTest, BandwidthIbHostNoAtomicMode) {
+  REQUIRE_IBVERBS;
+  REQUIRE_HOST_NO_ATOMIC;
+  testBandwidth(PingPongTestParams{
+      .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
+}
diff --git a/test/mp_unit/semaphore_perf_tests.cu b/test/mp_unit/semaphore_perf_tests.cu
index 925605396..a4c0e29ff 100644
--- a/test/mp_unit/semaphore_perf_tests.cu
+++ b/test/mp_unit/semaphore_perf_tests.cu
@@ -68,6 +68,6 @@ PERF_TEST(SemaphorePerfTest, SignalPingPong) {
   communicator->bootstrap()->barrier();
 
   if (gEnv->rank == 0) {
-    std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)nIters << " us/iter\n";
+    ::mscclpp::test::reportPerfResult("latency", (float)timer.elapsed() / (float)nIters, "us/iter");
   }
 }

From b04fa2daa7d95f357a7e61449d3d78238bacb76f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 4 Apr 2026 06:22:04 +0000
Subject: [PATCH 094/132] lint

---
 src/core/ib.cc                     |  4 ++--
 src/core/include/connection.hpp    |  6 +++---
 test/framework.cc                  |  2 +-
 test/mp_unit/port_channel_tests.cu | 10 +++++-----
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/core/ib.cc b/src/core/ib.cc
index 290511e40..557f04268 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -91,8 +91,8 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu
     // routes DMA through the Data Direct engine for correct ordering and higher throughput.
     // Fall back to the default (non-PCIe) mapping if the flag is unsupported.
 #if (CUDA_VERSION >= 12030)
-    CUresult cuRes = cuMemGetHandleForAddressRange(
-        &fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
+    CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+                                                   CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
     if (cuRes != CUDA_SUCCESS || fd < 0) {
       if (fd >= 0) ::close(fd);
       fd = -1;
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 077a6c6af..22a9930f6 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -110,9 +110,9 @@ class IBConnection : public BaseConnection {
   bool gdrSignalForwarding_;  // ibNoAtomic_ && gdrEnabled() — decided once at construction
   std::thread recvThread_;
   std::atomic<bool> stopRecvThread_;
-  std::atomic<bool> recvThreadError_;    // Set by recv thread on fatal error
-  std::string recvThreadErrorMsg_;       // Error message from recv thread (written before recvThreadError_ is set)
-  int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping
+  std::atomic<bool> recvThreadError_;  // Set by recv thread on fatal error
+  std::string recvThreadErrorMsg_;     // Error message from recv thread (written before recvThreadError_ is set)
+  int localGpuDeviceId_;               // Local GPU device ID for CUDA context and GDR mapping
 
   // Signal forwarding design (HostNoAtomic mode):
   // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data.
diff --git a/test/framework.cc b/test/framework.cc
index f62d8bbd8..941fdcbaf 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -222,7 +222,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       skippedByFilter++;
       continue;
     }
-    totalToRun++;;
+    totalToRun++;
   }
 
   if (gMpiRank == 0) {
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index 4a9c8f3cc..166d7ed21 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -517,12 +517,12 @@ void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) {
   if (gEnv->rank >= numRanksToUse) return;
 
   const int maxElem = 32 * 1024 * 1024;  // 128 MB per direction
-  const int bufElem = maxElem * 2;        // 2x for bidirectional
+  const int bufElem = maxElem * 2;       // 2x for bidirectional
 
   std::vector<mscclpp::PortChannel> portChannels;
   std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(bufElem).memory();
-  setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(),
-                       bufElem * sizeof(int), nullptr, 0, params.ibMode);
+  setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), bufElem * sizeof(int),
+                       nullptr, 0, params.ibMode);
 
   std::vector<DeviceHandle<mscclpp::PortChannel>> portChannelHandles;
   for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle());
@@ -554,8 +554,8 @@ void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) {
       double elapsedMsPerIter = elapsedUs / 1e3 / nIters;
       double gbps = copyBytes / elapsedMsPerIter * 1e-6;
       double sizeKB = copyBytes / 1024.0;
-      std::string label = (sizeKB >= 1024.0) ? (std::to_string((int)(sizeKB / 1024.0)) + " MB")
-                                             : (std::to_string((int)sizeKB) + " KB");
+      std::string label =
+          (sizeKB >= 1024.0) ? (std::to_string((int)(sizeKB / 1024.0)) + " MB") : (std::to_string((int)sizeKB) + " KB");
       ::mscclpp::test::reportPerfResult(label, gbps, "GB/s");
     }
   }

From a4bb8fb4bf0b94310071fab6b48d747174eab733 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Fri, 3 Apr 2026 21:30:21 +0000
Subject: [PATCH 095/132] add debugging code

---
 python/test/executor_test.py          |  22 ++-
 python/test/executor_test_verifier.cu | 193 +++++++++++++++++++++++++-
 2 files changed, 212 insertions(+), 3 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 59bc16616..83b2cb863 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -166,9 +166,11 @@ def build_bufs(
     else:
         input_buf = GpuBuffer(nelems_input, dtype=dtype)
 
+    in_place = False
+
     test_buf = cp.zeros(nelems, dtype=dtype)
 
-    return input_buf, result_buf, test_buf
+    return input_buf, result_buf, test_buf, nelems
 
 
 def main(
@@ -190,7 +192,7 @@ def main(
     collective = execution_plan.collective
 
     dtype = parse_dtype(dtype_str)
-    input_buf, result_buf, test_buf = build_bufs(
+    input_buf, result_buf, test_buf, nelem = build_bufs(
         collective,
         size,
         in_place,
@@ -212,6 +214,22 @@ def main(
     )
 
     mscclpp_group.barrier()
+    print("size= ", size, "nelem= ", nelem)
+
+    # Sentinel fill: choose something unlikely in your pattern
+    result_buf.fill(cp.float16(123.0))
+    cp.cuda.runtime.deviceSynchronize()
+
+    # Run ONE execution (no graph), then sync
+    stream = cp.cuda.Stream(non_blocking=True)
+    with stream:
+        executor_func(stream)
+    stream.synchronize()
+
+    # Count how many elements changed
+    changed = cp.count_nonzero(result_buf != cp.float16(123.0)).item()
+    print("changed elements:", changed, "out of", result_buf.size)
+
     bench_correctness(
         collective,
         input_buf,
diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu
index cf3cd4a6f..5c96a9229 100644
--- a/python/test/executor_test_verifier.cu
+++ b/python/test/executor_test_verifier.cu
@@ -120,4 +120,195 @@ TEST_DATA_REDUCE_SCATTER(int32, int)
 
 TEST_DATA_ALL_TO_ALL(float16, __half)
 TEST_DATA_ALL_TO_ALL(float32, float)
-TEST_DATA_ALL_TO_ALL(int32, int)
\ No newline at end of file
+TEST_DATA_ALL_TO_ALL(int32, int)
+
+/*#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                                          \
+  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType(                  \
+      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {    \
+                                                                                                             \
+    /* Ring semantics: receive from prev rank */                                                             \
+/*    int peer_rank = (my_rank - 1 + num_ranks) % num_ranks;                                                   \
+                                                                                                             \
+    unsigned int seed =                                                                                      \
+        (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + peer_rank + seq);                             \
+                                                                                                             \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;                                                   \
+         i < num_elems;                                                                                      \
+         i += blockDim.x * gridDim.x) {                                                                      \
+      seed = ranqd1(seed);                                                                                   \
+      test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                                      \
+                                                                                                             \
+      /* Optional: print first few mismatches */                                                             \
+/*      if (result_buf[i] != test_buf[i] && blockIdx.x == 0 && threadIdx.x == 0 && i < 8) {                    \
+        printf("MISMATCH rank=%d peer=%d i=%zu result=%f expected=%f\n",                                     \
+               my_rank, peer_rank, i, (float)result_buf[i], (float)test_buf[i]);                             \
+      }                                                                                                      \
+                                                                                                             \
+      assert(result_buf[i] == test_buf[i]);                                                                  \
+    }                                                                                                        \
+  }*/
+
+
+/*#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                                        \
+  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType(                \
+      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {  \
+                                                                                                           \
+    int prev_rank = (my_rank - 1 + num_ranks) % num_ranks;                                                 \
+    int next_rank = (my_rank + 1) % num_ranks;                                                             \
+    int self_rank = my_rank;                                                                               \
+                                                                                                           \
+    unsigned int seed_prev =                                                                               \
+        (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq);                           \
+    unsigned int seed_next =                                                                               \
+        (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + next_rank + seq);                           \
+    unsigned int seed_self =                                                                               \
+        (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + self_rank + seq);                           \
+                                                                                                           \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;                                                 \
+         i < num_elems;                                                                                    \
+         i += blockDim.x * gridDim.x) {                                                                    \
+                                                                                                           \
+      seed_prev = ranqd1(seed_prev);                                                                       \
+      seed_next = ranqd1(seed_next);                                                                       \
+      seed_self = ranqd1(seed_self);                                                                       \
+                                                                                                           \
+      DataType exp_prev = DataType(seed_prev % blockDim.x) / DataType(blockDim.x);                         \
+      DataType exp_next = DataType(seed_next % blockDim.x) / DataType(blockDim.x);                         \
+      DataType exp_self = DataType(seed_self % blockDim.x) / DataType(blockDim.x);                         \
+                                                                                                           \
+      /* For compatibility: avoid %zu formatting quirks on device */                                        \
+/*      unsigned long long ii = (unsigned long long)i;                                                       \
+                                                                                                           \
+      if (result_buf[i] != exp_prev) {                                                                     \
+        /* Print only a few mismatches to avoid flooding */                                                 \
+/*        if (blockIdx.x == 0 && (threadIdx.x == 0 || threadIdx.x == 192) && ii < 256ULL) {                  \
+          printf("sendrecv-mismatch rank=%d nranks=%d i=%llu result=%f exp_prev(from %d)=%f "              \
+                 "exp_next(from %d)=%f exp_self(from %d)=%f\n",                                            \
+                 my_rank, num_ranks, ii,                                                                   \
+                 (float)result_buf[i],                                                                     \
+                 prev_rank, (float)exp_prev,                                                               \
+                 next_rank, (float)exp_next,                                                               \
+                 self_rank, (float)exp_self);                                                              \
+        }                                                                                                  \
+      }                                                                                                    \
+                                                                                                           \
+      test_buf[i] = exp_prev;                                                                              \
+      assert(result_buf[i] == test_buf[i]);                                                                \
+    }                                                                                                      \
+  }
+*/
+
+
+#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                                        \
+  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType(                \
+      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {  \
+                                                                                                           \
+    /* Expected ring semantics (if your algorithm is ring-prev). */                                        \
+    int prev_rank = (my_rank - 1 + num_ranks) % num_ranks;                                                 \
+    int next_rank = (my_rank + 1) % num_ranks;                                                             \
+    int self_rank = my_rank;                                                                               \
+                                                                                                           \
+    /* Thread identity and stride must match fill_data_* generation pattern. */                            \
+    const unsigned long long tid =                                                                        \
+        (unsigned long long)(blockIdx.x * blockDim.x + threadIdx.x);                                       \
+    const unsigned long long stride =                                                                      \
+        (unsigned long long)(blockDim.x * gridDim.x);                                                      \
+                                                                                                           \
+    for (unsigned long long i = tid; i < (unsigned long long)num_elems; i += stride) {                    \
+                                                                                                           \
+      /* Compute how many iterations this thread advanced before reaching i. */                            \
+      unsigned long long k = (i - tid) / stride;                                                           \
+                                                                                                           \
+      /* Helper lambda: compute expected value for a given sender rank r at element i for this thread. */  \
+      auto expected_for_rank = [&](int r) -> DataType {                                                    \
+        unsigned int s = (unsigned int)(tid + (unsigned long long)r + (unsigned long long)seq);            \
+        /* fill_data does: seed=ranqd1(seed) once per element visited.                                     \
+           For the k-th visited element, apply ranqd1 (k+1) times. */                                      \
+        for (unsigned long long step = 0; step < k + 1; ++step) {                                          \
+          s = ranqd1(s);                                                                                   \
+        }                                                                                                  \
+        return DataType(s % blockDim.x) / DataType(blockDim.x);                                            \
+      };                                                                                                   \
+                                                                                                           \
+      DataType exp_prev = expected_for_rank(prev_rank);                                                    \
+      DataType exp_next = expected_for_rank(next_rank);                                                    \
+      DataType exp_self = expected_for_rank(self_rank);                                                    \
+                                                                                                           \
+      /* Store expected(prev) in test_buf for the assert (keeps compatibility with your current check). */ \
+      test_buf[i] = exp_prev;                                                                              \
+                                                                                                           \
+      if (result_buf[i] != test_buf[i]) {                                                                  \
+        /* Try to identify which rank's stream matches the observed result. */                             \
+        int matched = -1;                                                                                  \
+        for (int r = 0; r < num_ranks; ++r) {                                                              \
+          DataType exp_r = expected_for_rank(r);                                                           \
+          if (result_buf[i] == exp_r) {                                                                    \
+            matched = r;                                                                                   \
+            break;                                                                                          \
+          }                                                                                                \
+        }                                                                                                  \
+                                                                                                           \
+        /* Print only a small number of mismatches to avoid log spam. */                                   \
+        if (blockIdx.x == 0 && (threadIdx.x == 0 || threadIdx.x == 160) && i < 256ULL) {                   \
+          printf("sendrecv-mismatch rank=%d nranks=%d i=%llu result=%f "                                   \
+                 "exp_prev(from %d)=%f exp_next(from %d)=%f exp_self(from %d)=%f matched_sender=%d\n",     \
+                 my_rank, num_ranks, i,                                                                    \
+                 (float)result_buf[i],                                                                     \
+                 prev_rank, (float)exp_prev,                                                               \
+                 next_rank, (float)exp_next,                                                               \
+                 self_rank, (float)exp_self,                                                               \
+                 matched);                                                                                 \
+        }                                                                                                  \
+                                                                                                           \
+        assert(result_buf[i] == test_buf[i]);                                                              \
+      }                                                                                                    \
+    }                                                                                                      \
+  }
+
+
+/*
+#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                      \
+extern "C" __global__ void __launch_bounds__(1024, 1)                                  \
+test_data_sendrecv_##FuncNameType(                                                     \
+    DataType* result_buf,                                                              \
+    DataType* test_buf,                                                                \
+    size_t num_elems,                                                                  \
+    int num_ranks,                                                                     \
+    int my_rank,                                                                       \
+    int seq) {                                                                         \
+                                                                                       \
+  int prev_rank = (my_rank - 1 + num_ranks) % num_ranks;                               \
+  int next_rank = (my_rank + 1) % num_ranks;                                           \
+                                                                                       \
+  unsigned int seed_prev =                                                             \
+      (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq);         \
+  unsigned int seed_next =                                                             \
+      (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + next_rank + seq);         \
+                                                                                       \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;                               \
+       i < num_elems;                                                                  \
+       i += blockDim.x * gridDim.x) {                                                   \
+                                                                                       \
+    seed_prev = ranqd1(seed_prev);                                                     \
+    seed_next = ranqd1(seed_next);                                                     \
+                                                                                       \
+    DataType exp_prev = DataType(seed_prev % blockDim.x) / DataType(blockDim.x);       \
+    DataType exp_next = DataType(seed_next % blockDim.x) / DataType(blockDim.x);       \
+                                                                                       \
+    if (result_buf[i] != exp_prev) {                                                   \
+      if (blockIdx.x == 0 && threadIdx.x == 0 && i < 8) {                              \
+        printf("***rank=%d i=%zu result=%f prev(from %d)=%f next(from %d)=%f\n",          \
+               my_rank, i, (float)result_buf[i],                                      \
+               prev_rank, (float)exp_prev,                                            \
+               next_rank, (float)exp_next);                                           \
+      }                                                                                \
+    }                                                                                  \
+                                                                                       \
+    test_buf[i] = exp_prev;                                                           \
+    assert(result_buf[i] == test_buf[i]);                                              \
+  }                                                                                    \
+}
+*/
+TEST_DATA_SENDRECV(float16, __half)
+TEST_DATA_SENDRECV(float32, float)
+TEST_DATA_SENDRECV(int32, int)

From 194a79f77294d73eaf278c2e47a72f3b97152d9c Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Fri, 3 Apr 2026 20:01:22 +0000
Subject: [PATCH 096/132] add sendrecv correctness check

---
 python/test/executor_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 83b2cb863..74dbca118 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -70,6 +70,7 @@ def bench_correctness(
 ):
     type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
 
+    print("collective: ", collective)
     fill_data_kernel_name = "fill_data_%s" % dtype_str
     if "allgather" in collective:
         coll = "all_gather"
@@ -78,7 +79,7 @@ def bench_correctness(
     elif "allreduce" in collective:
         coll = "all_reduce"
     else:
-        coll = "all_to_all"
+        coll = "sendrecv"
     test_data_kernel_name = "test_data_%s_%s" % (coll, dtype_str)
 
     file_dir = os.path.dirname(os.path.abspath(__file__))

From 49979e58ab602593425d28a8bfc6e949f448a54a Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Thu, 19 Mar 2026 00:41:33 +0000
Subject: [PATCH 097/132] tune #instances and remoce extra barriers

---
 .../default_algos/mscclpp_send_recv.py        | 90 +++++++++++++++++++
 python/test/executor_test.py                  | 79 +++++++++++++---
 2 files changed, 158 insertions(+), 11 deletions(-)
 create mode 100644 python/mscclpp/default_algos/mscclpp_send_recv.py

diff --git a/python/mscclpp/default_algos/mscclpp_send_recv.py b/python/mscclpp/default_algos/mscclpp_send_recv.py
new file mode 100644
index 000000000..ef052210c
--- /dev/null
+++ b/python/mscclpp/default_algos/mscclpp_send_recv.py
@@ -0,0 +1,90 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def send_recv_test(name, nnodes, gpus_per_node, split_mask):
+    gpu_size = nnodes * gpus_per_node
+    collective = TestCollective(gpu_size, 1, 1)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="Simple",
+        num_threads_per_block=1024,
+        use_double_scratch_buffer=False,
+        min_message_size=0,
+        max_message_size=2**64 - 1,
+        instances=4
+    ):
+        # Creating separate port channels for next and prev directions.
+        # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer
+        # and get distinct tags. To ensure cross-rank tag matching (rank A's prev_channel signal
+        # arrives at rank B's next_channel wait), we create channels in opposite order for the
+        # "higher" rank so that tags cross-match:
+        #   Lower rank:  [next(tag0), prev(tag1)]
+        #   Higher rank:  [prev(tag0), next(tag1)]
+        # Then lower.prev(tag1) == higher.next(tag1) ✓ and higher.prev(tag0) == lower.next(tag0) ✓
+        # When prev != next (3+ nodes), each channel targets a different peer so each gets tag 0
+        # and this ordering doesn't matter.
+        group_size = split_mask + 1
+        num_groups = gpu_size // group_size
+        next_channels = {}  # channel for sending to next rank
+        prev_channels = {}  # channel for receiving from prev rank
+        prev_next_ids = {}
+        for node in range(nnodes):
+            for gpu in range(gpus_per_node):
+                global_rank_id = gpu + gpus_per_node * node
+                position_in_group = global_rank_id & split_mask
+                group_id = global_rank_id // group_size
+                next_group_id = (group_id + 1) % num_groups
+                next_global_rank_id = next_group_id * group_size + position_in_group
+                prev_group_id = (group_id - 1 + num_groups) % num_groups
+                prev_global_rank_id = prev_group_id * group_size + position_in_group
+                if prev_global_rank_id == next_global_rank_id and global_rank_id > prev_global_rank_id:
+                    # Higher rank: create prev first, then next (swapped order)
+                    prev_channels[global_rank_id] = PortChannel(prev_global_rank_id, global_rank_id)
+                    next_channels[global_rank_id] = PortChannel(next_global_rank_id, global_rank_id)
+                else:
+                    # Lower rank or different peers: create next first, then prev
+                    next_channels[global_rank_id] = PortChannel(next_global_rank_id, global_rank_id)
+                    prev_channels[global_rank_id] = PortChannel(prev_global_rank_id, global_rank_id)
+                prev_next_ids[global_rank_id] = (prev_global_rank_id, next_global_rank_id)
+
+        # sync with the next rank and the previous rank in the group
+        for node in range(nnodes):
+            for gpu in range(gpus_per_node):
+                global_rank_id = gpu + gpus_per_node * node
+                prev_global_rank_id, next_global_rank_id = prev_next_ids[global_rank_id]
+                prev_channels[global_rank_id].signal(tb=0, data_sync=SyncType.none)
+                next_channels[global_rank_id].wait(tb=0, data_sync=SyncType.after)
+                
+                src_rank = Rank(global_rank_id)
+                src_buffer = src_rank.get_input_buffer()
+                dst_rank = Rank(next_global_rank_id)
+                dst_buffer = dst_rank.get_output_buffer()
+
+                next_channels[global_rank_id].put_with_signal(dst_buffer[:], src_buffer[:], tb=0)
+                prev_channels[global_rank_id].wait(tb=0, data_sync=SyncType.none)
+                
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--nnodes", type=int, default=1, help="number of nodes")
+parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node")
+parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x3, help="split mask (e.g. 0x3)")
+
+args = parser.parse_args()
+
+send_recv_test(
+    args.name, args.nnodes, args.gpus_per_node, args.split_mask
+)
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 74dbca118..250409d95 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -202,17 +202,74 @@ def main(
         mscclpp_group.nranks,
     )
 
-    executor_func = lambda stream: executor.execute(
-        mscclpp_group.my_rank,
-        input_buf.data.ptr,
-        result_buf.data.ptr,
-        input_buf.nbytes,
-        result_buf.nbytes,
-        dtype_to_mscclpp_dtype(dtype),
-        execution_plan,
-        stream.ptr,
-        packet_type,
-    )
+    # Print header once
+    if my_rank == 0:
+        print(
+            f"{'NRanks':>8} {'Message Size (B)':>18} {'BW (GB/s)':>12} "
+            f"{'Latency (us)':>14}      {'Packet Type':>12}"
+        )
+
+    for size in sizes:
+        input_buf, result_buf, test_buf = build_bufs(
+            collective,
+            size,
+            in_place,
+            dtype,
+            my_rank,
+            nranks,
+        )
+
+        executor_func = lambda stream, in_buf=input_buf, out_buf=result_buf: executor.execute(
+            my_rank,
+            in_buf.data.ptr,
+            out_buf.data.ptr,
+            in_buf.nbytes,
+            out_buf.nbytes,
+            dtype_to_mscclpp_dtype(dtype),
+            execution_plan,
+            stream.ptr,
+            packet_type,
+        )
+
+        #mscclpp_group.barrier()
+
+        # Optional correctness check
+        # bench_correctness(
+        #     collective,
+        #     input_buf,
+        #     result_buf,
+        #     test_buf,
+        #     dtype_str,
+        #     my_rank,
+        #     nranks,
+        #     n_iters,
+        #     executor_func,
+        # )
+
+        mscclpp_group.barrier()
+        execution_time = bench_time(n_iters, n_graph_iters, executor_func)
+        #mscclpp_group.barrier()
+
+        if my_rank == 0:
+            msg_size = size
+            bw = result_buf.nbytes / execution_time / 1e3  # GB/s
+            latency = execution_time  # us
+
+            print(
+                f"{nranks:8d} {msg_size:18d} {bw:12.2f} "
+                f"{latency:14.2f}       {str(packet_type):>12}"
+            )
+
+        # Release buffers for this size
+        input_buf = None
+        result_buf = None
+        test_buf = None
+
+        #mscclpp_group.barrier()
+
+    if npkit_dump_dir != "":
+        npkit.dump(npkit_dump_dir)
+        npkit.shutdown()
 
     mscclpp_group.barrier()
     print("size= ", size, "nelem= ", nelem)

From 27fbddb707902210927b54b15a9bdad331689498 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 17 Mar 2026 21:00:48 +0000
Subject: [PATCH 098/132] update the executor so we have message size range

---
 python/test/executor_test.py | 165 +++++++++++++++++++++++++++++------
 1 file changed, 138 insertions(+), 27 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 250409d95..9649da7bd 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -20,7 +20,7 @@
 
 
 def parse_dtype(dtype_str):
-    """Convert a human-readable data type string to a numpy data type."""
+    """Convert a human-readable data type string to a CuPy data type."""
     dtype_str = dtype_str.strip().lower()
     if dtype_str == "float16":
         return cp.float16
@@ -33,18 +33,18 @@ def parse_dtype(dtype_str):
 
 
 def bench_time(n_iters: int, n_graph_iters: int, func):
-    # capture cuda graph for n_iters of the kernel launch
+    # Capture CUDA graph for n_iters of the kernel launch
     stream = cp.cuda.Stream(non_blocking=True)
     with stream:
         stream.begin_capture()
-        for i in range(n_iters):
+        for _ in range(n_iters):
             func(stream)
         graph = stream.end_capture()
 
-    # now run a warm up round
+    # Warm-up round
     graph.launch(stream)
 
-    # now run the benchmark and measure time
+    # Benchmark and measure time
     start = cp.cuda.Event()
     end = cp.cuda.Event()
 
@@ -54,6 +54,7 @@ def bench_time(n_iters: int, n_graph_iters: int, func):
     end.record(stream)
     end.synchronize()
 
+    # Return average execution time in microseconds
     return cp.cuda.get_elapsed_time(start, end) / n_iters * 1000.0 / n_graph_iters
 
 
@@ -84,11 +85,16 @@ def bench_correctness(
 
     file_dir = os.path.dirname(os.path.abspath(__file__))
     fill_data_kernel = KernelBuilder(
-        file="executor_test_verifier.cu", kernel_name=fill_data_kernel_name, file_dir=file_dir
+        file="executor_test_verifier.cu",
+        kernel_name=fill_data_kernel_name,
+        file_dir=file_dir,
     ).get_compiled_kernel()
     test_data_kernel = KernelBuilder(
-        file="executor_test_verifier.cu", kernel_name=test_data_kernel_name, file_dir=file_dir
+        file="executor_test_verifier.cu",
+        kernel_name=test_data_kernel_name,
+        file_dir=file_dir,
     ).get_compiled_kernel()
+
     nblocks = 64
     nthreads = 1024
 
@@ -98,27 +104,72 @@ def bench_correctness(
         for i in range(n_iters):
             fill_data_params = pack(input_buf) + struct.pack("Q", input_buf.nbytes // type_size) + pack(rank, i)
             fill_data_kernel.launch_kernel(fill_data_params, nblocks, nthreads, 0, stream)
+
             func(stream)
+
             test_data_params = (
-                pack(result_buf, test_buf) + struct.pack("Q", input_buf.nbytes // type_size) + pack(num_ranks, rank, i)
+                pack(result_buf, test_buf)
+                + struct.pack("Q", input_buf.nbytes // type_size)
+                + pack(num_ranks, rank, i)
             )
             test_data_kernel.launch_kernel(test_data_params, nblocks, nthreads, 0, stream)
+
         graph = stream.end_capture()
+
     graph.launch(stream)
     stream.synchronize()
 
 
 def parse_size(size_str):
-    """Convert a human-readable buffer size string to an integer."""
+    """Convert a human-readable buffer size string to an integer (bytes)."""
     size_str = size_str.strip()
     if not size_str:
-        raise ValueError("Size string can not be empty")
+        raise ValueError("Size string cannot be empty")
+
     units = {"K": 1024, "M": 1024**2, "G": 1024**3}
     if size_str[-1].upper() in units:
         return int(size_str[:-1]) * units[size_str[-1].upper()]
-    else:
-        return int(size_str)
+    return int(size_str)
+
+def parse_size_list(size_arg):
+    """
+    Accept:
+      - single size: '1M'
+      - comma-separated list: '1K,2K,4K'
+      - geometric range: '1K:64K:2' -> start:end:factor
+
+    Returns a list of integer sizes in bytes.
+    """
+    size_arg = size_arg.strip()
 
+    if "," in size_arg:
+        return [parse_size(x) for x in size_arg.split(",")]
+
+    if ":" in size_arg:
+        parts = size_arg.split(":")
+        if len(parts) != 3:
+            raise ValueError("Range format must be start:end:factor, e.g. 1K:64K:2")
+
+        start = parse_size(parts[0])
+        end = parse_size(parts[1])
+        factor = int(parts[2])
+
+        if start <= 0:
+            raise ValueError("Start must be positive")
+        if end < start:
+            raise ValueError("End must be >= start")
+        if factor <= 1:
+            raise ValueError("Factor must be greater than 1")
+
+        sizes = []
+        current = start
+        while current <= end:
+            sizes.append(current)
+            current *= factor
+
+        return sizes
+
+    return [parse_size(size_arg)]
 
 def dtype_to_mscclpp_dtype(dtype):
     if dtype == cp.float16:
@@ -140,22 +191,23 @@ def build_bufs(
     num_ranks: int,
 ):
     type_size = cp.dtype(dtype).itemsize
-    assert (size % type_size) == 0, "size %d not multiple of type size %d" % (size, type_size)
+    assert (size % type_size) == 0, f"size {size} not multiple of type size {type_size}"
     nelems = size // type_size
 
     if "allgather" in collective:
-        assert (nelems % num_ranks) == 0, "nelems %d not multiple of num_ranks %d" % (nelems, num_ranks)
+        assert (nelems % num_ranks) == 0, f"nelems {nelems} not multiple of num_ranks {num_ranks}"
         nelems_input = nelems if in_place else nelems // num_ranks
     else:
         nelems_input = nelems
 
     if "reducescatter" in collective:
-        assert (nelems % num_ranks) == 0, "nelems %d not multiple of num_ranks %d" % (nelems, num_ranks)
+        assert (nelems % num_ranks) == 0, f"nelems {nelems} not multiple of num_ranks {num_ranks}"
         nelems_output = nelems // num_ranks
     else:
         nelems_output = nelems
 
     result_buf = GpuBuffer(nelems_output, dtype=dtype)
+
     if in_place:
         if "allgather" in collective:
             input_buf = cp.split(result_buf, num_ranks)[rank]
@@ -176,7 +228,7 @@ def build_bufs(
 
 def main(
     execution_plan_path: str,
-    size: int,
+    sizes: list[int],
     in_place: bool = True,
     dtype_str: str = "float16",
     packet_type: PacketType = PacketType.LL16,
@@ -184,14 +236,18 @@ def main(
     n_graph_iters: int = 10,
 ):
     mscclpp_group = CommGroup(MPI.COMM_WORLD)
-    cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
+    nranks = mscclpp_group.nranks
+    my_rank = mscclpp_group.my_rank
+
+    cp.cuda.Device(my_rank % mscclpp_group.nranks_per_node).use()
+
     executor = Executor(mscclpp_group.communicator)
     npkit_dump_dir = env().npkit_dump_dir
     if npkit_dump_dir != "":
-        npkit.init(mscclpp_group.my_rank)
-    execution_plan = ExecutionPlan(execution_plan_path, mscclpp_group.my_rank)
-    collective = execution_plan.collective
+        npkit.init(my_rank)
 
+    execution_plan = ExecutionPlan(execution_plan_path, my_rank)
+    collective = execution_plan.collective
     dtype = parse_dtype(dtype_str)
     input_buf, result_buf, test_buf, nelem = build_bufs(
         collective,
@@ -300,9 +356,55 @@ def main(
         executor_func,
     )
 
-    mscclpp_group.barrier()
-    execution_time = bench_time(n_iters, n_graph_iters, executor_func)
-    if npkit_dump_dir is not None:
+        executor_func = lambda stream, in_buf=input_buf, out_buf=result_buf: executor.execute(
+            my_rank,
+            in_buf.data.ptr,
+            out_buf.data.ptr,
+            in_buf.nbytes,
+            out_buf.nbytes,
+            dtype_to_mscclpp_dtype(dtype),
+            execution_plan,
+            stream.ptr,
+            packet_type,
+        )
+
+        mscclpp_group.barrier()
+
+        # Optional correctness check
+        # bench_correctness(
+        #     collective,
+        #     input_buf,
+        #     result_buf,
+        #     test_buf,
+        #     dtype_str,
+        #     my_rank,
+        #     nranks,
+        #     n_iters,
+        #     executor_func,
+        # )
+
+        mscclpp_group.barrier()
+        execution_time = bench_time(n_iters, n_graph_iters, executor_func)
+        mscclpp_group.barrier()
+
+        if my_rank == 0:
+            msg_size = size
+            bw = result_buf.nbytes / execution_time / 1e3  # GB/s
+            latency = execution_time  # us
+
+            print(
+                f"{nranks:8d} {msg_size:18d} {bw:12.2f} "
+                f"{latency:14.2f}       {str(packet_type):>12}"
+            )
+
+        # Release buffers for this size
+        input_buf = None
+        result_buf = None
+        test_buf = None
+
+        mscclpp_group.barrier()
+
+    if npkit_dump_dir != "":
         npkit.dump(npkit_dump_dir)
         npkit.shutdown()
     print(
@@ -317,8 +419,16 @@ def main(
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
-    parser.add_argument("--size", type=str, required=True)
-    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
+    parser.add_argument(
+        "--size",
+        type=str,
+        required=True,
+        help=(
+            "Single size (e.g. 1M), comma-separated list (e.g. 1K,2K,4K), "
+            "or range start:end:factor (e.g. 1K:64K:2)"
+        ),
+    )
+    parser.add_argument("--in_place", action="store_true", help="Flag to define an in-place operation")
     parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
     parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
     parser.add_argument("--n_iters", type=int, default=10)
@@ -329,10 +439,11 @@ def main(
     if args.packet_type == "LL8":
         packet_type = PacketType.LL8
 
-    buffer_size = parse_size(args.size)
+    buffer_sizes = parse_size_list(args.size)
+
     main(
         args.execution_plan_path,
-        buffer_size,
+        buffer_sizes,
         args.in_place,
         args.dtype,
         packet_type,

From d07a1ba28ca9e209faa509162b656a8f5db4b6b3 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 17 Mar 2026 20:43:32 +0000
Subject: [PATCH 099/132] show scale in output

---
 python/test/executor_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 9649da7bd..e5e8cdf25 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -327,8 +327,9 @@ def main(
         npkit.dump(npkit_dump_dir)
         npkit.shutdown()
 
-    mscclpp_group.barrier()
-    print("size= ", size, "nelem= ", nelem)
+        # Print header once
+        print(f"{'NRanks':>8}  {'Message Size (B)':>18} {'BW (GB/s)':>12} {'Latency (us)':>14}     {'Packet Type':>12}")
+        print(f"{nranks:8d}  {msg_size:18d} {bw:12.2f} {latency:14.2f}       {str(packet_type):>12}")
 
     # Sentinel fill: choose something unlikely in your pattern
     result_buf.fill(cp.float16(123.0))

From a191f16b76dbb0b27b94484fd468a741f9f73e5b Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 17 Mar 2026 20:06:15 +0000
Subject: [PATCH 100/132] add scripts

---
 generate-json.sh | 18 ++++++++++++++++++
 run.sh           | 15 +++++++++++++++
 run_onenode.sh   | 14 ++++++++++++++
 3 files changed, 47 insertions(+)
 create mode 100755 generate-json.sh
 create mode 100755 run.sh
 create mode 100755 run_onenode.sh

diff --git a/generate-json.sh b/generate-json.sh
new file mode 100755
index 000000000..25c21b14e
--- /dev/null
+++ b/generate-json.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -ex
+
+# Check if the number of arguments is exactly 1
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <hostfile> <nnodes> <ppn>"
+    exit 1
+fi
+
+HOSTFILE=$1
+NNODES=$2
+PPN=$3
+
+parallel-scp -h "$HOSTFILE" -p32 -t1800 -r python/test/executor_test.py /home/azhpcuser/mahdieh/mscclpp/python/test/
+
+parallel-scp -h "$HOSTFILE" -p32 -t1800 -r python/mscclpp/default_algos/mscclpp_send_recv.py /home/azhpcuser/mahdieh/mscclpp/python/mscclpp/default_algos/ 
+
+parallel-ssh -h "$HOSTFILE" -p32 -i -t1800 "cd /home/azhpcuser/mahdieh/mscclpp && source mscclpp/bin/activate && python3 python/mscclpp/default_algos/mscclpp_send_recv.py --name send_recv_test --nnodes $NNODES --gpus_per_node $PPN --split_mask 0x3 > test.json "
diff --git a/run.sh b/run.sh
new file mode 100755
index 000000000..1d603f267
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,15 @@
+
+module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
+
+MPI_ARGS=""
+MPI_ARGS+=" -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
+MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
+MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
+MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/mahdieh/mscclpp/mscclpp2/bin/:$PATH "
+MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2"
+MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp/mscclpp/bin/python3   /home/azhpcuser/mahdieh/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/mahdieh/mscclpp/test.json"
+
+
+mpirun -np 16 --hostfile ./hosts --map-by ppr:4:node  $MPI_ARGS --size 1G --n_iters 30 #--n_graph_iters 100
+
+#mpirun -np 8  --hostfile /home/azhpcuser/binyli/hostfile   --map-by ppr:4:node   -mca coll_hcoll_enable 0 --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1 -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH -x MSCCLPP_IBV_MODE=host-no-atomic  -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1   -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so   -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1     -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2   -x PATH=/home/azhpcuser/binyli/mscclpp/bin:$PATH  -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=WARN -x MSCCLPP_IB_GID_INDEX=3 /home/azhpcuser/binyli/mscclpp/bin/python3   /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/binyli/mscclpp/test.json   --size 1G --n_iters 30
diff --git a/run_onenode.sh b/run_onenode.sh
new file mode 100755
index 000000000..6e7541d15
--- /dev/null
+++ b/run_onenode.sh
@@ -0,0 +1,14 @@
+
+module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
+
+MPI_ARGS=""
+MPI_ARGS+="-x CUDA_VISIBLE_DEVICES=0,2 --mca coll ^ucc,hcoll   -mca coll_hcoll_enable 0 --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1 "
+MPI_ARGS+="-x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
+MPI_ARGS+=" -x MSCCLPP_IBV_MODE=host  -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
+MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1     -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_3   -x PATH=/home/azhpcuser/mahdieh/mscclpp/mscclpp2/bin/:$PATH "
+MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3"
+MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp/mscclpp/bin/python3   /home/azhpcuser/mahdieh/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/mahdieh/mscclpp/test.json"
+
+
+
+mpirun -np 2 $MPI_ARGS --size 4K --n_iters 500  --n_graph_iters 100

From b1cc6494703940838671634fa48884de0ed53d1c Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 17 Mar 2026 19:59:35 +0000
Subject: [PATCH 101/132] re-format output

---
 python/test/executor_test.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index e5e8cdf25..9773be5ba 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -408,11 +408,16 @@ def main(
     if npkit_dump_dir != "":
         npkit.dump(npkit_dump_dir)
         npkit.shutdown()
-    print(
-        f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
-        f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} "
-        f"packet type: {packet_type}"
-    )
+    # Only rank 0 reports output
+    if mscclpp_group.my_rank == 0:
+        msg_size = result_buf.nbytes
+        bw = result_buf.nbytes / execution_time / 1e3   # GB/s
+        latency = execution_time                        # us
+
+        # Print header once
+        print(f"{'Message Size (B)':>18} {'BW (GB/s)':>12} {'Latency (us)':>14}     {'Packet Type':>12}")
+        print(f"{msg_size:18d} {bw:12.2f} {latency:14.2f}       {str(packet_type):>12}")
+
     executor = None
     mscclpp_group = None
 

From a4118eae7317586eb6bc95eaa418f3d0606c2139 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 17 Mar 2026 17:39:29 +0000
Subject: [PATCH 102/132] update the number of instances

---
 python/mscclpp/default_algos/mscclpp_send_recv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/mscclpp/default_algos/mscclpp_send_recv.py b/python/mscclpp/default_algos/mscclpp_send_recv.py
index ef052210c..c09cdc27e 100644
--- a/python/mscclpp/default_algos/mscclpp_send_recv.py
+++ b/python/mscclpp/default_algos/mscclpp_send_recv.py
@@ -21,7 +21,7 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=4
+        instances=2
     ):
         # Creating separate port channels for next and prev directions.
         # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer

From 289f89ddfe04d350fae79789485fe4b8382afd0b Mon Sep 17 00:00:00 2001
From: Ubuntu <binyli@microsoft.com>
Date: Thu, 12 Mar 2026 16:48:35 +0000
Subject: [PATCH 103/132] update

---
 python/mscclpp/default_algos/mscclpp_send_recv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/mscclpp/default_algos/mscclpp_send_recv.py b/python/mscclpp/default_algos/mscclpp_send_recv.py
index c09cdc27e..ed7cc9b73 100644
--- a/python/mscclpp/default_algos/mscclpp_send_recv.py
+++ b/python/mscclpp/default_algos/mscclpp_send_recv.py
@@ -21,7 +21,7 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=2
+        instances=1
     ):
         # Creating separate port channels for next and prev directions.
         # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer

From 1e6d4939a8ad05ae70573e7d21a98a7073f8ac47 Mon Sep 17 00:00:00 2001
From: Ubuntu <binyli@microsoft.com>
Date: Mon, 9 Mar 2026 22:40:35 +0000
Subject: [PATCH 104/132] update

---
 include/mscclpp/env.hpp | 4 ++++
 src/core/env.cpp        | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp
index fb1da22c4..c7575fcab 100644
--- a/include/mscclpp/env.hpp
+++ b/include/mscclpp/env.hpp
@@ -115,6 +115,10 @@ class Env {
   /// Default is false.
   const bool forceDisableGdr;
 
+  /// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
+  /// If unset or set to -1, it defaults to `EndpointConfig::Ib::DefaultGidIndex` (0).
+  const int ibGidIndex;
+
  private:
   Env();
 
diff --git a/src/core/env.cpp b/src/core/env.cpp
index 96f53492e..b48163e90 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -66,7 +66,8 @@ Env::Env()
       forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
       ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
       forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
-      forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
+      forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
+      ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", -1)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());

From 251873ca8eea007d659eee2c5dfdd553ab366133 Mon Sep 17 00:00:00 2001
From: Ubuntu <binyli@microsoft.com>
Date: Mon, 9 Mar 2026 22:38:08 +0000
Subject: [PATCH 105/132] update

---
 include/mscclpp/core.hpp      |  3 ++-
 src/core/executor/executor.cc | 35 +++++++++++++++++++++--------------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 37bdbd514..5b184f0a3 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -8,6 +8,7 @@
 #include <bitset>
 #include <future>
 #include <memory>
+#include <mscclpp/env.hpp>
 #include <mscclpp/errors.hpp>
 #include <mscclpp/gpu_data_types.hpp>
 #include <mscclpp/version.hpp>
@@ -430,7 +431,7 @@ struct EndpointConfig {
        int maxWrPerSend = DefaultMaxWrPerSend, Mode mode = Mode::Default)
         : deviceIndex(deviceIndex),
           port(port),
-          gidIndex(gidIndex),
+          gidIndex(env()->ibGidIndex > 0 ? env()->ibGidIndex : gidIndex),
           maxCqSize(maxCqSize),
           maxCqPollNum(maxCqPollNum),
           maxSendWr(maxSendWr),
diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc
index bf2caf97f..3020cbec9 100644
--- a/src/core/executor/executor.cc
+++ b/src/core/executor/executor.cc
@@ -109,7 +109,7 @@ namespace mscclpp {
 
 struct ExecutionContext {
   std::shared_ptr<ProxyService> proxyService;
-  std::unordered_map<int, Connection> connections;
+  std::vector<Connection> connections;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections;
   MemoryId localMemoryIdBegin = MemoryId(0);
 
@@ -121,8 +121,6 @@ struct ExecutionContext {
   // local registered memories to keep resources alive
   std::vector<mscclpp::RegisteredMemory> localRegisteredMemories;
 
-  std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores;
-  std::vector<mscclpp::SemaphoreId> proxySemaphores;
   std::vector<mscclpp::BaseMemoryChannel> memoryChannels;
   std::vector<mscclpp::BasePortChannel> portChannels;
   std::vector<mscclpp::SwitchChannel> nvlsChannels;
@@ -266,12 +264,24 @@ struct Executor::Impl {
       }
     };
 
-    std::vector<int> connectedPeers = plan.impl_->getConnectedPeers();
-    std::vector<std::shared_future<mscclpp::Connection>> connectionFutures;
-    for (int peer : connectedPeers) {
-      Transport transport =
-          !useIB(rank, peer, this->nranksPerNode) ? Transport::CudaIpc : IBs[rank % this->nranksPerNode];
-      connectionFutures.push_back(this->comm->connect(transport, peer));
+    std::unordered_map<int, int> peerTags;
+    Transport ibTransport = IBs[rank % this->nranksPerNode];
+    std::vector<std::shared_future<Connection>> connFutures;
+    for (ChannelType channelType : {ChannelType::MEMORY, ChannelType::PORT}) {
+      std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(channelType);
+      for (const auto& info : channelInfos) {
+        for (int peer : info.connectedPeers) {
+          Transport transport = useIB(rank, peer, this->nranksPerNode) ? ibTransport : Transport::CudaIpc;
+          connFutures.push_back(this->comm->connect(transport, peer, peerTags[peer]++));
+        }
+      }
+      channelInfos = plan.impl_->getUnpairedChannelInfos(nranks, channelType);
+      for (const auto& info : channelInfos) {
+        for (int peer : info.connectedPeers) {
+          Transport transport = useIB(rank, peer, this->nranksPerNode) ? ibTransport : Transport::CudaIpc;
+          connFutures.push_back(this->comm->connect(transport, peer, peerTags[peer]++));
+        }
+      }
     }
     for (size_t i = 0; i < connectionFutures.size(); i++) {
       context.connections[connectedPeers[i]] = connectionFutures[i].get();
@@ -360,18 +370,15 @@ struct Executor::Impl {
       proxySemaphores.push_back(context.proxyService->addSemaphore(sem.get()));
     }
 
-    context.memorySemaphores = std::move(memorySemaphores);
-    context.proxySemaphores = std::move(proxySemaphores);
-
     for (ChannelType channelType : channelTypes) {
       std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(channelType);
       int index = 0;
       for (ChannelInfo& info : channelInfos) {
         for (size_t i = 0; i < info.connectedPeers.size(); i++) {
           if (channelType == ChannelType::MEMORY) {
-            context.memoryChannels.emplace_back(context.memorySemaphores[index++]);
+            context.memoryChannels.emplace_back(memorySemaphores[index++]);
           } else if (channelType == ChannelType::PORT) {
-            context.portChannels.emplace_back(context.proxyService->basePortChannel(context.proxySemaphores[index++]));
+            context.portChannels.emplace_back(context.proxyService->basePortChannel(proxySemaphores[index++]));
           }
         }
       }

From 07d97f6f17e940f4502ee89e81c921bebf52c1cf Mon Sep 17 00:00:00 2001
From: Ubuntu <binyli@microsoft.com>
Date: Mon, 9 Mar 2026 20:27:28 +0000
Subject: [PATCH 106/132] Unique QP per channel and env-controlled GID index

- Change executor to create one connection (unique QP) per channel entry
  instead of sharing connections per peer. This is required for HostNoAtomic
  IB mode where each connection can only forward signals to one semaphore
  via setSignalForwardingDst.

- Add MSCCLPP_IB_GID_INDEX environment variable to override the default
  GID index (3) used for IB transport. Set to the desired GID index value,
  or leave unset/-1 to use the default.
---
 src/core/endpoint.cc | 8 +++++++-
 src/core/env.cpp     | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc
index 5ab4bad0a..3ae2e154a 100644
--- a/src/core/endpoint.cc
+++ b/src/core/endpoint.cc
@@ -49,8 +49,14 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
 
     int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
 
+    // Override GID index from environment variable if set
+    int gidIndex = config_.ib.gidIndex;
+    if (env()->ibGidIndex >= 0) {
+      gidIndex = env()->ibGidIndex;
+    }
+
     ibQp_ = contextImpl.getIbContext(config_.transport)
-                ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
+                ->createQp(config_.ib.port, gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
                            config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
     ibQpInfo_ = ibQp_->getInfo();
   } else if (config_.transport == Transport::Ethernet) {
diff --git a/src/core/env.cpp b/src/core/env.cpp
index b48163e90..2af5bddf0 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -96,6 +96,7 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
     logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
     logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
+    logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
   }
   return globalEnv;
 }

From 8cecfee270ebf7ac169f4c1a388dde5198c43b70 Mon Sep 17 00:00:00 2001
From: Ubuntu <binyli@microsoft.com>
Date: Mon, 9 Mar 2026 20:05:46 +0000
Subject: [PATCH 107/132] debug

---
 .../default_algos/mscclpp_send_recv.py        |  2 +-
 src/core/connection.cc                        |  8 +++++++
 src/core/executor/executor.cc                 | 22 ++++++++++++-------
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/python/mscclpp/default_algos/mscclpp_send_recv.py b/python/mscclpp/default_algos/mscclpp_send_recv.py
index ed7cc9b73..ef052210c 100644
--- a/python/mscclpp/default_algos/mscclpp_send_recv.py
+++ b/python/mscclpp/default_algos/mscclpp_send_recv.py
@@ -21,7 +21,7 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=1
+        instances=4
     ):
         # Creating separate port channels for next and prev directions.
         # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 8b6c0afbf..d0fb19e7d 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -309,6 +309,14 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
     // Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
     // The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
     auto qp = qp_.lock();
+    // dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
+    //                       localSignalGpuMap_ && localSignalGpuMap_->valid();
+    dataDirectEnabled_ = true;
+    if (dataDirectEnabled_) {
+      INFO(CONN, "IBConnection: Data Direct enabled");
+    }
+
+    // Pre-post receive requests for incoming write-with-imm
     int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
     for (int i = 0; i < maxRecvWr; ++i) {
       qp->stageRecv(/*wrId=*/0);
diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc
index 3020cbec9..b5510b630 100644
--- a/src/core/executor/executor.cc
+++ b/src/core/executor/executor.cc
@@ -96,6 +96,7 @@ namespace {
 auto hasIBDevices = []() { return mscclpp::getIBDeviceCount() > 0; };
 
 auto useIB = [](int rank1, int rank2, int nranksPerNode) {
+  return true;
   bool inSameNode = rank1 / nranksPerNode == rank2 / nranksPerNode;
   return hasIBDevices() && !inSameNode;
 };
@@ -109,7 +110,7 @@ namespace mscclpp {
 
 struct ExecutionContext {
   std::shared_ptr<ProxyService> proxyService;
-  std::vector<Connection> connections;
+  std::vector<Connection> connections;  // one connection (unique QP) per channel
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections;
   MemoryId localMemoryIdBegin = MemoryId(0);
 
@@ -264,7 +265,10 @@ struct Executor::Impl {
       }
     };
 
-    std::unordered_map<int, int> peerTags;
+    // Create one connection (unique QP) per channel entry. Each channel gets its own
+    // QP — no shared connections. This is required for HostNoAtomic IB mode where each
+    // connection can only forward signals to one semaphore via setSignalForwardingDst.
+    int tag = 0;
     Transport ibTransport = IBs[rank % this->nranksPerNode];
     std::vector<std::shared_future<Connection>> connFutures;
     for (ChannelType channelType : {ChannelType::MEMORY, ChannelType::PORT}) {
@@ -272,19 +276,20 @@ struct Executor::Impl {
       for (const auto& info : channelInfos) {
         for (int peer : info.connectedPeers) {
           Transport transport = useIB(rank, peer, this->nranksPerNode) ? ibTransport : Transport::CudaIpc;
-          connFutures.push_back(this->comm->connect(transport, peer, peerTags[peer]++));
+          connFutures.push_back(this->comm->connect(transport, peer, tag++));
         }
       }
       channelInfos = plan.impl_->getUnpairedChannelInfos(nranks, channelType);
       for (const auto& info : channelInfos) {
         for (int peer : info.connectedPeers) {
           Transport transport = useIB(rank, peer, this->nranksPerNode) ? ibTransport : Transport::CudaIpc;
-          connFutures.push_back(this->comm->connect(transport, peer, peerTags[peer]++));
+          connFutures.push_back(this->comm->connect(transport, peer, tag++));
         }
       }
     }
-    for (size_t i = 0; i < connectionFutures.size(); i++) {
-      context.connections[connectedPeers[i]] = connectionFutures[i].get();
+
+    for (auto& future : connFutures) {
+      context.connections.push_back(future.get());
     }
 
     std::vector<NvlsInfo> nvlsInfos = plan.impl_->nvlsInfos.at(rank);
@@ -338,10 +343,11 @@ struct Executor::Impl {
     std::vector<std::shared_future<Semaphore>> futureProxySemaphores;
     std::vector<std::shared_ptr<MemoryDevice2DeviceSemaphore>> memorySemaphores;
     std::vector<mscclpp::SemaphoreId> proxySemaphores;
+    int connIdx = 0;
     auto processChannelInfos = [&](std::vector<ChannelInfo>& channelInfos) {
       for (ChannelInfo& info : channelInfos) {
-        for (int peer : info.connectedPeers) {
-          auto connection = context.connections.at(peer);
+        for (size_t i = 0; i < info.connectedPeers.size(); i++) {
+          auto& connection = context.connections[connIdx++];
           if (info.channelType == ChannelType::MEMORY) {
             futureMemorySemaphores.push_back(this->comm->buildSemaphore(
                 connection, this->comm->remoteRankOf(connection), this->comm->tagOf(connection)));

From ad56728c6d2a3545edbd421f0d03165eadb21c35 Mon Sep 17 00:00:00 2001
From: Ubuntu <binyli@microsoft.com>
Date: Sun, 8 Mar 2026 23:36:43 +0000
Subject: [PATCH 108/132] fix

---
 src/core/ib.cc                        | 58 +++++++++++----------------
 src/core/include/execution_kernel.hpp |  8 ++--
 2 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/src/core/ib.cc b/src/core/ib.cc
index 557f04268..f4972f46b 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -84,50 +84,40 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu
   if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
 #if !defined(MSCCLPP_USE_ROCM)
     int fd = -1;
-    size_t rangeSize = pages * pageSize;
-
-    // Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU
-    // bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag
-    // routes DMA through the Data Direct engine for correct ordering and higher throughput.
-    // Fall back to the default (non-PCIe) mapping if the flag is unsupported.
-#if (CUDA_VERSION >= 12030)
-    CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
-                                                   CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
-    if (cuRes != CUDA_SUCCESS || fd < 0) {
-      if (fd >= 0) ::close(fd);
-      fd = -1;
-    }
-    bool usedPcieFlag = (fd >= 0);
-#endif  // CUDA_VERSION >= 12030
-    if (fd < 0) {
-      MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
-    }
-
-    // Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API
-    // which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs.
     size_t offsetInDmaBuf = buffIntPtr % pageSize;
     int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
                       IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
 
 #if defined(MSCCLPP_USE_MLX5DV)
-    if (isDataDirect) {
-      mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+    if (isMlx5 && MLX5DV::isAvailable()) {
+      // DATA_DIRECT requires a PCIe BAR1-mapped DMA-BUF fd (CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE).
+      // This matches the perftest approach for achieving full bandwidth with DATA_DIRECT.
+      CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize,
+                                                     CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+                                                     CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
+      if (cuRes == CUDA_SUCCESS && fd >= 0) {
+        mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+        if (mr_ != nullptr) {
+          isDataDirect_ = true;
+        } else {
+          INFO(NET, "mlx5dv_reg_dmabuf_mr failed with PCIe DMA-BUF, falling back to regular DMA-BUF");
+          ::close(fd);
+          fd = -1;
+        }
+      } else {
+        INFO(NET, "cuMemGetHandleForAddressRange with PCIE flag failed (", cuRes, "), falling back");
+        if (fd >= 0) { ::close(fd); fd = -1; }
+      }
     }
 #endif
     if (mr_ == nullptr) {
+      if (fd < 0) {
+        MSCCLPP_CUTHROW(
+            cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+      }
       mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
     }
-
-    // If MR registration failed with a PCIe-mapped fd, retry with the default mapping.
-#if (CUDA_VERSION >= 12030)
-    if (mr_ == nullptr && usedPcieFlag) {
-      ::close(fd);
-      MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
-      mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
-    }
-#endif  // CUDA_VERSION >= 12030
-
-    ::close(fd);
+    if (fd >= 0) ::close(fd);
     if (mr_ == nullptr) {
       THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
     }
diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp
index 20147c30f..7719e61ad 100644
--- a/src/core/include/execution_kernel.hpp
+++ b/src/core/include/execution_kernel.hpp
@@ -173,11 +173,11 @@ MSCCLPP_DEVICE_INLINE void handlePut(const Operation& op, void* input, void* out
       uint32_t dstOffset =
           dstOffsets[tid] + getOffset<ReuseScratch>(portChannelBufferTypes_[op.outputBufferRefs[tid].id], offset);
       uint32_t srcOffset = srcOffsets[tid] + getOffset<ReuseScratch>(op.inputBufferRefs[tid].type, offset);
-      if constexpr (PutWithSignal) {
-        portChannels_[channelIndexes[tid]].putWithSignal(dstMemoryId, dstOffset, srcMemoryId, srcOffset, size);
-      } else if constexpr (PutWithSignalAndFlush) {
+      if constexpr (PutWithSignalAndFlush) {
         portChannels_[channelIndexes[tid]].putWithSignalAndFlush(dstMemoryId, (uint64_t)dstOffset, srcMemoryId,
-                                                                 (uint64_t)srcOffsets, size);
+                                                                 (uint64_t)srcOffset, size);
+      } else if constexpr (PutWithSignal) {
+        portChannels_[channelIndexes[tid]].putWithSignal(dstMemoryId, dstOffset, srcMemoryId, srcOffset, size);
       } else {
         portChannels_[channelIndexes[tid]].put(dstMemoryId, dstOffset, srcMemoryId, srcOffset, size);
       }

From e487f831e6483a5e7ab00e9ee1b3878754b69308 Mon Sep 17 00:00:00 2001
From: Ubuntu <binyli@microsoft.com>
Date: Fri, 6 Mar 2026 18:25:03 +0000
Subject: [PATCH 109/132] debug

---
 cmake/FindGDRCopy.cmake         |  12 +-
 include/mscclpp/core.hpp        |   2 +-
 python/mscclpp/language/rank.py |  15 ++-
 test.json                       | 218 ++++++++++++++++++++++++++++++++
 4 files changed, 237 insertions(+), 10 deletions(-)
 create mode 100644 test.json

diff --git a/cmake/FindGDRCopy.cmake b/cmake/FindGDRCopy.cmake
index 54e0ba1c6..c1f786aec 100644
--- a/cmake/FindGDRCopy.cmake
+++ b/cmake/FindGDRCopy.cmake
@@ -30,15 +30,19 @@ find_library(GDRCOPY_LIBRARIES
   ${GDRCOPY_ROOT_DIR}/lib
   /usr/local/lib
   /usr/lib
-  /usr/lib/x86_64-linux-gnu)
+  /usr/lib/x86_64-linux-gnu
+  /usr/lib/aarch64-linux-gnu)
 
 if(GDRCOPY_INCLUDE_DIRS)
-    include(CheckSymbolExists)
+    include(CheckCXXSourceCompiles)
     set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
     set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
-    check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
-    unset(CMAKE_REQUIRED_LIBRARIES)
+    check_cxx_source_compiles("
+      #include <gdrapi.h>
+      int main() { gdr_pin_buffer_v2(0, 0, 0, 0, 0); return 0; }
+    " GDRCOPY_HAS_PIN_BUFFER_V2)
     unset(CMAKE_REQUIRED_INCLUDES)
+    unset(CMAKE_REQUIRED_LIBRARIES)
     if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
         message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
         set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND)
diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 5b184f0a3..4aeab6545 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -390,7 +390,7 @@ struct EndpointConfig {
     };
 
     static constexpr int DefaultPort = -1;
-    static constexpr int DefaultGidIndex = 0;
+    static constexpr int DefaultGidIndex = 3;
     static constexpr int DefaultMaxCqSize = 1024;
     static constexpr int DefaultMaxCqPollNum = 1;
     static constexpr int DefaultMaxSendWr = 8192;
diff --git a/python/mscclpp/language/rank.py b/python/mscclpp/language/rank.py
index e5b7aab89..0c38cb064 100644
--- a/python/mscclpp/language/rank.py
+++ b/python/mscclpp/language/rank.py
@@ -304,11 +304,16 @@ def __init__(self, rank: int, buffer_type: BufferType, offset: int, size: int):
         self.size = offset + size
 
     def __getitem__(self, key):
-        if self.offset + key.stop > self.size:
-            raise RuntimeError(
-                f"Index range from {self.offset + key.start} - {self.offset + key.stop} is out of bounds for buffer {self.buffer_type}. Buffer size: {self.size}"
-            )
-        return Chunk(self.rank, self.buffer_type, self.offset + key.start, key.stop - key.start)
+        if isinstance(key, slice):
+            start = key.start if key.start is not None else 0
+            stop = key.stop if key.stop is not None else (self.size - self.offset)
+            if self.offset + stop > self.size:
+                raise RuntimeError(
+                    f"Index range from {self.offset + start} - {self.offset + stop} is out of bounds for buffer {self.buffer_type}. Buffer size: {self.size}"
+                )
+            return Chunk(self.rank, self.buffer_type, self.offset + start, stop - start)
+        else:
+            raise TypeError(f"Buffer indices must be slices, not {type(key).__name__}")
 
 
 class Buffer(BaseBuffer):
diff --git a/test.json b/test.json
new file mode 100644
index 000000000..294c2a13e
--- /dev/null
+++ b/test.json
@@ -0,0 +1,218 @@
+{
+  "name": "send_recv_test",
+  "collective": "test",
+  "protocol": "Simple",
+  "inplace": false,
+  "reuse_resources": false,
+  "gpus": [
+    {
+      "id": 0,
+      "input_chunks": 1,
+      "output_chunks": 1,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "put",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            1
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 1,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 1,
+      "input_chunks": 1,
+      "output_chunks": 1,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "put",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            0
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 0,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    }
+  ],
+  "num_threads_per_block": 1024,
+  "use_double_scratch_buffer": false,
+  "buffer_alignment": 16,
+  "min_message_size": 0,
+  "max_message_size": 18446744073709551615
+}

From 2c3f125d4c1481b53bfd2a3c267e15946f7db4d8 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Mon, 6 Apr 2026 03:29:54 +0000
Subject: [PATCH 110/132] add changes from ib and connection

---
 src/core/connection.cc |  8 ------
 src/core/ib.cc         | 58 +++++++++++++++++++++++++-----------------
 2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/src/core/connection.cc b/src/core/connection.cc
index d0fb19e7d..8b6c0afbf 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -309,14 +309,6 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
     // Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
     // The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
     auto qp = qp_.lock();
-    // dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
-    //                       localSignalGpuMap_ && localSignalGpuMap_->valid();
-    dataDirectEnabled_ = true;
-    if (dataDirectEnabled_) {
-      INFO(CONN, "IBConnection: Data Direct enabled");
-    }
-
-    // Pre-post receive requests for incoming write-with-imm
     int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
     for (int i = 0; i < maxRecvWr; ++i) {
       qp->stageRecv(/*wrId=*/0);
diff --git a/src/core/ib.cc b/src/core/ib.cc
index f4972f46b..557f04268 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -84,40 +84,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu
   if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
 #if !defined(MSCCLPP_USE_ROCM)
     int fd = -1;
+    size_t rangeSize = pages * pageSize;
+
+    // Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU
+    // bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag
+    // routes DMA through the Data Direct engine for correct ordering and higher throughput.
+    // Fall back to the default (non-PCIe) mapping if the flag is unsupported.
+#if (CUDA_VERSION >= 12030)
+    CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+                                                   CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
+    if (cuRes != CUDA_SUCCESS || fd < 0) {
+      if (fd >= 0) ::close(fd);
+      fd = -1;
+    }
+    bool usedPcieFlag = (fd >= 0);
+#endif  // CUDA_VERSION >= 12030
+    if (fd < 0) {
+      MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+    }
+
+    // Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API
+    // which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs.
     size_t offsetInDmaBuf = buffIntPtr % pageSize;
     int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
                       IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
 
 #if defined(MSCCLPP_USE_MLX5DV)
-    if (isMlx5 && MLX5DV::isAvailable()) {
-      // DATA_DIRECT requires a PCIe BAR1-mapped DMA-BUF fd (CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE).
-      // This matches the perftest approach for achieving full bandwidth with DATA_DIRECT.
-      CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize,
-                                                     CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
-                                                     CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
-      if (cuRes == CUDA_SUCCESS && fd >= 0) {
-        mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
-        if (mr_ != nullptr) {
-          isDataDirect_ = true;
-        } else {
-          INFO(NET, "mlx5dv_reg_dmabuf_mr failed with PCIe DMA-BUF, falling back to regular DMA-BUF");
-          ::close(fd);
-          fd = -1;
-        }
-      } else {
-        INFO(NET, "cuMemGetHandleForAddressRange with PCIE flag failed (", cuRes, "), falling back");
-        if (fd >= 0) { ::close(fd); fd = -1; }
-      }
+    if (isDataDirect) {
+      mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
     }
 #endif
     if (mr_ == nullptr) {
-      if (fd < 0) {
-        MSCCLPP_CUTHROW(
-            cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
-      }
       mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
     }
-    if (fd >= 0) ::close(fd);
+
+    // If MR registration failed with a PCIe-mapped fd, retry with the default mapping.
+#if (CUDA_VERSION >= 12030)
+    if (mr_ == nullptr && usedPcieFlag) {
+      ::close(fd);
+      MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+      mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+    }
+#endif  // CUDA_VERSION >= 12030
+
+    ::close(fd);
     if (mr_ == nullptr) {
       THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
     }

From 1a065dd6ada25cc337135ba2a0f75d1e36122dff Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Mon, 6 Apr 2026 20:06:21 +0000
Subject: [PATCH 111/132] add help scripts

---
 copyjson.sh                  |  17 +++
 python/test/executor_test.py | 265 ++++++-----------------------------
 run-sendrecv2.sh             |  12 ++
 3 files changed, 75 insertions(+), 219 deletions(-)
 create mode 100755 copyjson.sh
 create mode 100755 run-sendrecv2.sh

diff --git a/copyjson.sh b/copyjson.sh
new file mode 100755
index 000000000..9e0771e13
--- /dev/null
+++ b/copyjson.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -ex
+
+# Check if the number of arguments is exactly 1
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <hostfile>"
+    exit 1
+fi
+export MSCCLPPHOME=/home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/
+
+HOSTFILE=$1
+
+parallel-scp -h "$HOSTFILE" -p128 -t1800 -r  ./*.json $MSCCLPPHOME
+
+parallel-scp -h "$HOSTFILE" -p128 -t1800 -r ./python/test/executor_test.py $MSCCLPPHOME/python/test/
+
+parallel-scp -h "$HOSTFILE" -p128 -t1800 -r ./python/test/executor_test_verifier.cu $MSCCLPPHOME/python/test/
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 9773be5ba..eeace1a12 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -20,7 +20,7 @@
 
 
 def parse_dtype(dtype_str):
-    """Convert a human-readable data type string to a CuPy data type."""
+    """Convert a human-readable data type string to a numpy data type."""
     dtype_str = dtype_str.strip().lower()
     if dtype_str == "float16":
         return cp.float16
@@ -33,18 +33,18 @@ def parse_dtype(dtype_str):
 
 
 def bench_time(n_iters: int, n_graph_iters: int, func):
-    # Capture CUDA graph for n_iters of the kernel launch
+    # capture cuda graph for n_iters of the kernel launch
     stream = cp.cuda.Stream(non_blocking=True)
     with stream:
         stream.begin_capture()
-        for _ in range(n_iters):
+        for i in range(n_iters):
             func(stream)
         graph = stream.end_capture()
 
-    # Warm-up round
+    # now run a warm up round
     graph.launch(stream)
 
-    # Benchmark and measure time
+    # now run the benchmark and measure time
     start = cp.cuda.Event()
     end = cp.cuda.Event()
 
@@ -54,7 +54,6 @@ def bench_time(n_iters: int, n_graph_iters: int, func):
     end.record(stream)
     end.synchronize()
 
-    # Return average execution time in microseconds
     return cp.cuda.get_elapsed_time(start, end) / n_iters * 1000.0 / n_graph_iters
 
 
@@ -85,16 +84,11 @@ def bench_correctness(
 
     file_dir = os.path.dirname(os.path.abspath(__file__))
     fill_data_kernel = KernelBuilder(
-        file="executor_test_verifier.cu",
-        kernel_name=fill_data_kernel_name,
-        file_dir=file_dir,
+        file="executor_test_verifier.cu", kernel_name=fill_data_kernel_name, file_dir=file_dir
     ).get_compiled_kernel()
     test_data_kernel = KernelBuilder(
-        file="executor_test_verifier.cu",
-        kernel_name=test_data_kernel_name,
-        file_dir=file_dir,
+        file="executor_test_verifier.cu", kernel_name=test_data_kernel_name, file_dir=file_dir
     ).get_compiled_kernel()
-
     nblocks = 64
     nthreads = 1024
 
@@ -104,72 +98,27 @@ def bench_correctness(
         for i in range(n_iters):
             fill_data_params = pack(input_buf) + struct.pack("Q", input_buf.nbytes // type_size) + pack(rank, i)
             fill_data_kernel.launch_kernel(fill_data_params, nblocks, nthreads, 0, stream)
-
             func(stream)
-
             test_data_params = (
-                pack(result_buf, test_buf)
-                + struct.pack("Q", input_buf.nbytes // type_size)
-                + pack(num_ranks, rank, i)
+                pack(result_buf, test_buf) + struct.pack("Q", input_buf.nbytes // type_size) + pack(num_ranks, rank, i)
             )
             test_data_kernel.launch_kernel(test_data_params, nblocks, nthreads, 0, stream)
-
         graph = stream.end_capture()
-
     graph.launch(stream)
     stream.synchronize()
 
 
 def parse_size(size_str):
-    """Convert a human-readable buffer size string to an integer (bytes)."""
+    """Convert a human-readable buffer size string to an integer."""
     size_str = size_str.strip()
     if not size_str:
-        raise ValueError("Size string cannot be empty")
-
+        raise ValueError("Size string can not be empty")
     units = {"K": 1024, "M": 1024**2, "G": 1024**3}
     if size_str[-1].upper() in units:
         return int(size_str[:-1]) * units[size_str[-1].upper()]
-    return int(size_str)
-
-def parse_size_list(size_arg):
-    """
-    Accept:
-      - single size: '1M'
-      - comma-separated list: '1K,2K,4K'
-      - geometric range: '1K:64K:2' -> start:end:factor
-
-    Returns a list of integer sizes in bytes.
-    """
-    size_arg = size_arg.strip()
-
-    if "," in size_arg:
-        return [parse_size(x) for x in size_arg.split(",")]
-
-    if ":" in size_arg:
-        parts = size_arg.split(":")
-        if len(parts) != 3:
-            raise ValueError("Range format must be start:end:factor, e.g. 1K:64K:2")
-
-        start = parse_size(parts[0])
-        end = parse_size(parts[1])
-        factor = int(parts[2])
-
-        if start <= 0:
-            raise ValueError("Start must be positive")
-        if end < start:
-            raise ValueError("End must be >= start")
-        if factor <= 1:
-            raise ValueError("Factor must be greater than 1")
-
-        sizes = []
-        current = start
-        while current <= end:
-            sizes.append(current)
-            current *= factor
-
-        return sizes
+    else:
+        return int(size_str)
 
-    return [parse_size(size_arg)]
 
 def dtype_to_mscclpp_dtype(dtype):
     if dtype == cp.float16:
@@ -191,23 +140,22 @@ def build_bufs(
     num_ranks: int,
 ):
     type_size = cp.dtype(dtype).itemsize
-    assert (size % type_size) == 0, f"size {size} not multiple of type size {type_size}"
+    assert (size % type_size) == 0, "size %d not multiple of type size %d" % (size, type_size)
     nelems = size // type_size
 
     if "allgather" in collective:
-        assert (nelems % num_ranks) == 0, f"nelems {nelems} not multiple of num_ranks {num_ranks}"
+        assert (nelems % num_ranks) == 0, "nelems %d not multiple of num_ranks %d" % (nelems, num_ranks)
         nelems_input = nelems if in_place else nelems // num_ranks
     else:
         nelems_input = nelems
 
     if "reducescatter" in collective:
-        assert (nelems % num_ranks) == 0, f"nelems {nelems} not multiple of num_ranks {num_ranks}"
+        assert (nelems % num_ranks) == 0, "nelems %d not multiple of num_ranks %d" % (nelems, num_ranks)
         nelems_output = nelems // num_ranks
     else:
         nelems_output = nelems
 
     result_buf = GpuBuffer(nelems_output, dtype=dtype)
-
     if in_place:
         if "allgather" in collective:
             input_buf = cp.split(result_buf, num_ranks)[rank]
@@ -228,7 +176,7 @@ def build_bufs(
 
 def main(
     execution_plan_path: str,
-    sizes: list[int],
+    size: int,
     in_place: bool = True,
     dtype_str: str = "float16",
     packet_type: PacketType = PacketType.LL16,
@@ -236,18 +184,14 @@ def main(
     n_graph_iters: int = 10,
 ):
     mscclpp_group = CommGroup(MPI.COMM_WORLD)
-    nranks = mscclpp_group.nranks
-    my_rank = mscclpp_group.my_rank
-
-    cp.cuda.Device(my_rank % mscclpp_group.nranks_per_node).use()
-
+    cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
     executor = Executor(mscclpp_group.communicator)
     npkit_dump_dir = env().npkit_dump_dir
     if npkit_dump_dir != "":
-        npkit.init(my_rank)
-
-    execution_plan = ExecutionPlan(execution_plan_path, my_rank)
+        npkit.init(mscclpp_group.my_rank)
+    execution_plan = ExecutionPlan(execution_plan_path, mscclpp_group.my_rank)
     collective = execution_plan.collective
+
     dtype = parse_dtype(dtype_str)
     input_buf, result_buf, test_buf, nelem = build_bufs(
         collective,
@@ -258,78 +202,20 @@ def main(
         mscclpp_group.nranks,
     )
 
-    # Print header once
-    if my_rank == 0:
-        print(
-            f"{'NRanks':>8} {'Message Size (B)':>18} {'BW (GB/s)':>12} "
-            f"{'Latency (us)':>14}      {'Packet Type':>12}"
-        )
-
-    for size in sizes:
-        input_buf, result_buf, test_buf = build_bufs(
-            collective,
-            size,
-            in_place,
-            dtype,
-            my_rank,
-            nranks,
-        )
-
-        executor_func = lambda stream, in_buf=input_buf, out_buf=result_buf: executor.execute(
-            my_rank,
-            in_buf.data.ptr,
-            out_buf.data.ptr,
-            in_buf.nbytes,
-            out_buf.nbytes,
-            dtype_to_mscclpp_dtype(dtype),
-            execution_plan,
-            stream.ptr,
-            packet_type,
-        )
-
-        #mscclpp_group.barrier()
-
-        # Optional correctness check
-        # bench_correctness(
-        #     collective,
-        #     input_buf,
-        #     result_buf,
-        #     test_buf,
-        #     dtype_str,
-        #     my_rank,
-        #     nranks,
-        #     n_iters,
-        #     executor_func,
-        # )
-
-        mscclpp_group.barrier()
-        execution_time = bench_time(n_iters, n_graph_iters, executor_func)
-        #mscclpp_group.barrier()
-
-        if my_rank == 0:
-            msg_size = size
-            bw = result_buf.nbytes / execution_time / 1e3  # GB/s
-            latency = execution_time  # us
-
-            print(
-                f"{nranks:8d} {msg_size:18d} {bw:12.2f} "
-                f"{latency:14.2f}       {str(packet_type):>12}"
-            )
-
-        # Release buffers for this size
-        input_buf = None
-        result_buf = None
-        test_buf = None
-
-        #mscclpp_group.barrier()
-
-    if npkit_dump_dir != "":
-        npkit.dump(npkit_dump_dir)
-        npkit.shutdown()
+    executor_func = lambda stream: executor.execute(
+        mscclpp_group.my_rank,
+        input_buf.data.ptr,
+        result_buf.data.ptr,
+        input_buf.nbytes,
+        result_buf.nbytes,
+        dtype_to_mscclpp_dtype(dtype),
+        execution_plan,
+        stream.ptr,
+        packet_type,
+    )
 
-        # Print header once
-        print(f"{'NRanks':>8}  {'Message Size (B)':>18} {'BW (GB/s)':>12} {'Latency (us)':>14}     {'Packet Type':>12}")
-        print(f"{nranks:8d}  {msg_size:18d} {bw:12.2f} {latency:14.2f}       {str(packet_type):>12}")
+    mscclpp_group.barrier()
+    print("size= ", size, "nelem= ", nelem)
 
     # Sentinel fill: choose something unlikely in your pattern
     result_buf.fill(cp.float16(123.0))
@@ -357,67 +243,17 @@ def main(
         executor_func,
     )
 
-        executor_func = lambda stream, in_buf=input_buf, out_buf=result_buf: executor.execute(
-            my_rank,
-            in_buf.data.ptr,
-            out_buf.data.ptr,
-            in_buf.nbytes,
-            out_buf.nbytes,
-            dtype_to_mscclpp_dtype(dtype),
-            execution_plan,
-            stream.ptr,
-            packet_type,
-        )
-
-        mscclpp_group.barrier()
-
-        # Optional correctness check
-        # bench_correctness(
-        #     collective,
-        #     input_buf,
-        #     result_buf,
-        #     test_buf,
-        #     dtype_str,
-        #     my_rank,
-        #     nranks,
-        #     n_iters,
-        #     executor_func,
-        # )
-
-        mscclpp_group.barrier()
-        execution_time = bench_time(n_iters, n_graph_iters, executor_func)
-        mscclpp_group.barrier()
-
-        if my_rank == 0:
-            msg_size = size
-            bw = result_buf.nbytes / execution_time / 1e3  # GB/s
-            latency = execution_time  # us
-
-            print(
-                f"{nranks:8d} {msg_size:18d} {bw:12.2f} "
-                f"{latency:14.2f}       {str(packet_type):>12}"
-            )
-
-        # Release buffers for this size
-        input_buf = None
-        result_buf = None
-        test_buf = None
-
-        mscclpp_group.barrier()
-
-    if npkit_dump_dir != "":
+    mscclpp_group.barrier()
+    execution_time = bench_time(n_iters, n_graph_iters, executor_func)
+    if npkit_dump_dir is not None:
         npkit.dump(npkit_dump_dir)
         npkit.shutdown()
-    # Only rank 0 reports output
-    if mscclpp_group.my_rank == 0:
-        msg_size = result_buf.nbytes
-        bw = result_buf.nbytes / execution_time / 1e3   # GB/s
-        latency = execution_time                        # us
-
-        # Print header once
-        print(f"{'Message Size (B)':>18} {'BW (GB/s)':>12} {'Latency (us)':>14}     {'Packet Type':>12}")
-        print(f"{msg_size:18d} {bw:12.2f} {latency:14.2f}       {str(packet_type):>12}")
-
+    print(
+        f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
+        f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} "
+        f"bandwidth: {result_buf.nbytes / (execution_time * 1e-6) / (1024**3):.2f} GB/s, "
+        f"packet type: {packet_type}"
+    )
     executor = None
     mscclpp_group = None
 
@@ -425,16 +261,8 @@ def main(
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
-    parser.add_argument(
-        "--size",
-        type=str,
-        required=True,
-        help=(
-            "Single size (e.g. 1M), comma-separated list (e.g. 1K,2K,4K), "
-            "or range start:end:factor (e.g. 1K:64K:2)"
-        ),
-    )
-    parser.add_argument("--in_place", action="store_true", help="Flag to define an in-place operation")
+    parser.add_argument("--size", type=str, required=True)
+    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
     parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
     parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
     parser.add_argument("--n_iters", type=int, default=10)
@@ -445,11 +273,10 @@ def main(
     if args.packet_type == "LL8":
         packet_type = PacketType.LL8
 
-    buffer_sizes = parse_size_list(args.size)
-
+    buffer_size = parse_size(args.size)
     main(
         args.execution_plan_path,
-        buffer_sizes,
+        buffer_size,
         args.in_place,
         args.dtype,
         packet_type,
diff --git a/run-sendrecv2.sh b/run-sendrecv2.sh
new file mode 100755
index 000000000..556cc09dd
--- /dev/null
+++ b/run-sendrecv2.sh
@@ -0,0 +1,12 @@
+module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
+
+MPI_ARGS=""
+MPI_ARGS+=" -x CUDA_VISIBLE_DEVICES=1 -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
+MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
+MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
+MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/mscclpp/bin/:$PATH "
+MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_0"
+MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/mscclpp/bin/python3   /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/sendrecv.json"
+
+
+mpirun -np 2 --hostfile ./hosts --map-by ppr:1:node  $MPI_ARGS --size 1K

From 812f6cfdede1a7102a105e9530cada27f9defed6 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 7 Apr 2026 01:32:54 +0000
Subject: [PATCH 112/132] fix hang on 4 ranks and make send/recv test more like
 nccl-test

---
 .../default_algos/mscclpp_send_recv.py        | 123 +++++++++++-------
 1 file changed, 78 insertions(+), 45 deletions(-)

diff --git a/python/mscclpp/default_algos/mscclpp_send_recv.py b/python/mscclpp/default_algos/mscclpp_send_recv.py
index ef052210c..7f68fe861 100644
--- a/python/mscclpp/default_algos/mscclpp_send_recv.py
+++ b/python/mscclpp/default_algos/mscclpp_send_recv.py
@@ -12,6 +12,7 @@
 def send_recv_test(name, nnodes, gpus_per_node, split_mask):
     gpu_size = nnodes * gpus_per_node
     collective = TestCollective(gpu_size, 1, 1)
+
     with CollectiveProgram(
         name,
         collective,
@@ -21,70 +22,102 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=4
+        instances=1,   # ✅ correctness-first
     ):
-        # Creating separate port channels for next and prev directions.
-        # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer
-        # and get distinct tags. To ensure cross-rank tag matching (rank A's prev_channel signal
-        # arrives at rank B's next_channel wait), we create channels in opposite order for the
-        # "higher" rank so that tags cross-match:
-        #   Lower rank:  [next(tag0), prev(tag1)]
-        #   Higher rank:  [prev(tag0), next(tag1)]
-        # Then lower.prev(tag1) == higher.next(tag1) ✓ and higher.prev(tag0) == lower.next(tag0) ✓
-        # When prev != next (3+ nodes), each channel targets a different peer so each gets tag 0
-        # and this ordering doesn't matter.
+
+        # Ring grouping
         group_size = split_mask + 1
         num_groups = gpu_size // group_size
-        next_channels = {}  # channel for sending to next rank
-        prev_channels = {}  # channel for receiving from prev rank
+
+        next_channels = {}
+        prev_channels = {}
         prev_next_ids = {}
+
+        # ------------------------------------------------------------------
+        # Channel creation (parity-based for deterministic tag matching)
+        # ------------------------------------------------------------------
         for node in range(nnodes):
             for gpu in range(gpus_per_node):
-                global_rank_id = gpu + gpus_per_node * node
-                position_in_group = global_rank_id & split_mask
-                group_id = global_rank_id // group_size
-                next_group_id = (group_id + 1) % num_groups
-                next_global_rank_id = next_group_id * group_size + position_in_group
-                prev_group_id = (group_id - 1 + num_groups) % num_groups
-                prev_global_rank_id = prev_group_id * group_size + position_in_group
-                if prev_global_rank_id == next_global_rank_id and global_rank_id > prev_global_rank_id:
-                    # Higher rank: create prev first, then next (swapped order)
-                    prev_channels[global_rank_id] = PortChannel(prev_global_rank_id, global_rank_id)
-                    next_channels[global_rank_id] = PortChannel(next_global_rank_id, global_rank_id)
+                rank = gpu + gpus_per_node * node
+
+                pos = rank & split_mask
+                group = rank // group_size
+
+                next_group = (group + 1) % num_groups
+                prev_group = (group - 1 + num_groups) % num_groups
+
+                next_rank = next_group * group_size + pos
+                prev_rank = prev_group * group_size + pos
+
+                # ✅ parity-based creation order
+                if (rank & 1) == 0:
+                    next_channels[rank] = PortChannel(next_rank, rank)
+                    prev_channels[rank] = PortChannel(prev_rank, rank)
                 else:
-                    # Lower rank or different peers: create next first, then prev
-                    next_channels[global_rank_id] = PortChannel(next_global_rank_id, global_rank_id)
-                    prev_channels[global_rank_id] = PortChannel(prev_global_rank_id, global_rank_id)
-                prev_next_ids[global_rank_id] = (prev_global_rank_id, next_global_rank_id)
+                    prev_channels[rank] = PortChannel(prev_rank, rank)
+                    next_channels[rank] = PortChannel(next_rank, rank)
+
+                prev_next_ids[rank] = (prev_rank, next_rank)
 
-        # sync with the next rank and the previous rank in the group
+        # ------------------------------------------------------------------
+        # Ring send/recv (deadlock-free)
+        # ------------------------------------------------------------------
         for node in range(nnodes):
             for gpu in range(gpus_per_node):
-                global_rank_id = gpu + gpus_per_node * node
-                prev_global_rank_id, next_global_rank_id = prev_next_ids[global_rank_id]
-                prev_channels[global_rank_id].signal(tb=0, data_sync=SyncType.none)
-                next_channels[global_rank_id].wait(tb=0, data_sync=SyncType.after)
-                
-                src_rank = Rank(global_rank_id)
-                src_buffer = src_rank.get_input_buffer()
-                dst_rank = Rank(next_global_rank_id)
-                dst_buffer = dst_rank.get_output_buffer()
-
-                next_channels[global_rank_id].put_with_signal(dst_buffer[:], src_buffer[:], tb=0)
-                prev_channels[global_rank_id].wait(tb=0, data_sync=SyncType.none)
-                
+                rank = gpu + gpus_per_node * node
+                prev_rank, next_rank = prev_next_ids[rank]
+
+                ch_from_prev = prev_channels[rank]
+                ch_to_next = next_channels[rank]
+
+                src_rank = Rank(rank)
+                src_buf = src_rank.get_input_buffer()
+                src_chunk = src_buf[0:src_buf.size]
+
+                dst_rank = Rank(next_rank)
+                dst_buf = dst_rank.get_output_buffer()
+                dst_chunk = dst_buf[0:dst_buf.size]
+
+                if rank == 0:
+                    # ✅ starter sends first
+                    ch_to_next.put_with_signal_and_flush(
+                        dst_chunk,
+                        src_chunk,
+                        tb=0,
+                    )
+                    # then receive from prev
+                    ch_from_prev.wait(tb=0, data_sync=SyncType.after)
+                else:
+                    # ✅ everyone else receives first
+                    ch_from_prev.wait(tb=0, data_sync=SyncType.after)
+                    ch_to_next.put_with_signal_and_flush(
+                        dst_chunk,
+                        src_chunk,
+                        tb=0,
+                    )
+
         print(JSON())
 
 
+# ----------------------------------------------------------------------
+# CLI
+# ----------------------------------------------------------------------
 parser = argparse.ArgumentParser()
-
 parser.add_argument("--name", type=str, help="name of the program")
 parser.add_argument("--nnodes", type=int, default=1, help="number of nodes")
 parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node")
-parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x3, help="split mask (e.g. 0x3)")
+parser.add_argument(
+    "--split_mask",
+    type=lambda x: int(x, 0),
+    default=0x3,
+    help="split mask (e.g. 0x3)",
+)
 
 args = parser.parse_args()
 
 send_recv_test(
-    args.name, args.nnodes, args.gpus_per_node, args.split_mask
+    args.name,
+    args.nnodes,
+    args.gpus_per_node,
+    args.split_mask,
 )

From 3f2ade22cb043ded33d7fde801082d1f37fc5aef Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Tue, 7 Apr 2026 01:40:15 +0000
Subject: [PATCH 113/132] add barrier

---
 python/test/executor_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index eeace1a12..1175d6298 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -230,6 +230,8 @@ def main(
     # Count how many elements changed
     changed = cp.count_nonzero(result_buf != cp.float16(123.0)).item()
     print("changed elements:", changed, "out of", result_buf.size)
+    cp.cuda.runtime.deviceSynchronize()
+    mscclpp_group.barrier()
 
     bench_correctness(
         collective,

From 6d8fb00a91e6a12bfa07f42f82f4574b390ac3af Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Thu, 9 Apr 2026 15:58:07 +0000
Subject: [PATCH 114/132] add extra signal/wait and avoid local flush

---
 .../default_algos/mscclpp_send_recv.py        | 243 ++++++++++++------
 1 file changed, 159 insertions(+), 84 deletions(-)

diff --git a/python/mscclpp/default_algos/mscclpp_send_recv.py b/python/mscclpp/default_algos/mscclpp_send_recv.py
index 7f68fe861..d4ce00042 100644
--- a/python/mscclpp/default_algos/mscclpp_send_recv.py
+++ b/python/mscclpp/default_algos/mscclpp_send_recv.py
@@ -9,93 +9,175 @@
 from mscclpp.language.collectives import *
 
 
-def send_recv_test(name, nnodes, gpus_per_node, split_mask):
-    gpu_size = nnodes * gpus_per_node
-    collective = TestCollective(gpu_size, 1, 1)
+def send_recv_test_ring_even_ranks(name, nnodes, gpus_per_node):
+    nranks = nnodes * gpus_per_node
+
+    if nranks < 2:
+        raise ValueError("This test requires at least 2 ranks")
+    if nranks % 2 != 0:
+        raise ValueError(
+            f"This odd/even ring schedule requires an even number of ranks, got {nranks}"
+        )
+
+    collective = TestCollective(nranks, 1, 1)
 
     with CollectiveProgram(
         name,
         collective,
-        gpu_size,
+        nranks,
         protocol="Simple",
         num_threads_per_block=1024,
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=1,   # ✅ correctness-first
+        instances=2,
     ):
-
-        # Ring grouping
-        group_size = split_mask + 1
-        num_groups = gpu_size // group_size
-
         next_channels = {}
         prev_channels = {}
-        prev_next_ids = {}
-
-        # ------------------------------------------------------------------
-        # Channel creation (parity-based for deterministic tag matching)
-        # ------------------------------------------------------------------
-        for node in range(nnodes):
-            for gpu in range(gpus_per_node):
-                rank = gpu + gpus_per_node * node
-
-                pos = rank & split_mask
-                group = rank // group_size
-
-                next_group = (group + 1) % num_groups
-                prev_group = (group - 1 + num_groups) % num_groups
-
-                next_rank = next_group * group_size + pos
-                prev_rank = prev_group * group_size + pos
-
-                # ✅ parity-based creation order
-                if (rank & 1) == 0:
-                    next_channels[rank] = PortChannel(next_rank, rank)
-                    prev_channels[rank] = PortChannel(prev_rank, rank)
-                else:
-                    prev_channels[rank] = PortChannel(prev_rank, rank)
-                    next_channels[rank] = PortChannel(next_rank, rank)
-
-                prev_next_ids[rank] = (prev_rank, next_rank)
-
-        # ------------------------------------------------------------------
-        # Ring send/recv (deadlock-free)
-        # ------------------------------------------------------------------
-        for node in range(nnodes):
-            for gpu in range(gpus_per_node):
-                rank = gpu + gpus_per_node * node
-                prev_rank, next_rank = prev_next_ids[rank]
-
-                ch_from_prev = prev_channels[rank]
-                ch_to_next = next_channels[rank]
-
-                src_rank = Rank(rank)
-                src_buf = src_rank.get_input_buffer()
-                src_chunk = src_buf[0:src_buf.size]
-
-                dst_rank = Rank(next_rank)
-                dst_buf = dst_rank.get_output_buffer()
-                dst_chunk = dst_buf[0:dst_buf.size]
-
-                if rank == 0:
-                    # ✅ starter sends first
-                    ch_to_next.put_with_signal_and_flush(
-                        dst_chunk,
-                        src_chunk,
-                        tb=0,
-                    )
-                    # then receive from prev
-                    ch_from_prev.wait(tb=0, data_sync=SyncType.after)
-                else:
-                    # ✅ everyone else receives first
-                    ch_from_prev.wait(tb=0, data_sync=SyncType.after)
-                    ch_to_next.put_with_signal_and_flush(
-                        dst_chunk,
-                        src_chunk,
-                        tb=0,
-                    )
 
+        # --------------------------------------------------------------
+        # Classic ring across all ranks:
+        #   prev = (rank - 1 + nranks) % nranks
+        #   next = (rank + 1) % nranks
+        # --------------------------------------------------------------
+        for rank in range(nranks):
+            prev_rank = (rank - 1 + nranks) % nranks
+            next_rank = (rank + 1) % nranks
+
+            # Deterministic channel creation order
+            if (rank & 1) == 0:
+                next_channels[rank] = PortChannel(next_rank, rank)
+                prev_channels[rank] = PortChannel(prev_rank, rank)
+            else:
+                prev_channels[rank] = PortChannel(prev_rank, rank)
+                next_channels[rank] = PortChannel(next_rank, rank)
+
+                # --------------------------------------------------------------
+        # --------------------------------------------------------------
+        # Ring send/recv with explicit ACK
+        #
+        # Data path:
+        #   sender:   put_with_signal() to next
+        #   receiver: wait() from prev
+        #
+        # ACK path:
+        #   receiver: signal() back to prev after data is available
+        #   sender:   wait() for ACK from next before proceeding
+        #
+        # Even ranks: send first, then recv, then ACK prev, then wait ACK
+        # Odd ranks : recv first, then ACK prev, then send, then wait ACK
+        # --------------------------------------------------------------
+        for rank in range(nranks):
+            prev_rank = (rank - 1 + nranks) % nranks
+            next_rank = (rank + 1) % nranks
+
+            src_rank = Rank(rank)
+            next_rank_obj = Rank(next_rank)
+
+            src_buf = src_rank.get_input_buffer()
+            next_out_buf = next_rank_obj.get_output_buffer()
+
+            src_chunk = src_buf[0:src_buf.size]
+            dst_chunk = next_out_buf[0:next_out_buf.size]
+
+            ch_to_next = next_channels[rank]
+            ch_from_prev = prev_channels[rank]
+
+            if (rank & 1) == 0:
+                # Send data to next and signal arrival
+                ch_to_next.put_with_signal(
+                    dst_chunk,
+                    src_chunk,
+                    tb=0,
+                )
+
+                # Wait for data from prev to become visible locally
+                ch_from_prev.wait(
+                    tb=0,
+                    data_sync=SyncType.after,
+                )
+
+                # Ack back to prev that this rank has observed/consumed input
+                ch_from_prev.signal(
+                    tb=0,
+                )
+
+                # Wait for next rank to ack our outgoing transfer
+                ch_to_next.wait(
+                    tb=0,
+                )
+
+            else:
+                # Wait for data from prev first
+                ch_from_prev.wait(
+                    tb=0,
+                    data_sync=SyncType.after,
+                )
+
+                # Ack back to prev that this rank has observed/consumed input
+                ch_from_prev.signal(
+                    tb=0,
+                )
+
+                # Then send data to next
+                ch_to_next.put_with_signal(
+                    dst_chunk,
+                    src_chunk,
+                    tb=0,
+                )
+
+                # Wait for next rank to ack our outgoing transfer
+                ch_to_next.wait(
+                    tb=0,
+                )
+        # --------------------------------------------------------------
+        # Ring send/recv
+        #
+        # Even ranks: send first, then wait
+        # Odd ranks : wait first, then send
+        #
+        # This is safe for an even-sized ring and avoids the
+        # single-rank-starter wave.
+        # --------------------------------------------------------------
+        '''
+        for rank in range(nranks):
+            prev_rank = (rank - 1 + nranks) % nranks
+            next_rank = (rank + 1) % nranks
+
+            src_rank = Rank(rank)
+            next_rank_obj = Rank(next_rank)
+
+            src_buf = src_rank.get_input_buffer()
+            next_out_buf = next_rank_obj.get_output_buffer()
+
+            src_chunk = src_buf[0:src_buf.size]
+            dst_chunk = next_out_buf[0:next_out_buf.size]
+
+            ch_to_next = next_channels[rank]
+            ch_from_prev = prev_channels[rank]
+
+            if (rank & 1) == 0:
+                ch_to_next.put_with_signal_and_flush(
+                    dst_chunk,
+                    src_chunk,
+                    tb=0,
+                )
+                ch_from_prev.wait(
+                    tb=0,
+                    data_sync=SyncType.after,
+                )
+            else:
+                ch_from_prev.wait(
+                    tb=0,
+                    data_sync=SyncType.after,
+                )
+                ch_to_next.put_with_signal_and_flush(
+                    dst_chunk,
+                    src_chunk,
+                    tb=0,
+                )
+
+        '''
         print(JSON())
 
 
@@ -103,21 +185,14 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
 # CLI
 # ----------------------------------------------------------------------
 parser = argparse.ArgumentParser()
-parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--name", type=str, required=True, help="name of the program")
 parser.add_argument("--nnodes", type=int, default=1, help="number of nodes")
-parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node")
-parser.add_argument(
-    "--split_mask",
-    type=lambda x: int(x, 0),
-    default=0x3,
-    help="split mask (e.g. 0x3)",
-)
+parser.add_argument("--gpus_per_node", type=int, required=True, help="number of GPUs per node")
 
 args = parser.parse_args()
 
-send_recv_test(
+send_recv_test_ring_even_ranks(
     args.name,
     args.nnodes,
     args.gpus_per_node,
-    args.split_mask,
 )

From 96defbd8a87a60aa0fc1eac68b70d7ec73a46208 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Fri, 10 Apr 2026 15:39:03 +0000
Subject: [PATCH 115/132] add executor for testing

---
 executor_test.py | 323 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 executor_test.py

diff --git a/executor_test.py b/executor_test.py
new file mode 100644
index 000000000..232f2f8bd
--- /dev/null
+++ b/executor_test.py
@@ -0,0 +1,323 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from mscclpp import (
+    DataType,
+    Executor,
+    ExecutionPlan,
+    PacketType,
+    npkit,
+    env,
+)
+from mscclpp import CommGroup, GpuBuffer
+from mscclpp.utils import KernelBuilder, pack
+import os
+import struct
+
+import cupy as cp
+from mpi4py import MPI
+
+
+def parse_dtype(dtype_str):
+    dtype_str = dtype_str.strip().lower()
+    if dtype_str == "float16":
+        return cp.float16
+    elif dtype_str == "float32":
+        return cp.float32
+    elif dtype_str == "int32":
+        return cp.int32
+    else:
+        raise ValueError(f"Unknown data type: {dtype_str}")
+
+
+def parse_size(size_str):
+    size_str = size_str.strip()
+    if not size_str:
+        raise ValueError("Size string can not be empty")
+    units = {"K": 1024, "M": 1024**2, "G": 1024**3}
+    if size_str[-1].upper() in units:
+        return int(size_str[:-1]) * units[size_str[-1].upper()]
+    else:
+        return int(size_str)
+
+
+def dtype_to_mscclpp_dtype(dtype):
+    if dtype == cp.float16:
+        return DataType.float16
+    elif dtype == cp.float32:
+        return DataType.float32
+    elif dtype == cp.int32:
+        return DataType.int32
+    else:
+        raise ValueError(f"Unknown data type: {dtype}")
+
+
+def bench_time(n_iters: int, n_graph_iters: int, func_iter):
+    """
+    Capture CUDA graph for n_iters launches. func_iter(stream, i) must vary slot by i.
+    """
+    stream = cp.cuda.Stream(non_blocking=True)
+    with stream:
+        stream.begin_capture()
+        for i in range(n_iters):
+            func_iter(stream, i)
+        graph = stream.end_capture()
+
+    # warmup
+    graph.launch(stream)
+
+    start = cp.cuda.Event()
+    end = cp.cuda.Event()
+
+    start.record(stream)
+    for _ in range(n_graph_iters):
+        graph.launch(stream)
+    end.record(stream)
+    end.synchronize()
+
+    # us per iteration
+    return cp.cuda.get_elapsed_time(start, end) / n_iters * 1000.0 / n_graph_iters
+
+
+def bench_correctness(
+    collective: str,
+    input_slot: cp.ndarray,
+    result_slot: cp.ndarray,
+    test_buf: cp.ndarray,
+    dtype_str: str,
+    rank: int,
+    num_ranks: int,
+    n_iters: int,
+    func_iter,
+):
+    """
+    Correctness check on ONE per-iteration slot view (input_slot/result_slot change per i via func_iter).
+    We pass the per-iteration element count to verifier kernels.
+    """
+    type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
+    nelems_per_iter = input_slot.nbytes // type_size
+
+    print("collective: ", collective)
+
+    fill_data_kernel_name = "fill_data_%s" % dtype_str
+    if "allgather" in collective:
+        coll = "all_gather"
+    elif "reducescatter" in collective:
+        coll = "reduce_scatter"
+    elif "allreduce" in collective:
+        coll = "all_reduce"
+    else:
+        coll = "sendrecv"
+    test_data_kernel_name = "test_data_%s_%s" % (coll, dtype_str)
+
+    file_dir = os.path.dirname(os.path.abspath(__file__))
+    fill_data_kernel = KernelBuilder(
+        file="executor_test_verifier.cu", kernel_name=fill_data_kernel_name, file_dir=file_dir
+    ).get_compiled_kernel()
+    test_data_kernel = KernelBuilder(
+        file="executor_test_verifier.cu", kernel_name=test_data_kernel_name, file_dir=file_dir
+    ).get_compiled_kernel()
+
+    nblocks = 64
+    nthreads = 1024
+
+    stream = cp.cuda.Stream(non_blocking=True)
+    with stream:
+        stream.begin_capture()
+        for i in range(n_iters):
+            # WARNING: input_slot/result_slot variables are placeholders; actual slot views are chosen inside func_iter.
+            # We only use these kernels with the CURRENT slot views computed below for this iteration.
+            func_iter(stream, i, do_verify=True, fill_kernel=fill_data_kernel, test_kernel=test_data_kernel,
+                      nblocks=nblocks, nthreads=nthreads, nelems_per_iter=nelems_per_iter,
+                      test_buf=test_buf, rank=rank, num_ranks=num_ranks)
+        graph = stream.end_capture()
+
+    graph.launch(stream)
+    stream.synchronize()
+
+
+def build_bufs_sendrecv_ring(size_bytes: int, slots: int, dtype: cp.dtype):
+    """
+    Build ring buffers for sendrecv:
+      - per-iteration message bytes = size_bytes
+      - total allocated bytes per buffer = slots * size_bytes
+    """
+    type_size = cp.dtype(dtype).itemsize
+    assert (size_bytes % type_size) == 0, "size not multiple of dtype size"
+
+    nelems_per_iter = size_bytes // type_size
+    total_nelems = nelems_per_iter * slots
+
+    input_buf = GpuBuffer(total_nelems, dtype=dtype)
+    result_buf = GpuBuffer(total_nelems, dtype=dtype)
+    test_buf = cp.zeros(nelems_per_iter, dtype=dtype)  # expected for one iteration
+
+    return input_buf, result_buf, test_buf, nelems_per_iter
+
+
+def main(
+    execution_plan_path: str,
+    size: int,                 # per-iteration bytes
+    in_place: bool = True,
+    dtype_str: str = "float16",
+    packet_type: PacketType = PacketType.LL16,
+    n_iters: int = 10,
+    n_graph_iters: int = 10,
+    slots: int = 4,            # ring buffer depth
+):
+    mscclpp_group = CommGroup(MPI.COMM_WORLD)
+    cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
+
+    executor = Executor(mscclpp_group.communicator)
+
+    npkit_dump_dir = env().npkit_dump_dir
+    if npkit_dump_dir != "":
+        npkit.init(mscclpp_group.my_rank)
+
+    execution_plan = ExecutionPlan(execution_plan_path, mscclpp_group.my_rank)
+    collective = execution_plan.collective
+
+    dtype = parse_dtype(dtype_str)
+
+    # We only change allocation/behavior for sendrecv
+    if "sendrecv" in collective.lower():
+        input_buf, result_buf, test_buf, nelems_per_iter = build_bufs_sendrecv_ring(size, slots, dtype)
+        type_size = cp.dtype(dtype).itemsize
+        bytes_per_iter = nelems_per_iter * type_size
+
+        def slot_view(buf, slot_idx):
+            start = slot_idx * nelems_per_iter
+            end = start + nelems_per_iter
+            return buf[start:end]
+
+        # Iteration-aware executor call (rotates slot each iteration)
+        def executor_func_iter(stream, i, do_verify=False, **vk):
+            slot = i % slots
+            in_slot = slot_view(input_buf, slot)
+            out_slot = slot_view(result_buf, slot)
+
+            if do_verify:
+                # Fill per-iteration input slot with unique (rank, i) pattern
+                fill_data_kernel = vk["fill_kernel"]
+                test_data_kernel = vk["test_kernel"]
+                nblocks = vk["nblocks"]
+                nthreads = vk["nthreads"]
+                nelems = vk["nelems_per_iter"]
+                test_buf_local = vk["test_buf"]
+                rank = vk["rank"]
+                num_ranks = vk["num_ranks"]
+
+                fill_params = pack(in_slot) + struct.pack("Q", nelems) + pack(rank, i)
+                fill_data_kernel.launch_kernel(fill_params, nblocks, nthreads, 0, stream)
+
+            # Execute exactly one per-iteration message: bytes_per_iter == user --size
+            executor.execute(
+                mscclpp_group.my_rank,
+                in_slot.data.ptr,
+                out_slot.data.ptr,
+                in_slot.nbytes,
+                out_slot.nbytes,
+                dtype_to_mscclpp_dtype(dtype),
+                execution_plan,
+                stream.ptr,
+                packet_type,
+            )
+
+            if do_verify:
+                # Validate the output slot for this iteration i
+                test_params = (
+                    pack(out_slot, test_buf_local)
+                    + struct.pack("Q", nelems)
+                    + pack(num_ranks, rank, i)
+                )
+                test_data_kernel.launch_kernel(test_params, nblocks, nthreads, 0, stream)
+
+        # One-shot sentinel check (slot 0)
+        mscclpp_group.barrier()
+        print("per-iter size= ", bytes_per_iter, "bytes, slots=", slots, "total buffer bytes=", input_buf.nbytes)
+
+        # Fill whole result with sentinel then run ONE iter (i=0)
+        result_buf.fill(cp.asarray(123.0, dtype=dtype))
+        cp.cuda.runtime.deviceSynchronize()
+
+        stream = cp.cuda.Stream(non_blocking=True)
+        with stream:
+            executor_func_iter(stream, 0)
+        stream.synchronize()
+
+        # Count changes only in slot 0 region
+        out0 = slot_view(result_buf, 0)
+        changed = cp.count_nonzero(out0 != cp.asarray(123.0, dtype=dtype)).item()
+        print("changed elements in slot0:", changed, "out of", out0.size)
+
+        cp.cuda.runtime.deviceSynchronize()
+        mscclpp_group.barrier()
+
+        # Correctness: fills + executes + tests with unique i and rotating slots
+        bench_correctness(
+            collective,
+            slot_view(input_buf, 0),   # placeholder; real slot chosen per i
+            slot_view(result_buf, 0),  # placeholder; real slot chosen per i
+            test_buf,
+            dtype_str,
+            mscclpp_group.my_rank,
+            mscclpp_group.nranks,
+            n_iters,
+            executor_func_iter,
+        )
+
+        mscclpp_group.barrier()
+
+        # Timing (CUDA graph captures n_iters launches with varying slot pointers)
+        execution_time = bench_time(n_iters, n_graph_iters, executor_func_iter)
+
+        if npkit_dump_dir is not None:
+            npkit.dump(npkit_dump_dir)
+            npkit.shutdown()
+
+        print(
+            f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
+            f"per-iter data size: {bytes_per_iter} bytes dtype: {dtype().dtype.name} "
+            f"bandwidth: {bytes_per_iter / (execution_time * 1e-6) / (1024**3):.2f} GB/s, "
+            f"packet type: {packet_type}, slots: {slots}"
+        )
+
+    else:
+        raise RuntimeError(
+            f"This rewritten executor_test.py currently specializes sendrecv. "
+            f"Plan collective was: {collective}"
+        )
+
+    executor = None
+    mscclpp_group = None
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
+    parser.add_argument("--size", type=str, required=True, help="PER-ITERATION bytes (e.g., 1K, 4M, 1G)")
+    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
+    parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
+    parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
+    parser.add_argument("--n_iters", type=int, default=10)
+    parser.add_argument("--n_graph_iters", type=int, default=10)
+    parser.add_argument("--slots", type=int, default=4, help="ring buffer depth; rotates slot = iter % slots")
+    args = parser.parse_args()
+
+    packet_type = PacketType.LL16
+    if args.packet_type == "LL8":
+        packet_type = PacketType.LL8
+
+    per_iter_size = parse_size(args.size)
+
+    main(
+        args.execution_plan_path,
+        per_iter_size,
+        args.in_place,
+        args.dtype,
+        packet_type,
+        args.n_iters,
+        args.n_graph_iters,
+        args.slots,
+    )

From 68690ecdcd5c8e5a9184463b54434157c4efc8dc Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Fri, 10 Apr 2026 17:21:50 +0000
Subject: [PATCH 116/132] revert dsl

---
 executor_test.py                              | 323 ------------------
 .../default_algos/mscclpp_send_recv.py        | 228 ++++---------
 2 files changed, 60 insertions(+), 491 deletions(-)
 delete mode 100644 executor_test.py

diff --git a/executor_test.py b/executor_test.py
deleted file mode 100644
index 232f2f8bd..000000000
--- a/executor_test.py
+++ /dev/null
@@ -1,323 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import argparse
-from mscclpp import (
-    DataType,
-    Executor,
-    ExecutionPlan,
-    PacketType,
-    npkit,
-    env,
-)
-from mscclpp import CommGroup, GpuBuffer
-from mscclpp.utils import KernelBuilder, pack
-import os
-import struct
-
-import cupy as cp
-from mpi4py import MPI
-
-
-def parse_dtype(dtype_str):
-    dtype_str = dtype_str.strip().lower()
-    if dtype_str == "float16":
-        return cp.float16
-    elif dtype_str == "float32":
-        return cp.float32
-    elif dtype_str == "int32":
-        return cp.int32
-    else:
-        raise ValueError(f"Unknown data type: {dtype_str}")
-
-
-def parse_size(size_str):
-    size_str = size_str.strip()
-    if not size_str:
-        raise ValueError("Size string can not be empty")
-    units = {"K": 1024, "M": 1024**2, "G": 1024**3}
-    if size_str[-1].upper() in units:
-        return int(size_str[:-1]) * units[size_str[-1].upper()]
-    else:
-        return int(size_str)
-
-
-def dtype_to_mscclpp_dtype(dtype):
-    if dtype == cp.float16:
-        return DataType.float16
-    elif dtype == cp.float32:
-        return DataType.float32
-    elif dtype == cp.int32:
-        return DataType.int32
-    else:
-        raise ValueError(f"Unknown data type: {dtype}")
-
-
-def bench_time(n_iters: int, n_graph_iters: int, func_iter):
-    """
-    Capture CUDA graph for n_iters launches. func_iter(stream, i) must vary slot by i.
-    """
-    stream = cp.cuda.Stream(non_blocking=True)
-    with stream:
-        stream.begin_capture()
-        for i in range(n_iters):
-            func_iter(stream, i)
-        graph = stream.end_capture()
-
-    # warmup
-    graph.launch(stream)
-
-    start = cp.cuda.Event()
-    end = cp.cuda.Event()
-
-    start.record(stream)
-    for _ in range(n_graph_iters):
-        graph.launch(stream)
-    end.record(stream)
-    end.synchronize()
-
-    # us per iteration
-    return cp.cuda.get_elapsed_time(start, end) / n_iters * 1000.0 / n_graph_iters
-
-
-def bench_correctness(
-    collective: str,
-    input_slot: cp.ndarray,
-    result_slot: cp.ndarray,
-    test_buf: cp.ndarray,
-    dtype_str: str,
-    rank: int,
-    num_ranks: int,
-    n_iters: int,
-    func_iter,
-):
-    """
-    Correctness check on ONE per-iteration slot view (input_slot/result_slot change per i via func_iter).
-    We pass the per-iteration element count to verifier kernels.
-    """
-    type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
-    nelems_per_iter = input_slot.nbytes // type_size
-
-    print("collective: ", collective)
-
-    fill_data_kernel_name = "fill_data_%s" % dtype_str
-    if "allgather" in collective:
-        coll = "all_gather"
-    elif "reducescatter" in collective:
-        coll = "reduce_scatter"
-    elif "allreduce" in collective:
-        coll = "all_reduce"
-    else:
-        coll = "sendrecv"
-    test_data_kernel_name = "test_data_%s_%s" % (coll, dtype_str)
-
-    file_dir = os.path.dirname(os.path.abspath(__file__))
-    fill_data_kernel = KernelBuilder(
-        file="executor_test_verifier.cu", kernel_name=fill_data_kernel_name, file_dir=file_dir
-    ).get_compiled_kernel()
-    test_data_kernel = KernelBuilder(
-        file="executor_test_verifier.cu", kernel_name=test_data_kernel_name, file_dir=file_dir
-    ).get_compiled_kernel()
-
-    nblocks = 64
-    nthreads = 1024
-
-    stream = cp.cuda.Stream(non_blocking=True)
-    with stream:
-        stream.begin_capture()
-        for i in range(n_iters):
-            # WARNING: input_slot/result_slot variables are placeholders; actual slot views are chosen inside func_iter.
-            # We only use these kernels with the CURRENT slot views computed below for this iteration.
-            func_iter(stream, i, do_verify=True, fill_kernel=fill_data_kernel, test_kernel=test_data_kernel,
-                      nblocks=nblocks, nthreads=nthreads, nelems_per_iter=nelems_per_iter,
-                      test_buf=test_buf, rank=rank, num_ranks=num_ranks)
-        graph = stream.end_capture()
-
-    graph.launch(stream)
-    stream.synchronize()
-
-
-def build_bufs_sendrecv_ring(size_bytes: int, slots: int, dtype: cp.dtype):
-    """
-    Build ring buffers for sendrecv:
-      - per-iteration message bytes = size_bytes
-      - total allocated bytes per buffer = slots * size_bytes
-    """
-    type_size = cp.dtype(dtype).itemsize
-    assert (size_bytes % type_size) == 0, "size not multiple of dtype size"
-
-    nelems_per_iter = size_bytes // type_size
-    total_nelems = nelems_per_iter * slots
-
-    input_buf = GpuBuffer(total_nelems, dtype=dtype)
-    result_buf = GpuBuffer(total_nelems, dtype=dtype)
-    test_buf = cp.zeros(nelems_per_iter, dtype=dtype)  # expected for one iteration
-
-    return input_buf, result_buf, test_buf, nelems_per_iter
-
-
-def main(
-    execution_plan_path: str,
-    size: int,                 # per-iteration bytes
-    in_place: bool = True,
-    dtype_str: str = "float16",
-    packet_type: PacketType = PacketType.LL16,
-    n_iters: int = 10,
-    n_graph_iters: int = 10,
-    slots: int = 4,            # ring buffer depth
-):
-    mscclpp_group = CommGroup(MPI.COMM_WORLD)
-    cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
-
-    executor = Executor(mscclpp_group.communicator)
-
-    npkit_dump_dir = env().npkit_dump_dir
-    if npkit_dump_dir != "":
-        npkit.init(mscclpp_group.my_rank)
-
-    execution_plan = ExecutionPlan(execution_plan_path, mscclpp_group.my_rank)
-    collective = execution_plan.collective
-
-    dtype = parse_dtype(dtype_str)
-
-    # We only change allocation/behavior for sendrecv
-    if "sendrecv" in collective.lower():
-        input_buf, result_buf, test_buf, nelems_per_iter = build_bufs_sendrecv_ring(size, slots, dtype)
-        type_size = cp.dtype(dtype).itemsize
-        bytes_per_iter = nelems_per_iter * type_size
-
-        def slot_view(buf, slot_idx):
-            start = slot_idx * nelems_per_iter
-            end = start + nelems_per_iter
-            return buf[start:end]
-
-        # Iteration-aware executor call (rotates slot each iteration)
-        def executor_func_iter(stream, i, do_verify=False, **vk):
-            slot = i % slots
-            in_slot = slot_view(input_buf, slot)
-            out_slot = slot_view(result_buf, slot)
-
-            if do_verify:
-                # Fill per-iteration input slot with unique (rank, i) pattern
-                fill_data_kernel = vk["fill_kernel"]
-                test_data_kernel = vk["test_kernel"]
-                nblocks = vk["nblocks"]
-                nthreads = vk["nthreads"]
-                nelems = vk["nelems_per_iter"]
-                test_buf_local = vk["test_buf"]
-                rank = vk["rank"]
-                num_ranks = vk["num_ranks"]
-
-                fill_params = pack(in_slot) + struct.pack("Q", nelems) + pack(rank, i)
-                fill_data_kernel.launch_kernel(fill_params, nblocks, nthreads, 0, stream)
-
-            # Execute exactly one per-iteration message: bytes_per_iter == user --size
-            executor.execute(
-                mscclpp_group.my_rank,
-                in_slot.data.ptr,
-                out_slot.data.ptr,
-                in_slot.nbytes,
-                out_slot.nbytes,
-                dtype_to_mscclpp_dtype(dtype),
-                execution_plan,
-                stream.ptr,
-                packet_type,
-            )
-
-            if do_verify:
-                # Validate the output slot for this iteration i
-                test_params = (
-                    pack(out_slot, test_buf_local)
-                    + struct.pack("Q", nelems)
-                    + pack(num_ranks, rank, i)
-                )
-                test_data_kernel.launch_kernel(test_params, nblocks, nthreads, 0, stream)
-
-        # One-shot sentinel check (slot 0)
-        mscclpp_group.barrier()
-        print("per-iter size= ", bytes_per_iter, "bytes, slots=", slots, "total buffer bytes=", input_buf.nbytes)
-
-        # Fill whole result with sentinel then run ONE iter (i=0)
-        result_buf.fill(cp.asarray(123.0, dtype=dtype))
-        cp.cuda.runtime.deviceSynchronize()
-
-        stream = cp.cuda.Stream(non_blocking=True)
-        with stream:
-            executor_func_iter(stream, 0)
-        stream.synchronize()
-
-        # Count changes only in slot 0 region
-        out0 = slot_view(result_buf, 0)
-        changed = cp.count_nonzero(out0 != cp.asarray(123.0, dtype=dtype)).item()
-        print("changed elements in slot0:", changed, "out of", out0.size)
-
-        cp.cuda.runtime.deviceSynchronize()
-        mscclpp_group.barrier()
-
-        # Correctness: fills + executes + tests with unique i and rotating slots
-        bench_correctness(
-            collective,
-            slot_view(input_buf, 0),   # placeholder; real slot chosen per i
-            slot_view(result_buf, 0),  # placeholder; real slot chosen per i
-            test_buf,
-            dtype_str,
-            mscclpp_group.my_rank,
-            mscclpp_group.nranks,
-            n_iters,
-            executor_func_iter,
-        )
-
-        mscclpp_group.barrier()
-
-        # Timing (CUDA graph captures n_iters launches with varying slot pointers)
-        execution_time = bench_time(n_iters, n_graph_iters, executor_func_iter)
-
-        if npkit_dump_dir is not None:
-            npkit.dump(npkit_dump_dir)
-            npkit.shutdown()
-
-        print(
-            f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
-            f"per-iter data size: {bytes_per_iter} bytes dtype: {dtype().dtype.name} "
-            f"bandwidth: {bytes_per_iter / (execution_time * 1e-6) / (1024**3):.2f} GB/s, "
-            f"packet type: {packet_type}, slots: {slots}"
-        )
-
-    else:
-        raise RuntimeError(
-            f"This rewritten executor_test.py currently specializes sendrecv. "
-            f"Plan collective was: {collective}"
-        )
-
-    executor = None
-    mscclpp_group = None
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
-    parser.add_argument("--size", type=str, required=True, help="PER-ITERATION bytes (e.g., 1K, 4M, 1G)")
-    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
-    parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
-    parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
-    parser.add_argument("--n_iters", type=int, default=10)
-    parser.add_argument("--n_graph_iters", type=int, default=10)
-    parser.add_argument("--slots", type=int, default=4, help="ring buffer depth; rotates slot = iter % slots")
-    args = parser.parse_args()
-
-    packet_type = PacketType.LL16
-    if args.packet_type == "LL8":
-        packet_type = PacketType.LL8
-
-    per_iter_size = parse_size(args.size)
-
-    main(
-        args.execution_plan_path,
-        per_iter_size,
-        args.in_place,
-        args.dtype,
-        packet_type,
-        args.n_iters,
-        args.n_graph_iters,
-        args.slots,
-    )
diff --git a/python/mscclpp/default_algos/mscclpp_send_recv.py b/python/mscclpp/default_algos/mscclpp_send_recv.py
index d4ce00042..caa0575d1 100644
--- a/python/mscclpp/default_algos/mscclpp_send_recv.py
+++ b/python/mscclpp/default_algos/mscclpp_send_recv.py
@@ -9,190 +9,82 @@
 from mscclpp.language.collectives import *
 
 
-def send_recv_test_ring_even_ranks(name, nnodes, gpus_per_node):
-    nranks = nnodes * gpus_per_node
-
-    if nranks < 2:
-        raise ValueError("This test requires at least 2 ranks")
-    if nranks % 2 != 0:
-        raise ValueError(
-            f"This odd/even ring schedule requires an even number of ranks, got {nranks}"
-        )
-
-    collective = TestCollective(nranks, 1, 1)
-
+def send_recv_test(name, nnodes, gpus_per_node, split_mask):
+    gpu_size = nnodes * gpus_per_node
+    collective = TestCollective(gpu_size, 1, 1)
     with CollectiveProgram(
         name,
         collective,
-        nranks,
+        gpu_size,
         protocol="Simple",
         num_threads_per_block=1024,
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=2,
+        instances=4
     ):
-        next_channels = {}
-        prev_channels = {}
-
-        # --------------------------------------------------------------
-        # Classic ring across all ranks:
-        #   prev = (rank - 1 + nranks) % nranks
-        #   next = (rank + 1) % nranks
-        # --------------------------------------------------------------
-        for rank in range(nranks):
-            prev_rank = (rank - 1 + nranks) % nranks
-            next_rank = (rank + 1) % nranks
-
-            # Deterministic channel creation order
-            if (rank & 1) == 0:
-                next_channels[rank] = PortChannel(next_rank, rank)
-                prev_channels[rank] = PortChannel(prev_rank, rank)
-            else:
-                prev_channels[rank] = PortChannel(prev_rank, rank)
-                next_channels[rank] = PortChannel(next_rank, rank)
-
-                # --------------------------------------------------------------
-        # --------------------------------------------------------------
-        # Ring send/recv with explicit ACK
-        #
-        # Data path:
-        #   sender:   put_with_signal() to next
-        #   receiver: wait() from prev
-        #
-        # ACK path:
-        #   receiver: signal() back to prev after data is available
-        #   sender:   wait() for ACK from next before proceeding
-        #
-        # Even ranks: send first, then recv, then ACK prev, then wait ACK
-        # Odd ranks : recv first, then ACK prev, then send, then wait ACK
-        # --------------------------------------------------------------
-        for rank in range(nranks):
-            prev_rank = (rank - 1 + nranks) % nranks
-            next_rank = (rank + 1) % nranks
-
-            src_rank = Rank(rank)
-            next_rank_obj = Rank(next_rank)
-
-            src_buf = src_rank.get_input_buffer()
-            next_out_buf = next_rank_obj.get_output_buffer()
-
-            src_chunk = src_buf[0:src_buf.size]
-            dst_chunk = next_out_buf[0:next_out_buf.size]
-
-            ch_to_next = next_channels[rank]
-            ch_from_prev = prev_channels[rank]
-
-            if (rank & 1) == 0:
-                # Send data to next and signal arrival
-                ch_to_next.put_with_signal(
-                    dst_chunk,
-                    src_chunk,
-                    tb=0,
-                )
-
-                # Wait for data from prev to become visible locally
-                ch_from_prev.wait(
-                    tb=0,
-                    data_sync=SyncType.after,
-                )
+        # Creating separate port channels for next and prev directions.
+        # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer
+        # and get distinct tags. To ensure cross-rank tag matching (rank A's prev_channel signal
+        # arrives at rank B's next_channel wait), we create channels in opposite order for the
+        # "higher" rank so that tags cross-match:
+        #   Lower rank:  [next(tag0), prev(tag1)]
+        #   Higher rank:  [prev(tag0), next(tag1)]
+        # Then lower.prev(tag1) == higher.next(tag1) ✓ and higher.prev(tag0) == lower.next(tag0) ✓
+        # When prev != next (3+ nodes), each channel targets a different peer so each gets tag 0
+        # and this ordering doesn't matter.
+        group_size = split_mask + 1
+        num_groups = gpu_size // group_size
+        next_channels = {}  # channel for sending to next rank
+        prev_channels = {}  # channel for receiving from prev rank
+        prev_next_ids = {}
+        for node in range(nnodes):
+            for gpu in range(gpus_per_node):
+                global_rank_id = gpu + gpus_per_node * node
+                position_in_group = global_rank_id & split_mask
+                group_id = global_rank_id // group_size
+                next_group_id = (group_id + 1) % num_groups
+                next_global_rank_id = next_group_id * group_size + position_in_group
+                prev_group_id = (group_id - 1 + num_groups) % num_groups
+                prev_global_rank_id = prev_group_id * group_size + position_in_group
+                if prev_global_rank_id == next_global_rank_id and global_rank_id > prev_global_rank_id:
+                    # Higher rank: create prev first, then next (swapped order)
+                    prev_channels[global_rank_id] = PortChannel(prev_global_rank_id, global_rank_id)
+                    next_channels[global_rank_id] = PortChannel(next_global_rank_id, global_rank_id)
+                else:
+                    # Lower rank or different peers: create next first, then prev
+                    next_channels[global_rank_id] = PortChannel(next_global_rank_id, global_rank_id)
+                    prev_channels[global_rank_id] = PortChannel(prev_global_rank_id, global_rank_id)
+                prev_next_ids[global_rank_id] = (prev_global_rank_id, next_global_rank_id)
+
+        # sync with the next rank and the previous rank in the group
+        for node in range(nnodes):
+            for gpu in range(gpus_per_node):
+                global_rank_id = gpu + gpus_per_node * node
+                prev_global_rank_id, next_global_rank_id = prev_next_ids[global_rank_id]
+                prev_channels[global_rank_id].signal(tb=0, data_sync=SyncType.none)
+                next_channels[global_rank_id].wait(tb=0, data_sync=SyncType.after)
+
+                src_rank = Rank(global_rank_id)
+                src_buffer = src_rank.get_input_buffer()
+                dst_rank = Rank(next_global_rank_id)
+                dst_buffer = dst_rank.get_output_buffer()
+
+                next_channels[global_rank_id].put_with_signal(dst_buffer[:], src_buffer[:], tb=0)
+                prev_channels[global_rank_id].wait(tb=0, data_sync=SyncType.none)
 
-                # Ack back to prev that this rank has observed/consumed input
-                ch_from_prev.signal(
-                    tb=0,
-                )
-
-                # Wait for next rank to ack our outgoing transfer
-                ch_to_next.wait(
-                    tb=0,
-                )
-
-            else:
-                # Wait for data from prev first
-                ch_from_prev.wait(
-                    tb=0,
-                    data_sync=SyncType.after,
-                )
-
-                # Ack back to prev that this rank has observed/consumed input
-                ch_from_prev.signal(
-                    tb=0,
-                )
-
-                # Then send data to next
-                ch_to_next.put_with_signal(
-                    dst_chunk,
-                    src_chunk,
-                    tb=0,
-                )
-
-                # Wait for next rank to ack our outgoing transfer
-                ch_to_next.wait(
-                    tb=0,
-                )
-        # --------------------------------------------------------------
-        # Ring send/recv
-        #
-        # Even ranks: send first, then wait
-        # Odd ranks : wait first, then send
-        #
-        # This is safe for an even-sized ring and avoids the
-        # single-rank-starter wave.
-        # --------------------------------------------------------------
-        '''
-        for rank in range(nranks):
-            prev_rank = (rank - 1 + nranks) % nranks
-            next_rank = (rank + 1) % nranks
-
-            src_rank = Rank(rank)
-            next_rank_obj = Rank(next_rank)
-
-            src_buf = src_rank.get_input_buffer()
-            next_out_buf = next_rank_obj.get_output_buffer()
-
-            src_chunk = src_buf[0:src_buf.size]
-            dst_chunk = next_out_buf[0:next_out_buf.size]
-
-            ch_to_next = next_channels[rank]
-            ch_from_prev = prev_channels[rank]
-
-            if (rank & 1) == 0:
-                ch_to_next.put_with_signal_and_flush(
-                    dst_chunk,
-                    src_chunk,
-                    tb=0,
-                )
-                ch_from_prev.wait(
-                    tb=0,
-                    data_sync=SyncType.after,
-                )
-            else:
-                ch_from_prev.wait(
-                    tb=0,
-                    data_sync=SyncType.after,
-                )
-                ch_to_next.put_with_signal_and_flush(
-                    dst_chunk,
-                    src_chunk,
-                    tb=0,
-                )
-
-        '''
         print(JSON())
 
 
-# ----------------------------------------------------------------------
-# CLI
-# ----------------------------------------------------------------------
 parser = argparse.ArgumentParser()
-parser.add_argument("--name", type=str, required=True, help="name of the program")
+
+parser.add_argument("--name", type=str, help="name of the program")
 parser.add_argument("--nnodes", type=int, default=1, help="number of nodes")
-parser.add_argument("--gpus_per_node", type=int, required=True, help="number of GPUs per node")
+parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node")
+parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x3, help="split mask (e.g. 0x3)")
 
 args = parser.parse_args()
 
-send_recv_test_ring_even_ranks(
-    args.name,
-    args.nnodes,
-    args.gpus_per_node,
+send_recv_test(
+    args.name, args.nnodes, args.gpus_per_node, args.split_mask
 )

From f83a5571b8611e34630b6f637c85d3e6588b2799 Mon Sep 17 00:00:00 2001
From: Ubuntu <Binyang Li>
Date: Sat, 11 Apr 2026 04:47:33 +0000
Subject: [PATCH 117/132] Add sendrecv support with double-buffer to
 executor_test

- Add TEST_DATA_SEND_RECV verifier kernel that replays fill_data PRNG
  with peer_rank seed to validate received data
- Add double-buffer support for sendrecv in executor_test.py:
  allocate 2 input/result/test buffers, alternate per iteration
- Create two executor funcs for sendrecv, one per buffer pair
- Update bench_correctness and bench_time to handle double-buffer
- Add bandwidth reporting to output

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 python/test/executor_test.py          | 123 ++++++++++------
 python/test/executor_test_verifier.cu | 201 ++------------------------
 2 files changed, 90 insertions(+), 234 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 1175d6298..be6e5834c 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -14,6 +14,7 @@
 from mscclpp.utils import KernelBuilder, pack
 import os
 import struct
+from typing import Callable, Union
 
 import cupy as cp
 from mpi4py import MPI
@@ -32,13 +33,16 @@ def parse_dtype(dtype_str):
         raise ValueError(f"Unknown data type: {dtype_str}")
 
 
-def bench_time(n_iters: int, n_graph_iters: int, func):
-    # capture cuda graph for n_iters of the kernel launch
+def bench_time(n_iters: int, n_graph_iters: int, func: Union[Callable, list[Callable]]):
+    """Benchmark execution time. func can be a single callable or a list of 2 for double-buffer."""
     stream = cp.cuda.Stream(non_blocking=True)
     with stream:
         stream.begin_capture()
         for i in range(n_iters):
-            func(stream)
+            if isinstance(func, list):
+                func[i % 2](stream)
+            else:
+                func(stream)
         graph = stream.end_capture()
 
     # now run a warm up round
@@ -59,18 +63,19 @@ def bench_time(n_iters: int, n_graph_iters: int, func):
 
 def bench_correctness(
     collective: str,
-    input_buf: cp.ndarray,
-    result_buf: cp.ndarray,
-    test_buf: cp.ndarray,
+    input_buf: Union[cp.ndarray, list[cp.ndarray]],
+    result_buf: Union[cp.ndarray, list[cp.ndarray]],
+    test_buf: Union[cp.ndarray, list[cp.ndarray]],
     dtype_str: str,
     rank: int,
     num_ranks: int,
     n_iters: int,
-    func,
+    func: Union[Callable, list[Callable]],
 ):
+    """Validate correctness. For sendrecv, buffers and func are lists of 2 for double-buffer."""
     type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
+    double_buf = isinstance(input_buf, list)
 
-    print("collective: ", collective)
     fill_data_kernel_name = "fill_data_%s" % dtype_str
     if "allgather" in collective:
         coll = "all_gather"
@@ -78,8 +83,10 @@ def bench_correctness(
         coll = "reduce_scatter"
     elif "allreduce" in collective:
         coll = "all_reduce"
+    elif "sendrecv" in collective:
+        coll = "send_recv"
     else:
-        coll = "sendrecv"
+        raise ValueError(f"Unknown collective: {collective}")
     test_data_kernel_name = "test_data_%s_%s" % (coll, dtype_str)
 
     file_dir = os.path.dirname(os.path.abspath(__file__))
@@ -96,11 +103,25 @@ def bench_correctness(
     with stream:
         stream.begin_capture()
         for i in range(n_iters):
-            fill_data_params = pack(input_buf) + struct.pack("Q", input_buf.nbytes // type_size) + pack(rank, i)
+            if double_buf:
+                idx = i % 2
+                cur_input = input_buf[idx]
+                cur_result = result_buf[idx]
+                cur_test = test_buf[idx]
+                cur_func = func[idx]
+            else:
+                cur_input = input_buf
+                cur_result = result_buf
+                cur_test = test_buf
+                cur_func = func
+
+            fill_data_params = pack(cur_input) + struct.pack("Q", cur_input.nbytes // type_size) + pack(rank, i)
             fill_data_kernel.launch_kernel(fill_data_params, nblocks, nthreads, 0, stream)
-            func(stream)
+            cur_func(stream)
             test_data_params = (
-                pack(result_buf, test_buf) + struct.pack("Q", input_buf.nbytes // type_size) + pack(num_ranks, rank, i)
+                pack(cur_result, cur_test)
+                + struct.pack("Q", cur_input.nbytes // type_size)
+                + pack(num_ranks, rank, i)
             )
             test_data_kernel.launch_kernel(test_data_params, nblocks, nthreads, 0, stream)
         graph = stream.end_capture()
@@ -143,6 +164,13 @@ def build_bufs(
     assert (size % type_size) == 0, "size %d not multiple of type size %d" % (size, type_size)
     nelems = size // type_size
 
+    # Sendrecv uses double buffering: return lists of 2 buffers
+    if "sendrecv" in collective:
+        input_bufs = [GpuBuffer(nelems, dtype=dtype) for _ in range(2)]
+        result_bufs = [GpuBuffer(nelems, dtype=dtype) for _ in range(2)]
+        test_bufs = [cp.zeros(nelems, dtype=dtype) for _ in range(2)]
+        return input_bufs, result_bufs, test_bufs, nelems
+
     if "allgather" in collective:
         assert (nelems % num_ranks) == 0, "nelems %d not multiple of num_ranks %d" % (nelems, num_ranks)
         nelems_input = nelems if in_place else nelems // num_ranks
@@ -167,8 +195,6 @@ def build_bufs(
     else:
         input_buf = GpuBuffer(nelems_input, dtype=dtype)
 
-    in_place = False
-
     test_buf = cp.zeros(nelems, dtype=dtype)
 
     return input_buf, result_buf, test_buf, nelems
@@ -202,37 +228,38 @@ def main(
         mscclpp_group.nranks,
     )
 
-    executor_func = lambda stream: executor.execute(
-        mscclpp_group.my_rank,
-        input_buf.data.ptr,
-        result_buf.data.ptr,
-        input_buf.nbytes,
-        result_buf.nbytes,
-        dtype_to_mscclpp_dtype(dtype),
-        execution_plan,
-        stream.ptr,
-        packet_type,
-    )
-
-    mscclpp_group.barrier()
-    print("size= ", size, "nelem= ", nelem)
-
-    # Sentinel fill: choose something unlikely in your pattern
-    result_buf.fill(cp.float16(123.0))
-    cp.cuda.runtime.deviceSynchronize()
-
-    # Run ONE execution (no graph), then sync
-    stream = cp.cuda.Stream(non_blocking=True)
-    with stream:
-        executor_func(stream)
-    stream.synchronize()
+    sendrecv_mode = "sendrecv" in collective
+
+    if sendrecv_mode:
+        # Double-buffer: create two executor funcs, one per buffer pair
+        executor_funcs = []
+        for idx in range(2):
+            func = lambda stream, i=idx: executor.execute(
+                mscclpp_group.my_rank,
+                input_buf[i].data.ptr,
+                result_buf[i].data.ptr,
+                input_buf[i].nbytes,
+                result_buf[i].nbytes,
+                dtype_to_mscclpp_dtype(dtype),
+                execution_plan,
+                stream.ptr,
+                packet_type,
+            )
+            executor_funcs.append(func)
+    else:
+        executor_func = lambda stream: executor.execute(
+            mscclpp_group.my_rank,
+            input_buf.data.ptr,
+            result_buf.data.ptr,
+            input_buf.nbytes,
+            result_buf.nbytes,
+            dtype_to_mscclpp_dtype(dtype),
+            execution_plan,
+            stream.ptr,
+            packet_type,
+        )
 
-    # Count how many elements changed
-    changed = cp.count_nonzero(result_buf != cp.float16(123.0)).item()
-    print("changed elements:", changed, "out of", result_buf.size)
-    cp.cuda.runtime.deviceSynchronize()
     mscclpp_group.barrier()
-
     bench_correctness(
         collective,
         input_buf,
@@ -242,18 +269,20 @@ def main(
         mscclpp_group.my_rank,
         mscclpp_group.nranks,
         n_iters,
-        executor_func,
+        executor_funcs if sendrecv_mode else executor_func,
     )
 
     mscclpp_group.barrier()
-    execution_time = bench_time(n_iters, n_graph_iters, executor_func)
+    execution_time = bench_time(n_iters, n_graph_iters, executor_funcs if sendrecv_mode else executor_func)
     if npkit_dump_dir is not None:
         npkit.dump(npkit_dump_dir)
         npkit.shutdown()
+
+    result_nbytes = result_buf[0].nbytes if sendrecv_mode else result_buf.nbytes
     print(
         f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
-        f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} "
-        f"bandwidth: {result_buf.nbytes / (execution_time * 1e-6) / (1024**3):.2f} GB/s, "
+        f"data size: {result_nbytes} bytes data type: {dtype().dtype.name} "
+        f"bandwidth: {result_nbytes / (execution_time * 1e-6) / (1024**3):.2f} GB/s, "
         f"packet type: {packet_type}"
     )
     executor = None
diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu
index 5c96a9229..b70aee4a6 100644
--- a/python/test/executor_test_verifier.cu
+++ b/python/test/executor_test_verifier.cu
@@ -122,193 +122,20 @@ TEST_DATA_ALL_TO_ALL(float16, __half)
 TEST_DATA_ALL_TO_ALL(float32, float)
 TEST_DATA_ALL_TO_ALL(int32, int)
 
-/*#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                                          \
-  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType(                  \
-      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {    \
-                                                                                                             \
-    /* Ring semantics: receive from prev rank */                                                             \
-/*    int peer_rank = (my_rank - 1 + num_ranks) % num_ranks;                                                   \
-                                                                                                             \
-    unsigned int seed =                                                                                      \
-        (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + peer_rank + seq);                             \
-                                                                                                             \
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;                                                   \
-         i < num_elems;                                                                                      \
-         i += blockDim.x * gridDim.x) {                                                                      \
-      seed = ranqd1(seed);                                                                                   \
-      test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                                      \
-                                                                                                             \
-      /* Optional: print first few mismatches */                                                             \
-/*      if (result_buf[i] != test_buf[i] && blockIdx.x == 0 && threadIdx.x == 0 && i < 8) {                    \
-        printf("MISMATCH rank=%d peer=%d i=%zu result=%f expected=%f\n",                                     \
-               my_rank, peer_rank, i, (float)result_buf[i], (float)test_buf[i]);                             \
-      }                                                                                                      \
-                                                                                                             \
-      assert(result_buf[i] == test_buf[i]);                                                                  \
-    }                                                                                                        \
-  }*/
-
-
-/*#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                                        \
-  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType(                \
-      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {  \
-                                                                                                           \
-    int prev_rank = (my_rank - 1 + num_ranks) % num_ranks;                                                 \
-    int next_rank = (my_rank + 1) % num_ranks;                                                             \
-    int self_rank = my_rank;                                                                               \
-                                                                                                           \
-    unsigned int seed_prev =                                                                               \
-        (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq);                           \
-    unsigned int seed_next =                                                                               \
-        (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + next_rank + seq);                           \
-    unsigned int seed_self =                                                                               \
-        (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + self_rank + seq);                           \
-                                                                                                           \
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;                                                 \
-         i < num_elems;                                                                                    \
-         i += blockDim.x * gridDim.x) {                                                                    \
-                                                                                                           \
-      seed_prev = ranqd1(seed_prev);                                                                       \
-      seed_next = ranqd1(seed_next);                                                                       \
-      seed_self = ranqd1(seed_self);                                                                       \
-                                                                                                           \
-      DataType exp_prev = DataType(seed_prev % blockDim.x) / DataType(blockDim.x);                         \
-      DataType exp_next = DataType(seed_next % blockDim.x) / DataType(blockDim.x);                         \
-      DataType exp_self = DataType(seed_self % blockDim.x) / DataType(blockDim.x);                         \
-                                                                                                           \
-      /* For compatibility: avoid %zu formatting quirks on device */                                        \
-/*      unsigned long long ii = (unsigned long long)i;                                                       \
-                                                                                                           \
-      if (result_buf[i] != exp_prev) {                                                                     \
-        /* Print only a few mismatches to avoid flooding */                                                 \
-/*        if (blockIdx.x == 0 && (threadIdx.x == 0 || threadIdx.x == 192) && ii < 256ULL) {                  \
-          printf("sendrecv-mismatch rank=%d nranks=%d i=%llu result=%f exp_prev(from %d)=%f "              \
-                 "exp_next(from %d)=%f exp_self(from %d)=%f\n",                                            \
-                 my_rank, num_ranks, ii,                                                                   \
-                 (float)result_buf[i],                                                                     \
-                 prev_rank, (float)exp_prev,                                                               \
-                 next_rank, (float)exp_next,                                                               \
-                 self_rank, (float)exp_self);                                                              \
-        }                                                                                                  \
-      }                                                                                                    \
-                                                                                                           \
-      test_buf[i] = exp_prev;                                                                              \
-      assert(result_buf[i] == test_buf[i]);                                                                \
-    }                                                                                                      \
-  }
-*/
-
-
-#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                                        \
-  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_sendrecv_##FuncNameType(                \
+// Sendrecv verification: ring receive from prev rank.
+// Replays the same PRNG sequence that fill_data used on the sender (prev_rank).
+#define TEST_DATA_SEND_RECV(FuncNameType, DataType)                                                       \
+  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_send_recv_##FuncNameType(               \
       DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {  \
-                                                                                                           \
-    /* Expected ring semantics (if your algorithm is ring-prev). */                                        \
-    int prev_rank = (my_rank - 1 + num_ranks) % num_ranks;                                                 \
-    int next_rank = (my_rank + 1) % num_ranks;                                                             \
-    int self_rank = my_rank;                                                                               \
-                                                                                                           \
-    /* Thread identity and stride must match fill_data_* generation pattern. */                            \
-    const unsigned long long tid =                                                                        \
-        (unsigned long long)(blockIdx.x * blockDim.x + threadIdx.x);                                       \
-    const unsigned long long stride =                                                                      \
-        (unsigned long long)(blockDim.x * gridDim.x);                                                      \
-                                                                                                           \
-    for (unsigned long long i = tid; i < (unsigned long long)num_elems; i += stride) {                    \
-                                                                                                           \
-      /* Compute how many iterations this thread advanced before reaching i. */                            \
-      unsigned long long k = (i - tid) / stride;                                                           \
-                                                                                                           \
-      /* Helper lambda: compute expected value for a given sender rank r at element i for this thread. */  \
-      auto expected_for_rank = [&](int r) -> DataType {                                                    \
-        unsigned int s = (unsigned int)(tid + (unsigned long long)r + (unsigned long long)seq);            \
-        /* fill_data does: seed=ranqd1(seed) once per element visited.                                     \
-           For the k-th visited element, apply ranqd1 (k+1) times. */                                      \
-        for (unsigned long long step = 0; step < k + 1; ++step) {                                          \
-          s = ranqd1(s);                                                                                   \
-        }                                                                                                  \
-        return DataType(s % blockDim.x) / DataType(blockDim.x);                                            \
-      };                                                                                                   \
-                                                                                                           \
-      DataType exp_prev = expected_for_rank(prev_rank);                                                    \
-      DataType exp_next = expected_for_rank(next_rank);                                                    \
-      DataType exp_self = expected_for_rank(self_rank);                                                    \
-                                                                                                           \
-      /* Store expected(prev) in test_buf for the assert (keeps compatibility with your current check). */ \
-      test_buf[i] = exp_prev;                                                                              \
-                                                                                                           \
-      if (result_buf[i] != test_buf[i]) {                                                                  \
-        /* Try to identify which rank's stream matches the observed result. */                             \
-        int matched = -1;                                                                                  \
-        for (int r = 0; r < num_ranks; ++r) {                                                              \
-          DataType exp_r = expected_for_rank(r);                                                           \
-          if (result_buf[i] == exp_r) {                                                                    \
-            matched = r;                                                                                   \
-            break;                                                                                          \
-          }                                                                                                \
-        }                                                                                                  \
-                                                                                                           \
-        /* Print only a small number of mismatches to avoid log spam. */                                   \
-        if (blockIdx.x == 0 && (threadIdx.x == 0 || threadIdx.x == 160) && i < 256ULL) {                   \
-          printf("sendrecv-mismatch rank=%d nranks=%d i=%llu result=%f "                                   \
-                 "exp_prev(from %d)=%f exp_next(from %d)=%f exp_self(from %d)=%f matched_sender=%d\n",     \
-                 my_rank, num_ranks, i,                                                                    \
-                 (float)result_buf[i],                                                                     \
-                 prev_rank, (float)exp_prev,                                                               \
-                 next_rank, (float)exp_next,                                                               \
-                 self_rank, (float)exp_self,                                                               \
-                 matched);                                                                                 \
-        }                                                                                                  \
-                                                                                                           \
-        assert(result_buf[i] == test_buf[i]);                                                              \
-      }                                                                                                    \
-    }                                                                                                      \
+    int peer_rank = (my_rank - 1 + num_ranks) % num_ranks;                                                \
+    unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + peer_rank + seq);          \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
+      seed = ranqd1(seed);                                                                                \
+      test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                                  \
+      assert(result_buf[i] == test_buf[i]);                                                               \
+    }                                                                                                     \
   }
 
-
-/*
-#define TEST_DATA_SENDRECV(FuncNameType, DataType)                                      \
-extern "C" __global__ void __launch_bounds__(1024, 1)                                  \
-test_data_sendrecv_##FuncNameType(                                                     \
-    DataType* result_buf,                                                              \
-    DataType* test_buf,                                                                \
-    size_t num_elems,                                                                  \
-    int num_ranks,                                                                     \
-    int my_rank,                                                                       \
-    int seq) {                                                                         \
-                                                                                       \
-  int prev_rank = (my_rank - 1 + num_ranks) % num_ranks;                               \
-  int next_rank = (my_rank + 1) % num_ranks;                                           \
-                                                                                       \
-  unsigned int seed_prev =                                                             \
-      (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq);         \
-  unsigned int seed_next =                                                             \
-      (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + next_rank + seq);         \
-                                                                                       \
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;                               \
-       i < num_elems;                                                                  \
-       i += blockDim.x * gridDim.x) {                                                   \
-                                                                                       \
-    seed_prev = ranqd1(seed_prev);                                                     \
-    seed_next = ranqd1(seed_next);                                                     \
-                                                                                       \
-    DataType exp_prev = DataType(seed_prev % blockDim.x) / DataType(blockDim.x);       \
-    DataType exp_next = DataType(seed_next % blockDim.x) / DataType(blockDim.x);       \
-                                                                                       \
-    if (result_buf[i] != exp_prev) {                                                   \
-      if (blockIdx.x == 0 && threadIdx.x == 0 && i < 8) {                              \
-        printf("***rank=%d i=%zu result=%f prev(from %d)=%f next(from %d)=%f\n",          \
-               my_rank, i, (float)result_buf[i],                                      \
-               prev_rank, (float)exp_prev,                                            \
-               next_rank, (float)exp_next);                                           \
-      }                                                                                \
-    }                                                                                  \
-                                                                                       \
-    test_buf[i] = exp_prev;                                                           \
-    assert(result_buf[i] == test_buf[i]);                                              \
-  }                                                                                    \
-}
-*/
-TEST_DATA_SENDRECV(float16, __half)
-TEST_DATA_SENDRECV(float32, float)
-TEST_DATA_SENDRECV(int32, int)
+TEST_DATA_SEND_RECV(float16, __half)
+TEST_DATA_SEND_RECV(float32, float)
+TEST_DATA_SEND_RECV(int32, int)

From 76fdd1db7ab8a53330e8024c9e261f07c302056f Mon Sep 17 00:00:00 2001
From: Ubuntu <Binyang Li>
Date: Sat, 11 Apr 2026 04:53:49 +0000
Subject: [PATCH 118/132] WIP

---
 .../{mscclpp_send_recv.py => send_recv.py}    |  2 +-
 python/mscclpp/language/collectives.py        | 43 +++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 rename python/mscclpp/default_algos/{mscclpp_send_recv.py => send_recv.py} (98%)

diff --git a/python/mscclpp/default_algos/mscclpp_send_recv.py b/python/mscclpp/default_algos/send_recv.py
similarity index 98%
rename from python/mscclpp/default_algos/mscclpp_send_recv.py
rename to python/mscclpp/default_algos/send_recv.py
index caa0575d1..2127eb913 100644
--- a/python/mscclpp/default_algos/mscclpp_send_recv.py
+++ b/python/mscclpp/default_algos/send_recv.py
@@ -11,7 +11,7 @@
 
 def send_recv_test(name, nnodes, gpus_per_node, split_mask):
     gpu_size = nnodes * gpus_per_node
-    collective = TestCollective(gpu_size, 1, 1)
+    collective = SendRecv(gpu_size, 1, False)
     with CollectiveProgram(
         name,
         collective,
diff --git a/python/mscclpp/language/collectives.py b/python/mscclpp/language/collectives.py
index 55c0e6b69..01c766bae 100644
--- a/python/mscclpp/language/collectives.py
+++ b/python/mscclpp/language/collectives.py
@@ -236,3 +236,46 @@ def init_buffers(self):
             }
             rank_buffers.append(buffers)
         return rank_buffers
+
+
+class SendRecv(Collective):
+    """A SendRecv collective communication pattern.
+
+    SendRecv performs a point-to-point send/receive operation in a ring topology.
+    Each rank sends its input buffer to the next rank and receives data from the
+    previous rank into its output buffer.
+
+    This operation creates input and output buffers both sized by chunk_factor,
+    as each rank sends and receives the same amount of data.
+    """
+
+    def __init__(self, num_ranks, chunk_factor, inplace):
+        """Initialize a new SendRecv collective.
+
+        Args:
+            num_ranks (int): The number of ranks participating in the SendRecv.
+            chunk_factor (int): The size factor for data chunks.
+            inplace (bool): Whether the operation should be performed in-place.
+
+        Example:
+            >>> sendrecv = SendRecv(num_ranks=4, chunk_factor=1, inplace=False)
+        """
+        Collective.__init__(self, num_ranks, chunk_factor, inplace)
+        self.name = "sendrecv"
+
+    def init_buffers(self):
+        """Initialize buffers for the SendRecv operation.
+
+        Creates input and output buffers both sized by chunk_factor.
+
+        Returns:
+            list: A list of buffer dictionaries, one for each rank.
+        """
+        rank_buffers = []
+        for rank in range(self.num_ranks):
+            buffers = {
+                BufferType.input: BaseBuffer(rank, BufferType.input, 0, self.chunk_factor),
+                BufferType.output: BaseBuffer(rank, BufferType.output, 0, self.chunk_factor),
+            }
+            rank_buffers.append(buffers)
+        return rank_buffers

From 57f7be62602c0a6a68cc6c607af6bc7ccce504d7 Mon Sep 17 00:00:00 2001
From: Ubuntu <Binyang Li>
Date: Sat, 11 Apr 2026 05:28:29 +0000
Subject: [PATCH 119/132] WIP

---
 python/mscclpp/default_algos/send_recv.py |  2 +-
 run_onenode.sh                            |  4 +--
 test.json                                 | 42 +++++++----------------
 3 files changed, 16 insertions(+), 32 deletions(-)

diff --git a/python/mscclpp/default_algos/send_recv.py b/python/mscclpp/default_algos/send_recv.py
index 2127eb913..08a49ad20 100644
--- a/python/mscclpp/default_algos/send_recv.py
+++ b/python/mscclpp/default_algos/send_recv.py
@@ -21,7 +21,7 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=4
+        instances=1
     ):
         # Creating separate port channels for next and prev directions.
         # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer
diff --git a/run_onenode.sh b/run_onenode.sh
index 6e7541d15..50b49e128 100755
--- a/run_onenode.sh
+++ b/run_onenode.sh
@@ -5,9 +5,9 @@ MPI_ARGS=""
 MPI_ARGS+="-x CUDA_VISIBLE_DEVICES=0,2 --mca coll ^ucc,hcoll   -mca coll_hcoll_enable 0 --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1 "
 MPI_ARGS+="-x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
 MPI_ARGS+=" -x MSCCLPP_IBV_MODE=host  -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
-MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1     -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_3   -x PATH=/home/azhpcuser/mahdieh/mscclpp/mscclpp2/bin/:$PATH "
+MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1     -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_3   -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin/:$PATH "
 MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3"
-MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp/mscclpp/bin/python3   /home/azhpcuser/mahdieh/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/mahdieh/mscclpp/test.json"
+MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3   /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/binyli/mscclpp/test.json"
 
 
 
diff --git a/test.json b/test.json
index 294c2a13e..511b7907e 100644
--- a/test.json
+++ b/test.json
@@ -1,6 +1,6 @@
 {
-  "name": "send_recv_test",
-  "collective": "test",
+  "name": "sendrecv",
+  "collective": "sendrecv",
   "protocol": "Simple",
   "inplace": false,
   "reuse_resources": false,
@@ -24,7 +24,7 @@
             {
               "name": "wait",
               "channel_ids": [
-                0
+                1
               ],
               "channel_type": "port"
             },
@@ -32,7 +32,7 @@
               "name": "nop"
             },
             {
-              "name": "put",
+              "name": "pws",
               "src_buff": [
                 {
                   "type": "i",
@@ -48,17 +48,7 @@
                 }
               ],
               "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
+                1
               ],
               "channel_type": "port"
             },
@@ -74,6 +64,7 @@
             {
               "channel_type": "port",
               "channel_ids": [
+                1,
                 0
               ]
             }
@@ -92,6 +83,7 @@
         {
           "channel_type": "port",
           "connected_to": [
+            1,
             1
           ]
         }
@@ -126,7 +118,7 @@
             {
               "name": "wait",
               "channel_ids": [
-                0
+                1
               ],
               "channel_type": "port"
             },
@@ -134,7 +126,7 @@
               "name": "nop"
             },
             {
-              "name": "put",
+              "name": "pws",
               "src_buff": [
                 {
                   "type": "i",
@@ -150,17 +142,7 @@
                 }
               ],
               "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
+                1
               ],
               "channel_type": "port"
             },
@@ -176,7 +158,8 @@
             {
               "channel_type": "port",
               "channel_ids": [
-                0
+                0,
+                1
               ]
             }
           ],
@@ -194,6 +177,7 @@
         {
           "channel_type": "port",
           "connected_to": [
+            0,
             0
           ]
         }

From 65139d6f6d6594e10c23c402ab2bab90108edd40 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Sat, 11 Apr 2026 06:12:46 +0000
Subject: [PATCH 120/132] WIP

---
 python/mscclpp/default_algos/send_recv.py |   2 +-
 test.json                                 | 424 +++++++++++++++++++++-
 2 files changed, 420 insertions(+), 6 deletions(-)

diff --git a/python/mscclpp/default_algos/send_recv.py b/python/mscclpp/default_algos/send_recv.py
index 08a49ad20..2127eb913 100644
--- a/python/mscclpp/default_algos/send_recv.py
+++ b/python/mscclpp/default_algos/send_recv.py
@@ -21,7 +21,7 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=1
+        instances=4
     ):
         # Creating separate port channels for next and prev directions.
         # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer
diff --git a/test.json b/test.json
index 511b7907e..4f412033e 100644
--- a/test.json
+++ b/test.json
@@ -7,8 +7,8 @@
   "gpus": [
     {
       "id": 0,
-      "input_chunks": 1,
-      "output_chunks": 1,
+      "input_chunks": 4,
+      "output_chunks": 4,
       "scratch_chunks": 0,
       "threadblocks": [
         {
@@ -64,8 +64,209 @@
             {
               "channel_type": "port",
               "channel_ids": [
-                1,
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
                 0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
               ]
             }
           ],
@@ -83,6 +284,12 @@
         {
           "channel_type": "port",
           "connected_to": [
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
             1,
             1
           ]
@@ -101,8 +308,8 @@
     },
     {
       "id": 1,
-      "input_chunks": 1,
-      "output_chunks": 1,
+      "input_chunks": 4,
+      "output_chunks": 4,
       "scratch_chunks": 0,
       "threadblocks": [
         {
@@ -159,7 +366,208 @@
               "channel_type": "port",
               "channel_ids": [
                 0,
+                4
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                1,
+                5
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
                 1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                2,
+                6
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                3,
+                7
               ]
             }
           ],
@@ -177,6 +585,12 @@
         {
           "channel_type": "port",
           "connected_to": [
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
             0,
             0
           ]

From 456ef7e5babf3f79796c0b1a3550871a03bc3ea6 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Sat, 11 Apr 2026 06:33:36 +0000
Subject: [PATCH 121/132] fix

---
 src/core/executor/executor.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc
index b5510b630..3097bcdec 100644
--- a/src/core/executor/executor.cc
+++ b/src/core/executor/executor.cc
@@ -268,7 +268,9 @@ struct Executor::Impl {
     // Create one connection (unique QP) per channel entry. Each channel gets its own
     // QP — no shared connections. This is required for HostNoAtomic IB mode where each
     // connection can only forward signals to one semaphore via setSignalForwardingDst.
-    int tag = 0;
+    // Use per-peer tag counters so that matched connections between pairs of ranks use
+    // the same tag, regardless of the order peers appear in each rank's connected_to list.
+    std::unordered_map<int, int> peerTagCounters;
     Transport ibTransport = IBs[rank % this->nranksPerNode];
     std::vector<std::shared_future<Connection>> connFutures;
     for (ChannelType channelType : {ChannelType::MEMORY, ChannelType::PORT}) {
@@ -276,14 +278,14 @@ struct Executor::Impl {
       for (const auto& info : channelInfos) {
         for (int peer : info.connectedPeers) {
           Transport transport = useIB(rank, peer, this->nranksPerNode) ? ibTransport : Transport::CudaIpc;
-          connFutures.push_back(this->comm->connect(transport, peer, tag++));
+          connFutures.push_back(this->comm->connect(transport, peer, peerTagCounters[peer]++));
         }
       }
       channelInfos = plan.impl_->getUnpairedChannelInfos(nranks, channelType);
       for (const auto& info : channelInfos) {
         for (int peer : info.connectedPeers) {
           Transport transport = useIB(rank, peer, this->nranksPerNode) ? ibTransport : Transport::CudaIpc;
-          connFutures.push_back(this->comm->connect(transport, peer, tag++));
+          connFutures.push_back(this->comm->connect(transport, peer, peerTagCounters[peer]++));
         }
       }
     }

From 36abcbedd39d4c6bca55b64dff0daeb26d00bbd6 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Sat, 11 Apr 2026 06:40:19 +0000
Subject: [PATCH 122/132] WIP

---
 run-sendrecv2.sh |   6 +-
 test.json        | 630 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 619 insertions(+), 17 deletions(-)

diff --git a/run-sendrecv2.sh b/run-sendrecv2.sh
index 556cc09dd..c6fd42de4 100755
--- a/run-sendrecv2.sh
+++ b/run-sendrecv2.sh
@@ -4,9 +4,9 @@ MPI_ARGS=""
 MPI_ARGS+=" -x CUDA_VISIBLE_DEVICES=1 -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
 MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
 MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
-MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/mscclpp/bin/:$PATH "
+MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin:$PATH "
 MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_0"
-MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/mscclpp/bin/python3   /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/sendrecv.json"
+MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3   /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/binyli/mscclpp/test.json"
 
 
-mpirun -np 2 --hostfile ./hosts --map-by ppr:1:node  $MPI_ARGS --size 1K
+mpirun -np 4 --hostfile ./hosts --map-by ppr:1:node  $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5 
diff --git a/test.json b/test.json
index 4f412033e..3b98c1a4d 100644
--- a/test.json
+++ b/test.json
@@ -288,6 +288,608 @@
             1,
             1,
             1,
+            3,
+            3,
+            3,
+            3
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 1,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 1,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            2,
+            2,
+            2,
+            2,
+            0,
+            0,
+            0,
+            0
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 2,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 2,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            3,
+            3,
+            3,
+            3,
             1,
             1,
             1,
@@ -297,7 +899,7 @@
       ],
       "remote_buffers": [
         {
-          "rank": 1,
+          "rank": 3,
           "type": "o",
           "access_channel_types": [
             "port"
@@ -307,7 +909,7 @@
       "semaphores": []
     },
     {
-      "id": 1,
+      "id": 3,
       "input_chunks": 4,
       "output_chunks": 4,
       "scratch_chunks": 0,
@@ -365,8 +967,8 @@
             {
               "channel_type": "port",
               "channel_ids": [
-                0,
-                4
+                4,
+                0
               ]
             }
           ],
@@ -432,8 +1034,8 @@
             {
               "channel_type": "port",
               "channel_ids": [
-                1,
-                5
+                5,
+                1
               ]
             }
           ],
@@ -499,8 +1101,8 @@
             {
               "channel_type": "port",
               "channel_ids": [
-                2,
-                6
+                6,
+                2
               ]
             }
           ],
@@ -566,8 +1168,8 @@
             {
               "channel_type": "port",
               "channel_ids": [
-                3,
-                7
+                7,
+                3
               ]
             }
           ],
@@ -589,10 +1191,10 @@
             0,
             0,
             0,
-            0,
-            0,
-            0,
-            0
+            2,
+            2,
+            2,
+            2
           ]
         }
       ],

From a2a1b89181678f7f1d955e9e2c271a218ea57b8d Mon Sep 17 00:00:00 2001
From: binyli <binyli@microsoft.com>
Date: Mon, 13 Apr 2026 20:52:52 +0000
Subject: [PATCH 123/132] for 4 nodes

---
 python/test/executor_test.py          |   28 +
 python/test/executor_test_verifier.cu |    9 +-
 run-sendrecv2.sh                      |    6 +-
 test.json                             | 3686 ++++++++++++++++++++++++-
 4 files changed, 3685 insertions(+), 44 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index be6e5834c..14e3e21c2 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -61,6 +61,26 @@ def bench_time(n_iters: int, n_graph_iters: int, func: Union[Callable, list[Call
     return cp.cuda.get_elapsed_time(start, end) / n_iters * 1000.0 / n_graph_iters
 
 
+def get_prev_rank(my_rank: int, num_ranks: int, split_mask: int) -> int:
+    """Determine the previous rank in the ring based on the split_mask topology."""
+    group_size = split_mask + 1
+    num_groups = num_ranks // group_size
+    position_in_group = my_rank & split_mask
+    group_id = my_rank // group_size
+    prev_group_id = (group_id - 1 + num_groups) % num_groups
+    return prev_group_id * group_size + position_in_group
+
+
+def get_next_rank(my_rank: int, num_ranks: int, split_mask: int) -> int:
+    """Determine the next rank in the ring based on the split_mask topology."""
+    group_size = split_mask + 1
+    num_groups = num_ranks // group_size
+    position_in_group = my_rank & split_mask
+    group_id = my_rank // group_size
+    next_group_id = (group_id + 1) % num_groups
+    return next_group_id * group_size + position_in_group
+
+
 def bench_correctness(
     collective: str,
     input_buf: Union[cp.ndarray, list[cp.ndarray]],
@@ -71,6 +91,7 @@ def bench_correctness(
     num_ranks: int,
     n_iters: int,
     func: Union[Callable, list[Callable]],
+    split_mask: int = 0,
 ):
     """Validate correctness. For sendrecv, buffers and func are lists of 2 for double-buffer."""
     type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
@@ -123,6 +144,9 @@ def bench_correctness(
                 + struct.pack("Q", cur_input.nbytes // type_size)
                 + pack(num_ranks, rank, i)
             )
+            if "sendrecv" in collective:
+                prev_rank = get_prev_rank(rank, num_ranks, split_mask)
+                test_data_params += pack(prev_rank)
             test_data_kernel.launch_kernel(test_data_params, nblocks, nthreads, 0, stream)
         graph = stream.end_capture()
     graph.launch(stream)
@@ -208,6 +232,7 @@ def main(
     packet_type: PacketType = PacketType.LL16,
     n_iters: int = 10,
     n_graph_iters: int = 10,
+    split_mask: int = 0,
 ):
     mscclpp_group = CommGroup(MPI.COMM_WORLD)
     cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
@@ -270,6 +295,7 @@ def main(
         mscclpp_group.nranks,
         n_iters,
         executor_funcs if sendrecv_mode else executor_func,
+        split_mask=split_mask,
     )
 
     mscclpp_group.barrier()
@@ -298,6 +324,7 @@ def main(
     parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
     parser.add_argument("--n_iters", type=int, default=10)
     parser.add_argument("--n_graph_iters", type=int, default=10)
+    parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x0, help="split mask for sendrecv (e.g. 0x3)")
     args = parser.parse_args()
 
     packet_type = PacketType.LL16
@@ -313,4 +340,5 @@ def main(
         packet_type,
         args.n_iters,
         args.n_graph_iters,
+        args.split_mask,
     )
diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu
index b70aee4a6..38fa39d72 100644
--- a/python/test/executor_test_verifier.cu
+++ b/python/test/executor_test_verifier.cu
@@ -122,13 +122,14 @@ TEST_DATA_ALL_TO_ALL(float16, __half)
 TEST_DATA_ALL_TO_ALL(float32, float)
 TEST_DATA_ALL_TO_ALL(int32, int)
 
-// Sendrecv verification: ring receive from prev rank.
+// Sendrecv verification: receive from prev rank in the ring.
 // Replays the same PRNG sequence that fill_data used on the sender (prev_rank).
+// prev_rank is passed explicitly since the ring topology depends on split_mask.
 #define TEST_DATA_SEND_RECV(FuncNameType, DataType)                                                       \
   extern "C" __global__ void __launch_bounds__(1024, 1) test_data_send_recv_##FuncNameType(               \
-      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {  \
-    int peer_rank = (my_rank - 1 + num_ranks) % num_ranks;                                                \
-    unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + peer_rank + seq);          \
+      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq,    \
+      int prev_rank) {                                                                                    \
+    unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq);          \
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
       seed = ranqd1(seed);                                                                                \
       test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                                  \
diff --git a/run-sendrecv2.sh b/run-sendrecv2.sh
index c6fd42de4..57102bfb2 100755
--- a/run-sendrecv2.sh
+++ b/run-sendrecv2.sh
@@ -1,12 +1,12 @@
 module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
 
 MPI_ARGS=""
-MPI_ARGS+=" -x CUDA_VISIBLE_DEVICES=1 -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
+MPI_ARGS+=" -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
 MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
 MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
 MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin:$PATH "
-MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_0"
+MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2"
 MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3   /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/binyli/mscclpp/test.json"
 
 
-mpirun -np 4 --hostfile ./hosts --map-by ppr:1:node  $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5 
+mpirun -np 16 --hostfile ./hosts --map-by ppr:4:node  $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5 --split_mask 0x3
diff --git a/test.json b/test.json
index 3b98c1a4d..eb28dd27c 100644
--- a/test.json
+++ b/test.json
@@ -1,5 +1,5 @@
 {
-  "name": "sendrecv",
+  "name": "send_recv",
   "collective": "sendrecv",
   "protocol": "Simple",
   "inplace": false,
@@ -284,20 +284,20 @@
         {
           "channel_type": "port",
           "connected_to": [
-            1,
-            1,
-            1,
-            1,
-            3,
-            3,
-            3,
-            3
+            4,
+            4,
+            4,
+            4,
+            12,
+            12,
+            12,
+            12
           ]
         }
       ],
       "remote_buffers": [
         {
-          "rank": 1,
+          "rank": 4,
           "type": "o",
           "access_channel_types": [
             "port"
@@ -585,20 +585,20 @@
         {
           "channel_type": "port",
           "connected_to": [
-            2,
-            2,
-            2,
-            2,
-            0,
-            0,
-            0,
-            0
+            5,
+            5,
+            5,
+            5,
+            13,
+            13,
+            13,
+            13
           ]
         }
       ],
       "remote_buffers": [
         {
-          "rank": 2,
+          "rank": 5,
           "type": "o",
           "access_channel_types": [
             "port"
@@ -886,20 +886,20 @@
         {
           "channel_type": "port",
           "connected_to": [
-            3,
-            3,
-            3,
-            3,
-            1,
-            1,
-            1,
-            1
+            6,
+            6,
+            6,
+            6,
+            14,
+            14,
+            14,
+            14
           ]
         }
       ],
       "remote_buffers": [
         {
-          "rank": 3,
+          "rank": 6,
           "type": "o",
           "access_channel_types": [
             "port"
@@ -1187,20 +1187,3632 @@
         {
           "channel_type": "port",
           "connected_to": [
-            0,
-            0,
-            0,
-            0,
-            2,
-            2,
-            2,
-            2
+            7,
+            7,
+            7,
+            7,
+            15,
+            15,
+            15,
+            15
           ]
         }
       ],
       "remote_buffers": [
         {
-          "rank": 0,
+          "rank": 7,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 4,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            8,
+            8,
+            8,
+            8,
+            0,
+            0,
+            0,
+            0
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 8,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 5,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            9,
+            9,
+            9,
+            9,
+            1,
+            1,
+            1,
+            1
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 9,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 6,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            10,
+            10,
+            10,
+            10,
+            2,
+            2,
+            2,
+            2
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 10,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 7,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            11,
+            11,
+            11,
+            11,
+            3,
+            3,
+            3,
+            3
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 11,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 8,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            12,
+            12,
+            12,
+            12,
+            4,
+            4,
+            4,
+            4
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 12,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 9,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            13,
+            13,
+            13,
+            13,
+            5,
+            5,
+            5,
+            5
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 13,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 10,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            14,
+            14,
+            14,
+            14,
+            6,
+            6,
+            6,
+            6
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 14,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 11,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            15,
+            15,
+            15,
+            15,
+            7,
+            7,
+            7,
+            7
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 15,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 12,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            0,
+            0,
+            0,
+            0,
+            8,
+            8,
+            8,
+            8
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 0,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 13,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            1,
+            1,
+            1,
+            1,
+            9,
+            9,
+            9,
+            9
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 1,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 14,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            2,
+            2,
+            2,
+            2,
+            10,
+            10,
+            10,
+            10
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 2,
+          "type": "o",
+          "access_channel_types": [
+            "port"
+          ]
+        }
+      ],
+      "semaphores": []
+    },
+    {
+      "id": 15,
+      "input_chunks": 4,
+      "output_chunks": 4,
+      "scratch_chunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 0,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                4,
+                0
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 1,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                5,
+                1
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 2,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                6,
+                2
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "nop"
+            },
+            {
+              "name": "pws",
+              "src_buff": [
+                {
+                  "type": "i",
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "dst_buff": [
+                {
+                  "buffer_id": 0,
+                  "index": 3,
+                  "size": 1
+                }
+              ],
+              "channel_ids": [
+                1
+              ],
+              "channel_type": "port"
+            },
+            {
+              "name": "wait",
+              "channel_ids": [
+                0
+              ],
+              "channel_type": "port"
+            }
+          ],
+          "channels": [
+            {
+              "channel_type": "port",
+              "channel_ids": [
+                7,
+                3
+              ]
+            }
+          ],
+          "remote_buffer_refs": [
+            {
+              "access_channel_type": "port",
+              "remote_buffer_ids": [
+                0
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "channel_type": "port",
+          "connected_to": [
+            3,
+            3,
+            3,
+            3,
+            11,
+            11,
+            11,
+            11
+          ]
+        }
+      ],
+      "remote_buffers": [
+        {
+          "rank": 3,
           "type": "o",
           "access_channel_types": [
             "port"

From 1fd5ed8f18fd6d4479da9f497f7663b12429c981 Mon Sep 17 00:00:00 2001
From: Ubuntu <mahdiehghazi@microsoft.com>
Date: Mon, 13 Apr 2026 21:20:04 +0000
Subject: [PATCH 124/132] update the script

---
 run-sendrecv2.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/run-sendrecv2.sh b/run-sendrecv2.sh
index 57102bfb2..f4c0e8982 100755
--- a/run-sendrecv2.sh
+++ b/run-sendrecv2.sh
@@ -1,12 +1,14 @@
 module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
 
+export MSCCLPPHOME=/home/azhpcuser/mscclpp-test/mscclpp/
+
 MPI_ARGS=""
 MPI_ARGS+=" -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
 MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
 MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
-MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin:$PATH "
+MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=$MSCCLPPHOME/mscclpp/bin:$PATH "
 MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2"
-MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3   /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/binyli/mscclpp/test.json"
+MPI_ARGS+=" $MSCCLPPHOME/mscclpp/bin/python3   $MSCCLPPHOME/python/test/executor_test.py   -path $MSCCLPPHOME/test.json"
 
 
 mpirun -np 16 --hostfile ./hosts --map-by ppr:4:node  $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5 --split_mask 0x3

From 3a1e2d4808a1b4475f5875532142366e8bc70e93 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 21 May 2026 00:00:33 +0000
Subject: [PATCH 125/132] clean

---
 copyjson.sh      |   17 -
 generate-json.sh |   18 -
 run-sendrecv2.sh |   14 -
 run.sh           |   15 -
 run_onenode.sh   |   14 -
 test.json        | 4830 ----------------------------------------------
 6 files changed, 4908 deletions(-)
 delete mode 100755 copyjson.sh
 delete mode 100755 generate-json.sh
 delete mode 100755 run-sendrecv2.sh
 delete mode 100755 run.sh
 delete mode 100755 run_onenode.sh
 delete mode 100644 test.json

diff --git a/copyjson.sh b/copyjson.sh
deleted file mode 100755
index 9e0771e13..000000000
--- a/copyjson.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-set -ex
-
-# Check if the number of arguments is exactly 1
-if [ "$#" -ne 1 ]; then
-    echo "Usage: $0 <hostfile>"
-    exit 1
-fi
-export MSCCLPPHOME=/home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/
-
-HOSTFILE=$1
-
-parallel-scp -h "$HOSTFILE" -p128 -t1800 -r  ./*.json $MSCCLPPHOME
-
-parallel-scp -h "$HOSTFILE" -p128 -t1800 -r ./python/test/executor_test.py $MSCCLPPHOME/python/test/
-
-parallel-scp -h "$HOSTFILE" -p128 -t1800 -r ./python/test/executor_test_verifier.cu $MSCCLPPHOME/python/test/
diff --git a/generate-json.sh b/generate-json.sh
deleted file mode 100755
index 25c21b14e..000000000
--- a/generate-json.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-set -ex
-
-# Check if the number of arguments is exactly 1
-if [ "$#" -ne 3 ]; then
-    echo "Usage: $0 <hostfile> <nnodes> <ppn>"
-    exit 1
-fi
-
-HOSTFILE=$1
-NNODES=$2
-PPN=$3
-
-parallel-scp -h "$HOSTFILE" -p32 -t1800 -r python/test/executor_test.py /home/azhpcuser/mahdieh/mscclpp/python/test/
-
-parallel-scp -h "$HOSTFILE" -p32 -t1800 -r python/mscclpp/default_algos/mscclpp_send_recv.py /home/azhpcuser/mahdieh/mscclpp/python/mscclpp/default_algos/ 
-
-parallel-ssh -h "$HOSTFILE" -p32 -i -t1800 "cd /home/azhpcuser/mahdieh/mscclpp && source mscclpp/bin/activate && python3 python/mscclpp/default_algos/mscclpp_send_recv.py --name send_recv_test --nnodes $NNODES --gpus_per_node $PPN --split_mask 0x3 > test.json "
diff --git a/run-sendrecv2.sh b/run-sendrecv2.sh
deleted file mode 100755
index f4c0e8982..000000000
--- a/run-sendrecv2.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
-
-export MSCCLPPHOME=/home/azhpcuser/mscclpp-test/mscclpp/
-
-MPI_ARGS=""
-MPI_ARGS+=" -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
-MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
-MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
-MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=$MSCCLPPHOME/mscclpp/bin:$PATH "
-MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2"
-MPI_ARGS+=" $MSCCLPPHOME/mscclpp/bin/python3   $MSCCLPPHOME/python/test/executor_test.py   -path $MSCCLPPHOME/test.json"
-
-
-mpirun -np 16 --hostfile ./hosts --map-by ppr:4:node  $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5 --split_mask 0x3
diff --git a/run.sh b/run.sh
deleted file mode 100755
index 1d603f267..000000000
--- a/run.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-
-module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
-
-MPI_ARGS=""
-MPI_ARGS+=" -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
-MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
-MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
-MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/mahdieh/mscclpp/mscclpp2/bin/:$PATH "
-MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2"
-MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp/mscclpp/bin/python3   /home/azhpcuser/mahdieh/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/mahdieh/mscclpp/test.json"
-
-
-mpirun -np 16 --hostfile ./hosts --map-by ppr:4:node  $MPI_ARGS --size 1G --n_iters 30 #--n_graph_iters 100
-
-#mpirun -np 8  --hostfile /home/azhpcuser/binyli/hostfile   --map-by ppr:4:node   -mca coll_hcoll_enable 0 --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1 -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH -x MSCCLPP_IBV_MODE=host-no-atomic  -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1   -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so   -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1     -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2   -x PATH=/home/azhpcuser/binyli/mscclpp/bin:$PATH  -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=WARN -x MSCCLPP_IB_GID_INDEX=3 /home/azhpcuser/binyli/mscclpp/bin/python3   /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/binyli/mscclpp/test.json   --size 1G --n_iters 30
diff --git a/run_onenode.sh b/run_onenode.sh
deleted file mode 100755
index 50b49e128..000000000
--- a/run_onenode.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-
-module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
-
-MPI_ARGS=""
-MPI_ARGS+="-x CUDA_VISIBLE_DEVICES=0,2 --mca coll ^ucc,hcoll   -mca coll_hcoll_enable 0 --mca btl tcp,vader,self --mca pml ob1   --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1 "
-MPI_ARGS+="-x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
-MPI_ARGS+=" -x MSCCLPP_IBV_MODE=host  -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
-MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1     -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_3   -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin/:$PATH "
-MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR  -x MSCCLPP_IB_GID_INDEX=3"
-MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3   /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py   -path /home/azhpcuser/binyli/mscclpp/test.json"
-
-
-
-mpirun -np 2 $MPI_ARGS --size 4K --n_iters 500  --n_graph_iters 100
diff --git a/test.json b/test.json
deleted file mode 100644
index eb28dd27c..000000000
--- a/test.json
+++ /dev/null
@@ -1,4830 +0,0 @@
-{
-  "name": "send_recv",
-  "collective": "sendrecv",
-  "protocol": "Simple",
-  "inplace": false,
-  "reuse_resources": false,
-  "gpus": [
-    {
-      "id": 0,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            4,
-            4,
-            4,
-            4,
-            12,
-            12,
-            12,
-            12
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 4,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 1,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            5,
-            5,
-            5,
-            5,
-            13,
-            13,
-            13,
-            13
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 5,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 2,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            6,
-            6,
-            6,
-            6,
-            14,
-            14,
-            14,
-            14
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 6,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 3,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            7,
-            7,
-            7,
-            7,
-            15,
-            15,
-            15,
-            15
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 7,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 4,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            8,
-            8,
-            8,
-            8,
-            0,
-            0,
-            0,
-            0
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 8,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 5,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            9,
-            9,
-            9,
-            9,
-            1,
-            1,
-            1,
-            1
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 9,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 6,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            10,
-            10,
-            10,
-            10,
-            2,
-            2,
-            2,
-            2
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 10,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 7,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            11,
-            11,
-            11,
-            11,
-            3,
-            3,
-            3,
-            3
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 11,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 8,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            12,
-            12,
-            12,
-            12,
-            4,
-            4,
-            4,
-            4
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 12,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 9,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            13,
-            13,
-            13,
-            13,
-            5,
-            5,
-            5,
-            5
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 13,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 10,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            14,
-            14,
-            14,
-            14,
-            6,
-            6,
-            6,
-            6
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 14,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 11,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            15,
-            15,
-            15,
-            15,
-            7,
-            7,
-            7,
-            7
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 15,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 12,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            0,
-            0,
-            0,
-            0,
-            8,
-            8,
-            8,
-            8
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 0,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 13,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            1,
-            1,
-            1,
-            1,
-            9,
-            9,
-            9,
-            9
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 1,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 14,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            2,
-            2,
-            2,
-            2,
-            10,
-            10,
-            10,
-            10
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 2,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    },
-    {
-      "id": 15,
-      "input_chunks": 4,
-      "output_chunks": 4,
-      "scratch_chunks": 0,
-      "threadblocks": [
-        {
-          "id": 0,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 0,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                4,
-                0
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 1,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 1,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                5,
-                1
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 2,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 2,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                6,
-                2
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        },
-        {
-          "id": 3,
-          "ops": [
-            {
-              "name": "signal",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "nop"
-            },
-            {
-              "name": "pws",
-              "src_buff": [
-                {
-                  "type": "i",
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "dst_buff": [
-                {
-                  "buffer_id": 0,
-                  "index": 3,
-                  "size": 1
-                }
-              ],
-              "channel_ids": [
-                1
-              ],
-              "channel_type": "port"
-            },
-            {
-              "name": "wait",
-              "channel_ids": [
-                0
-              ],
-              "channel_type": "port"
-            }
-          ],
-          "channels": [
-            {
-              "channel_type": "port",
-              "channel_ids": [
-                7,
-                3
-              ]
-            }
-          ],
-          "remote_buffer_refs": [
-            {
-              "access_channel_type": "port",
-              "remote_buffer_ids": [
-                0
-              ]
-            }
-          ]
-        }
-      ],
-      "channels": [
-        {
-          "channel_type": "port",
-          "connected_to": [
-            3,
-            3,
-            3,
-            3,
-            11,
-            11,
-            11,
-            11
-          ]
-        }
-      ],
-      "remote_buffers": [
-        {
-          "rank": 3,
-          "type": "o",
-          "access_channel_types": [
-            "port"
-          ]
-        }
-      ],
-      "semaphores": []
-    }
-  ],
-  "num_threads_per_block": 1024,
-  "use_double_scratch_buffer": false,
-  "buffer_alignment": 16,
-  "min_message_size": 0,
-  "max_message_size": 18446744073709551615
-}

From 8a42fe2886ee954f32157036f0d371d207dfa6e0 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 22 May 2026 05:22:27 +0000
Subject: [PATCH 126/132] revert

---
 include/mscclpp/core.hpp      | 3 +--
 src/core/endpoint.cc          | 6 ------
 src/core/executor/executor.cc | 1 -
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 31ab80ae2..45b56bcc0 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -8,7 +8,6 @@
 #include <bitset>
 #include <future>
 #include <memory>
-#include <mscclpp/env.hpp>
 #include <mscclpp/errors.hpp>
 #include <mscclpp/gpu_data_types.hpp>
 #include <mscclpp/version.hpp>
@@ -431,7 +430,7 @@ struct EndpointConfig {
        int maxWrPerSend = DefaultMaxWrPerSend, Mode mode = Mode::Default)
         : deviceIndex(deviceIndex),
           port(port),
-          gidIndex(env()->ibGidIndex > 0 ? env()->ibGidIndex : gidIndex),
+          gidIndex(gidIndex),
           maxCqSize(maxCqSize),
           maxCqPollNum(maxCqPollNum),
           maxSendWr(maxSendWr),
diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc
index 298288d14..fe51e348a 100644
--- a/src/core/endpoint.cc
+++ b/src/core/endpoint.cc
@@ -54,12 +54,6 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
 
     int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
 
-    // Override GID index from environment variable if set
-    int gidIndex = config_.ib.gidIndex;
-    if (env()->ibGidIndex >= 0) {
-      gidIndex = env()->ibGidIndex;
-    }
-
     ibQp_ = contextImpl.getIbContext(config_.transport)
                 ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
                            config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc
index e358dae03..85c1c9907 100644
--- a/src/core/executor/executor.cc
+++ b/src/core/executor/executor.cc
@@ -95,7 +95,6 @@ namespace {
 auto hasIBDevices = []() { return mscclpp::getIBDeviceCount() > 0; };
 
 auto useIB = [](int rank1, int rank2, int nranksPerNode) {
-  return true;
   bool inSameNode = rank1 / nranksPerNode == rank2 / nranksPerNode;
   return hasIBDevices() && !inSameNode;
 };

From 7784407565247e114a43674399fc5c551fdd9687 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 22 May 2026 16:24:43 +0000
Subject: [PATCH 127/132] WIP

---
 python/test/executor_test.py          |  29 +----
 python/test/executor_test_verifier.cu | 159 ++++++++++++++------------
 2 files changed, 92 insertions(+), 96 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index b1b3a36ba..11a88f879 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -63,26 +63,6 @@ def bench_time(n_iters: int, n_graph_iters: int, func: Union[Callable, list[Call
     return cp.cuda.get_elapsed_time(start, end) / n_iters * 1000.0 / n_graph_iters
 
 
-def get_prev_rank(my_rank: int, num_ranks: int, split_mask: int) -> int:
-    """Determine the previous rank in the ring based on the split_mask topology."""
-    group_size = split_mask + 1
-    num_groups = num_ranks // group_size
-    position_in_group = my_rank & split_mask
-    group_id = my_rank // group_size
-    prev_group_id = (group_id - 1 + num_groups) % num_groups
-    return prev_group_id * group_size + position_in_group
-
-
-def get_next_rank(my_rank: int, num_ranks: int, split_mask: int) -> int:
-    """Determine the next rank in the ring based on the split_mask topology."""
-    group_size = split_mask + 1
-    num_groups = num_ranks // group_size
-    position_in_group = my_rank & split_mask
-    group_id = my_rank // group_size
-    next_group_id = (group_id + 1) % num_groups
-    return next_group_id * group_size + position_in_group
-
-
 def bench_correctness(
     collective: str,
     input_buf: Union[cp.ndarray, list[cp.ndarray]],
@@ -138,17 +118,16 @@ def bench_correctness(
                 cur_test = test_buf
                 cur_func = func
 
-            fill_data_params = pack(cur_input) + struct.pack("Q", cur_input.nbytes // type_size) + pack(rank, i)
+            fill_data_params = (
+                pack(cur_input) + struct.pack("Q", cur_input.nbytes // type_size) + pack(rank, i, split_mask)
+            )
             fill_data_kernel.launch_kernel(fill_data_params, nblocks, nthreads, 0, stream)
             cur_func(stream)
             test_data_params = (
                 pack(cur_result, cur_test)
                 + struct.pack("Q", cur_input.nbytes // type_size)
-                + pack(num_ranks, rank, i)
+                + pack(num_ranks, rank, i, split_mask)
             )
-            if "sendrecv" in collective:
-                prev_rank = get_prev_rank(rank, num_ranks, split_mask)
-                test_data_params += pack(prev_rank)
             test_data_kernel.launch_kernel(test_data_params, nblocks, nthreads, 0, stream)
         graph = stream.end_capture()
     graph.launch(stream)
diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu
index 1da42a7b8..f784c9d37 100644
--- a/python/test/executor_test_verifier.cu
+++ b/python/test/executor_test_verifier.cu
@@ -22,14 +22,19 @@ static __device__ unsigned int ranqd1(unsigned int seed) {
 // fill/test kernel pairs must have the same thread block size to
 // match their random number series.
 
-#define FILL_DATA(FuncNameType, DataType)                                                                \
-  extern "C" __global__ void __launch_bounds__(1024, 1)                                                  \
-      fill_data_##FuncNameType(DataType* input_buf, size_t num_elems, int rank, int seq) {               \
-    unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq);              \
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
-      seed = ranqd1(seed);                                                                               \
-      input_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                                 \
-    }                                                                                                    \
+// `split_mask` groups ranks together: group_size = split_mask + 1, group_id = rank / group_size.
+// Data is seeded by group_id so that all ranks within a group produce the same fill, and ranks
+// in different groups produce different fills. With split_mask == 0 this reduces to per-rank
+// seeding (group_id == rank).
+#define FILL_DATA(FuncNameType, DataType)                                                                  \
+  extern "C" __global__ void __launch_bounds__(1024, 1)                                                    \
+      fill_data_##FuncNameType(DataType* input_buf, size_t num_elems, int rank, int seq, int split_mask) { \
+    int seed_rank = rank / (split_mask + 1);                                                               \
+    unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + seed_rank + seq);           \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {   \
+      seed = ranqd1(seed);                                                                                 \
+      input_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                                   \
+    }                                                                                                      \
   }
 
 FILL_DATA(bfloat16, __nv_bfloat16)
@@ -37,18 +42,20 @@ FILL_DATA(float16, __half)
 FILL_DATA(float32, float)
 FILL_DATA(int32, int)
 
-#define TEST_DATA_ALL_GATHER(FuncNameType, DataType)                                                       \
-  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_gather_##FuncNameType(               \
-      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {   \
-    for (int rank = 0; rank < num_ranks; rank++) {                                                         \
-      size_t rank_offset = rank * num_elems;                                                               \
-      unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq);              \
-      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
-        seed = ranqd1(seed);                                                                               \
-        test_buf[rank_offset + i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                    \
-        assert(result_buf[rank_offset + i] == test_buf[rank_offset + i]);                                  \
-      }                                                                                                    \
-    }                                                                                                      \
+#define TEST_DATA_ALL_GATHER(FuncNameType, DataType)                                                                 \
+  extern "C" __global__ void __launch_bounds__(1024, 1)                                                              \
+      test_data_all_gather_##FuncNameType(DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, \
+                                          int my_rank, int seq, int split_mask) {                                    \
+    for (int rank = 0; rank < num_ranks; rank++) {                                                                   \
+      size_t rank_offset = rank * num_elems;                                                                         \
+      int seed_rank = rank / (split_mask + 1);                                                                       \
+      unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + seed_rank + seq);                   \
+      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {           \
+        seed = ranqd1(seed);                                                                                         \
+        test_buf[rank_offset + i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                              \
+        assert(result_buf[rank_offset + i] == test_buf[rank_offset + i]);                                            \
+      }                                                                                                              \
+    }                                                                                                                \
   }
 
 TEST_DATA_ALL_GATHER(bfloat16, __nv_bfloat16)
@@ -56,25 +63,27 @@ TEST_DATA_ALL_GATHER(float16, __half)
 TEST_DATA_ALL_GATHER(float32, float)
 TEST_DATA_ALL_GATHER(int32, int)
 
-#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps)                                                  \
-  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_reduce_##FuncNameType(               \
-      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {   \
-    for (int rank = 0; rank < num_ranks; rank++) {                                                         \
-      unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq);              \
-      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
-        if (rank == 0) {                                                                                   \
-          test_buf[i] = 0;                                                                                 \
-        }                                                                                                  \
-        seed = ranqd1(seed);                                                                               \
-        test_buf[i] += DataType(seed % blockDim.x) / DataType(blockDim.x);                                 \
-      }                                                                                                    \
-    }                                                                                                      \
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {   \
-      float expected = float(test_buf[i]);                                                                 \
-      float result = float(result_buf[i]);                                                                 \
-      float tol = Eps * num_ranks * (1.0f + abs(expected));                                                \
-      assert(abs(result - expected) <= tol);                                                               \
-    }                                                                                                      \
+#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps)                                                            \
+  extern "C" __global__ void __launch_bounds__(1024, 1)                                                              \
+      test_data_all_reduce_##FuncNameType(DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, \
+                                          int my_rank, int seq, int split_mask) {                                    \
+    for (int rank = 0; rank < num_ranks; rank++) {                                                                   \
+      int seed_rank = rank / (split_mask + 1);                                                                       \
+      unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + seed_rank + seq);                   \
+      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {           \
+        if (rank == 0) {                                                                                             \
+          test_buf[i] = 0;                                                                                           \
+        }                                                                                                            \
+        seed = ranqd1(seed);                                                                                         \
+        test_buf[i] += DataType(seed % blockDim.x) / DataType(blockDim.x);                                           \
+      }                                                                                                              \
+    }                                                                                                                \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {             \
+      float expected = float(test_buf[i]);                                                                           \
+      float result = float(result_buf[i]);                                                                           \
+      float tol = Eps * num_ranks * (1.0f + abs(expected));                                                          \
+      assert(abs(result - expected) <= tol);                                                                         \
+    }                                                                                                                \
   }
 
 TEST_DATA_ALL_REDUCE(bfloat16, __nv_bfloat16, 7.8125e-3f)
@@ -83,12 +92,14 @@ TEST_DATA_ALL_REDUCE(float32, float, 1.1920929e-7f)
 TEST_DATA_ALL_REDUCE(int32, int, 0.0f)
 
 #define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType, Eps)                                              \
-  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_reduce_scatter_##FuncNameType(           \
-      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {   \
+  extern "C" __global__ void __launch_bounds__(1024, 1)                                                    \
+      test_data_reduce_scatter_##FuncNameType(DataType* result_buf, DataType* test_buf, size_t num_elems,  \
+                                              int num_ranks, int my_rank, int seq, int split_mask) {       \
     int nem_elems_per_rank = num_elems / num_ranks;                                                        \
     int offset = nem_elems_per_rank * my_rank;                                                             \
     for (int rank = 0; rank < num_ranks; rank++) {                                                         \
-      unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq);              \
+      int seed_rank = rank / (split_mask + 1);                                                             \
+      unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + seed_rank + seq);         \
       for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
         if (rank == 0) {                                                                                   \
           test_buf[i] = 0;                                                                                 \
@@ -112,22 +123,24 @@ TEST_DATA_REDUCE_SCATTER(float16, __half, 9.765625e-4f)
 TEST_DATA_REDUCE_SCATTER(float32, float, 1.1920929e-7f)
 TEST_DATA_REDUCE_SCATTER(int32, int, 0.0f)
 
-#define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType)                                                       \
-  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_to_all_##FuncNameType(               \
-      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {   \
-    int nem_elems_per_rank = num_elems / num_ranks;                                                        \
-    int offset = nem_elems_per_rank * my_rank;                                                             \
-    for (int rank = 0; rank < num_ranks; rank++) {                                                         \
-      size_t rank_offset = rank * nem_elems_per_rank;                                                      \
-      unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq);              \
-      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
-        seed = ranqd1(seed);                                                                               \
-        if (i >= my_rank * nem_elems_per_rank && i < (my_rank + 1) * nem_elems_per_rank) {                 \
-          test_buf[rank_offset + i - offset] = DataType(seed % blockDim.x) / DataType(blockDim.x);         \
-          assert(result_buf[rank_offset + i - offset] == test_buf[rank_offset + i - offset]);              \
-        }                                                                                                  \
-      }                                                                                                    \
-    }                                                                                                      \
+#define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType)                                                                 \
+  extern "C" __global__ void __launch_bounds__(1024, 1)                                                              \
+      test_data_all_to_all_##FuncNameType(DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, \
+                                          int my_rank, int seq, int split_mask) {                                    \
+    int nem_elems_per_rank = num_elems / num_ranks;                                                                  \
+    int offset = nem_elems_per_rank * my_rank;                                                                       \
+    for (int rank = 0; rank < num_ranks; rank++) {                                                                   \
+      size_t rank_offset = rank * nem_elems_per_rank;                                                                \
+      int seed_rank = rank / (split_mask + 1);                                                                       \
+      unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + seed_rank + seq);                   \
+      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {           \
+        seed = ranqd1(seed);                                                                                         \
+        if (i >= my_rank * nem_elems_per_rank && i < (my_rank + 1) * nem_elems_per_rank) {                           \
+          test_buf[rank_offset + i - offset] = DataType(seed % blockDim.x) / DataType(blockDim.x);                   \
+          assert(result_buf[rank_offset + i - offset] == test_buf[rank_offset + i - offset]);                        \
+        }                                                                                                            \
+      }                                                                                                              \
+    }                                                                                                                \
   }
 
 TEST_DATA_ALL_TO_ALL(bfloat16, __nv_bfloat16)
@@ -135,19 +148,23 @@ TEST_DATA_ALL_TO_ALL(float16, __half)
 TEST_DATA_ALL_TO_ALL(float32, float)
 TEST_DATA_ALL_TO_ALL(int32, int)
 
-// Sendrecv verification: receive from prev rank in the ring.
-// Replays the same PRNG sequence that fill_data used on the sender (prev_rank).
-// prev_rank is passed explicitly since the ring topology depends on split_mask.
-#define TEST_DATA_SEND_RECV(FuncNameType, DataType)                                                       \
-  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_send_recv_##FuncNameType(               \
-      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq,    \
-      int prev_rank) {                                                                                    \
-    unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq);          \
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
-      seed = ranqd1(seed);                                                                                \
-      test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                                  \
-      assert(result_buf[i] == test_buf[i]);                                                               \
-    }                                                                                                     \
+// Sendrecv verification: receive from the prev group in the ring.
+// fill_data seeds by group_id (rank / (split_mask + 1)); the receiver in group g expects the
+// data produced by group (g - 1 + num_groups) % num_groups, so we recompute that seed here.
+#define TEST_DATA_SEND_RECV(FuncNameType, DataType)                                                                 \
+  extern "C" __global__ void __launch_bounds__(1024, 1)                                                             \
+      test_data_send_recv_##FuncNameType(DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, \
+                                         int my_rank, int seq, int split_mask) {                                    \
+    int group_size = split_mask + 1;                                                                                \
+    int num_groups = num_ranks / group_size;                                                                        \
+    int my_group_id = my_rank / group_size;                                                                         \
+    int prev_group_id = (my_group_id - 1 + num_groups) % num_groups;                                                \
+    unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_group_id + seq);                \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {            \
+      seed = ranqd1(seed);                                                                                          \
+      test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x);                                             \
+      assert(result_buf[i] == test_buf[i]);                                                                         \
+    }                                                                                                               \
   }
 
 TEST_DATA_SEND_RECV(float16, __half)

From e6005205098b823d27b1c7a87456f0dc37646ded Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 22 May 2026 17:11:00 +0000
Subject: [PATCH 128/132] WIP

---
 python/mscclpp/language/collectives.py             |  2 +-
 .../tests/multi_node}/send_recv.py                 | 11 +++++------
 python/test/executor_test.py                       |  4 +++-
 src/core/executor/executor.cc                      | 14 +++++++++-----
 4 files changed, 18 insertions(+), 13 deletions(-)
 rename python/mscclpp/{default_algos => language/tests/multi_node}/send_recv.py (91%)

diff --git a/python/mscclpp/language/collectives.py b/python/mscclpp/language/collectives.py
index 01c766bae..15d41ad10 100644
--- a/python/mscclpp/language/collectives.py
+++ b/python/mscclpp/language/collectives.py
@@ -241,7 +241,7 @@ def init_buffers(self):
 class SendRecv(Collective):
     """A SendRecv collective communication pattern.
 
-    SendRecv performs a point-to-point send/receive operation in a ring topology.
+    SendRecv performs a point-to-point send/receive operation.
     Each rank sends its input buffer to the next rank and receives data from the
     previous rank into its output buffer.
 
diff --git a/python/mscclpp/default_algos/send_recv.py b/python/mscclpp/language/tests/multi_node/send_recv.py
similarity index 91%
rename from python/mscclpp/default_algos/send_recv.py
rename to python/mscclpp/language/tests/multi_node/send_recv.py
index 2127eb913..fd70b543f 100644
--- a/python/mscclpp/default_algos/send_recv.py
+++ b/python/mscclpp/language/tests/multi_node/send_recv.py
@@ -9,7 +9,7 @@
 from mscclpp.language.collectives import *
 
 
-def send_recv_test(name, nnodes, gpus_per_node, split_mask):
+def send_recv(name, nnodes, gpus_per_node, split_mask, instances):
     gpu_size = nnodes * gpus_per_node
     collective = SendRecv(gpu_size, 1, False)
     with CollectiveProgram(
@@ -21,7 +21,7 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
         use_double_scratch_buffer=False,
         min_message_size=0,
         max_message_size=2**64 - 1,
-        instances=4
+        instances=instances,
     ):
         # Creating separate port channels for next and prev directions.
         # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer
@@ -30,7 +30,7 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
         # "higher" rank so that tags cross-match:
         #   Lower rank:  [next(tag0), prev(tag1)]
         #   Higher rank:  [prev(tag0), next(tag1)]
-        # Then lower.prev(tag1) == higher.next(tag1) ✓ and higher.prev(tag0) == lower.next(tag0) ✓
+        # Then lower.prev(tag1) == higher.next(tag1) and higher.prev(tag0) == lower.next(tag0)
         # When prev != next (3+ nodes), each channel targets a different peer so each gets tag 0
         # and this ordering doesn't matter.
         group_size = split_mask + 1
@@ -82,9 +82,8 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask):
 parser.add_argument("--nnodes", type=int, default=1, help="number of nodes")
 parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node")
 parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x3, help="split mask (e.g. 0x3)")
+parser.add_argument("--instances", type=int, default=4, help="number of instances")
 
 args = parser.parse_args()
 
-send_recv_test(
-    args.name, args.nnodes, args.gpus_per_node, args.split_mask
-)
+send_recv(args.name, args.nnodes, args.gpus_per_node, args.split_mask, args.instances)
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 11a88f879..d4ff28749 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -308,7 +308,9 @@ def main(
     parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
     parser.add_argument("--n_iters", type=int, default=10)
     parser.add_argument("--n_graph_iters", type=int, default=10)
-    parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x0, help="split mask for sendrecv (e.g. 0x3)")
+    parser.add_argument(
+        "--split_mask", type=lambda x: int(x, 0), default=0x0, help="split mask for sendrecv (e.g. 0x3)"
+    )
     args = parser.parse_args()
 
     packet_type = PacketType.LL16
diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc
index 85c1c9907..9ef59bc1f 100644
--- a/src/core/executor/executor.cc
+++ b/src/core/executor/executor.cc
@@ -94,6 +94,7 @@ struct hash<mscclpp::DeviceExecutionPlanKey> {
 namespace {
 auto hasIBDevices = []() { return mscclpp::getIBDeviceCount() > 0; };
 
+// TODO(binyli): Need to add NVL domain check.
 auto useIB = [](int rank1, int rank2, int nranksPerNode) {
   bool inSameNode = rank1 / nranksPerNode == rank2 / nranksPerNode;
   return hasIBDevices() && !inSameNode;
@@ -108,7 +109,7 @@ namespace mscclpp {
 
 struct ExecutionContext {
   std::shared_ptr<ProxyService> proxyService;
-  std::vector<Connection> connections;  // one connection (unique QP) per channel
+  std::vector<Connection> connections;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections;
   MemoryId localMemoryIdBegin = MemoryId(0);
 
@@ -264,8 +265,7 @@ struct Executor::Impl {
     };
 
     // Create one connection (unique QP) per channel entry. Each channel gets its own
-    // QP — no shared connections. This is required for HostNoAtomic IB mode where each
-    // connection can only forward signals to one semaphore via setSignalForwardingDst.
+    // QP — no shared connections.
     // Use per-peer tag counters so that matched connections between pairs of ranks use
     // the same tag, regardless of the order peers appear in each rank's connected_to list.
     std::unordered_map<int, int> peerTagCounters;
@@ -275,14 +275,18 @@ struct Executor::Impl {
       std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(channelType);
       for (const auto& info : channelInfos) {
         for (int peer : info.connectedPeers) {
-          Transport transport = useIB(rank, peer, this->nranksPerNode) ? ibTransport : Transport::CudaIpc;
+          Transport transport = channelType == ChannelType::PORT && useIB(rank, peer, this->nranksPerNode)
+                                    ? ibTransport
+                                    : Transport::CudaIpc;
           connFutures.push_back(this->comm->connect(transport, peer, peerTagCounters[peer]++));
         }
       }
       channelInfos = plan.impl_->getUnpairedChannelInfos(nranks, channelType);
       for (const auto& info : channelInfos) {
         for (int peer : info.connectedPeers) {
-          Transport transport = useIB(rank, peer, this->nranksPerNode) ? ibTransport : Transport::CudaIpc;
+          Transport transport = channelType == ChannelType::PORT && useIB(rank, peer, this->nranksPerNode)
+                                    ? ibTransport
+                                    : Transport::CudaIpc;
           connFutures.push_back(this->comm->connect(transport, peer, peerTagCounters[peer]++));
         }
       }

From 3bd24e17b665ce2dd4bcfc164f9dc804608b8e81 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 22 May 2026 18:05:35 +0000
Subject: [PATCH 129/132] WIP

---
 python/mscclpp/language/rank.py       | 26 +++++++++++++++++---------
 python/test/executor_test.py          |  8 +++++---
 python/test/executor_test_verifier.cu |  1 +
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/python/mscclpp/language/rank.py b/python/mscclpp/language/rank.py
index 0c38cb064..3fd93dc75 100644
--- a/python/mscclpp/language/rank.py
+++ b/python/mscclpp/language/rank.py
@@ -304,16 +304,24 @@ def __init__(self, rank: int, buffer_type: BufferType, offset: int, size: int):
         self.size = offset + size
 
     def __getitem__(self, key):
-        if isinstance(key, slice):
-            start = key.start if key.start is not None else 0
-            stop = key.stop if key.stop is not None else (self.size - self.offset)
-            if self.offset + stop > self.size:
-                raise RuntimeError(
-                    f"Index range from {self.offset + start} - {self.offset + stop} is out of bounds for buffer {self.buffer_type}. Buffer size: {self.size}"
-                )
-            return Chunk(self.rank, self.buffer_type, self.offset + start, stop - start)
-        else:
+        if not isinstance(key, slice):
             raise TypeError(f"Buffer indices must be slices, not {type(key).__name__}")
+        if key.step is not None and key.step != 1:
+            raise ValueError(f"Buffer slicing does not support step != 1 (got step={key.step})")
+        buffer_size = self.size - self.offset
+        start = key.start if key.start is not None else 0
+        stop = key.stop if key.stop is not None else buffer_size
+        if start < 0 or stop < 0:
+            raise ValueError(
+                f"Buffer slicing does not support negative indices (got start={key.start}, stop={key.stop})"
+            )
+        if start > stop:
+            raise ValueError(f"Buffer slice start ({start}) must be <= stop ({stop})")
+        if self.offset + stop > self.size:
+            raise RuntimeError(
+                f"Index range from {self.offset + start} - {self.offset + stop} is out of bounds for buffer {self.buffer_type}. Buffer size: {self.size}"
+            )
+        return Chunk(self.rank, self.buffer_type, self.offset + start, stop - start)
 
 
 class Buffer(BaseBuffer):
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index d4ff28749..9600ed3f3 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -86,6 +86,8 @@ def bench_correctness(
         coll = "reduce_scatter"
     elif "allreduce" in collective:
         coll = "all_reduce"
+    elif "alltoall" in collective:
+        coll = "all_to_all"
     elif "sendrecv" in collective:
         coll = "send_recv"
     else:
@@ -249,7 +251,7 @@ def main(
                 result_buf[i].data.ptr,
                 input_buf[i].nbytes,
                 result_buf[i].nbytes,
-                dtype_to_mscclpp_dtype(dtype),
+                dtype_to_mscclpp_dtype(dtype_str),
                 execution_plan,
                 stream.ptr,
                 packet_type,
@@ -262,7 +264,7 @@ def main(
             result_buf.data.ptr,
             input_buf.nbytes,
             result_buf.nbytes,
-            dtype_to_mscclpp_dtype(dtype),
+            dtype_to_mscclpp_dtype(dtype_str),
             execution_plan,
             stream.ptr,
             packet_type,
@@ -291,7 +293,7 @@ def main(
     result_nbytes = result_buf[0].nbytes if sendrecv_mode else result_buf.nbytes
     print(
         f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
-        f"data size: {result_nbytes} bytes data type: {dtype().dtype.name} "
+        f"data size: {result_nbytes} bytes data type: {dtype_str} "
         f"bandwidth: {result_nbytes / (execution_time * 1e-6) / (1024**3):.2f} GB/s, "
         f"packet type: {packet_type}"
     )
diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu
index f784c9d37..96ab25c42 100644
--- a/python/test/executor_test_verifier.cu
+++ b/python/test/executor_test_verifier.cu
@@ -167,6 +167,7 @@ TEST_DATA_ALL_TO_ALL(int32, int)
     }                                                                                                               \
   }
 
+TEST_DATA_SEND_RECV(bfloat16, __nv_bfloat16)
 TEST_DATA_SEND_RECV(float16, __half)
 TEST_DATA_SEND_RECV(float32, float)
 TEST_DATA_SEND_RECV(int32, int)

From 142e7941dfa012cf240049e7fb42f536c417fddd Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 22 May 2026 18:11:26 +0000
Subject: [PATCH 130/132] WIP

---
 python/mscclpp/language/tests/multi_node/send_recv.py | 8 +++++++-
 python/test/executor_test.py                          | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/python/mscclpp/language/tests/multi_node/send_recv.py b/python/mscclpp/language/tests/multi_node/send_recv.py
index fd70b543f..a3e543e8c 100644
--- a/python/mscclpp/language/tests/multi_node/send_recv.py
+++ b/python/mscclpp/language/tests/multi_node/send_recv.py
@@ -11,6 +11,12 @@
 
 def send_recv(name, nnodes, gpus_per_node, split_mask, instances):
     gpu_size = nnodes * gpus_per_node
+    group_size = split_mask + 1
+    if split_mask < 0 or (split_mask & (split_mask + 1)) != 0 or gpu_size % group_size != 0:
+        raise ValueError(
+            f"split_mask must be of the form 2^k - 1 and gpu_size ({gpu_size}) must be divisible by "
+            f"group_size ({group_size}), got split_mask={hex(split_mask)}"
+        )
     collective = SendRecv(gpu_size, 1, False)
     with CollectiveProgram(
         name,
@@ -33,7 +39,7 @@ def send_recv(name, nnodes, gpus_per_node, split_mask, instances):
         # Then lower.prev(tag1) == higher.next(tag1) and higher.prev(tag0) == lower.next(tag0)
         # When prev != next (3+ nodes), each channel targets a different peer so each gets tag 0
         # and this ordering doesn't matter.
-        group_size = split_mask + 1
+        group_size = group_size
         num_groups = gpu_size // group_size
         next_channels = {}  # channel for sending to next rank
         prev_channels = {}  # channel for receiving from prev rank
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 9600ed3f3..96012eae3 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -221,6 +221,11 @@ def main(
     split_mask: int = 0,
 ):
     mscclpp_group = CommGroup(MPI.COMM_WORLD)
+    if split_mask < 0 or (split_mask & (split_mask + 1)) != 0 or mscclpp_group.nranks % (split_mask + 1) != 0:
+        raise ValueError(
+            f"split_mask must be of the form 2^k - 1 and nranks ({mscclpp_group.nranks}) must be divisible "
+            f"by group_size ({split_mask + 1}), got split_mask={hex(split_mask)}"
+        )
     cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
     executor = Executor(mscclpp_group.communicator)
     npkit_dump_dir = env().npkit_dump_dir

From fd27fa0ae74080a181aaa78425a7125dd9936b32 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 22 May 2026 18:27:37 +0000
Subject: [PATCH 131/132] Simplify executor_test: unify single/double-buffer
 paths via lists

build_bufs now always returns parallel lists of buffers (length 1 for
normal collectives, length 2 for sendrecv double-buffering), so
bench_time, bench_correctness, and main() no longer branch on
sendrecv_mode or double_buf. Iteration i uses funcs[i % len(funcs)]
uniformly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 python/test/executor_test.py | 98 ++++++++++++++----------------------
 1 file changed, 38 insertions(+), 60 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 96012eae3..0159b8fab 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -14,7 +14,7 @@
 from mscclpp.utils import KernelBuilder, pack
 import os
 import struct
-from typing import Callable, Union
+from typing import Callable
 
 import cupy as cp
 from mpi4py import MPI
@@ -35,16 +35,13 @@ def parse_dtype(dtype_str):
         raise ValueError(f"Unknown data type: {dtype_str}")
 
 
-def bench_time(n_iters: int, n_graph_iters: int, func: Union[Callable, list[Callable]]):
-    """Benchmark execution time. func can be a single callable or a list of 2 for double-buffer."""
+def bench_time(n_iters: int, n_graph_iters: int, funcs: list[Callable]):
+    """Benchmark execution time. `funcs` is a list of callables; iteration i runs funcs[i % len(funcs)]."""
     stream = cp.cuda.Stream(non_blocking=True)
     with stream:
         stream.begin_capture()
         for i in range(n_iters):
-            if isinstance(func, list):
-                func[i % 2](stream)
-            else:
-                func(stream)
+            funcs[i % len(funcs)](stream)
         graph = stream.end_capture()
 
     # now run a warm up round
@@ -65,19 +62,18 @@ def bench_time(n_iters: int, n_graph_iters: int, func: Union[Callable, list[Call
 
 def bench_correctness(
     collective: str,
-    input_buf: Union[cp.ndarray, list[cp.ndarray]],
-    result_buf: Union[cp.ndarray, list[cp.ndarray]],
-    test_buf: Union[cp.ndarray, list[cp.ndarray]],
+    input_bufs: list[cp.ndarray],
+    result_bufs: list[cp.ndarray],
+    test_bufs: list[cp.ndarray],
     dtype_str: str,
     rank: int,
     num_ranks: int,
     n_iters: int,
-    func: Union[Callable, list[Callable]],
+    funcs: list[Callable],
     split_mask: int = 0,
 ):
-    """Validate correctness. For sendrecv, buffers and func are lists of 2 for double-buffer."""
+    """Validate correctness. Buffers and funcs are parallel lists; iteration i uses index i % len(funcs)."""
     type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
-    double_buf = isinstance(input_buf, list)
 
     fill_data_kernel_name = "fill_data_%s" % dtype_str
     if "allgather" in collective:
@@ -108,23 +104,16 @@ def bench_correctness(
     with stream:
         stream.begin_capture()
         for i in range(n_iters):
-            if double_buf:
-                idx = i % 2
-                cur_input = input_buf[idx]
-                cur_result = result_buf[idx]
-                cur_test = test_buf[idx]
-                cur_func = func[idx]
-            else:
-                cur_input = input_buf
-                cur_result = result_buf
-                cur_test = test_buf
-                cur_func = func
+            idx = i % len(funcs)
+            cur_input = input_bufs[idx]
+            cur_result = result_bufs[idx]
+            cur_test = test_bufs[idx]
 
             fill_data_params = (
                 pack(cur_input) + struct.pack("Q", cur_input.nbytes // type_size) + pack(rank, i, split_mask)
             )
             fill_data_kernel.launch_kernel(fill_data_params, nblocks, nthreads, 0, stream)
-            cur_func(stream)
+            funcs[idx](stream)
             test_data_params = (
                 pack(cur_result, cur_test)
                 + struct.pack("Q", cur_input.nbytes // type_size)
@@ -170,15 +159,18 @@ def build_bufs(
     rank: int,
     num_ranks: int,
 ):
+    """Allocate input/result/test buffers. Returns parallel lists (length 2 for sendrecv double-buffering,
+    length 1 otherwise) so callers can iterate uniformly."""
     type_size = cp.dtype(dtype).itemsize
     assert (size % type_size) == 0, "size %d not multiple of type size %d" % (size, type_size)
     nelems = size // type_size
 
-    # Sendrecv uses double buffering: return lists of 2 buffers
+    # Sendrecv uses double buffering: build two parallel buffer slots.
     if "sendrecv" in collective:
-        input_bufs = [GpuBuffer(nelems, dtype=dtype) for _ in range(2)]
-        result_bufs = [GpuBuffer(nelems, dtype=dtype) for _ in range(2)]
-        test_bufs = [cp.zeros(nelems, dtype=dtype) for _ in range(2)]
+        n_slots = 2
+        input_bufs = [GpuBuffer(nelems, dtype=dtype) for _ in range(n_slots)]
+        result_bufs = [GpuBuffer(nelems, dtype=dtype) for _ in range(n_slots)]
+        test_bufs = [cp.zeros(nelems, dtype=dtype) for _ in range(n_slots)]
         return input_bufs, result_bufs, test_bufs, nelems
 
     if "allgather" in collective:
@@ -207,7 +199,7 @@ def build_bufs(
 
     test_buf = cp.zeros(nelems, dtype=dtype)
 
-    return input_buf, result_buf, test_buf, nelems
+    return [input_buf], [result_buf], [test_buf], nelems
 
 
 def main(
@@ -235,7 +227,7 @@ def main(
     collective = execution_plan.collective
 
     dtype = parse_dtype(dtype_str)
-    input_buf, result_buf, test_buf, nelem = build_bufs(
+    input_bufs, result_bufs, test_bufs, nelem = build_bufs(
         collective,
         size,
         in_place,
@@ -244,58 +236,44 @@ def main(
         mscclpp_group.nranks,
     )
 
-    sendrecv_mode = "sendrecv" in collective
-
-    if sendrecv_mode:
-        # Double-buffer: create two executor funcs, one per buffer pair
-        executor_funcs = []
-        for idx in range(2):
-            func = lambda stream, i=idx: executor.execute(
+    executor_funcs = [
+        (
+            lambda stream, inp=inp, res=res: executor.execute(
                 mscclpp_group.my_rank,
-                input_buf[i].data.ptr,
-                result_buf[i].data.ptr,
-                input_buf[i].nbytes,
-                result_buf[i].nbytes,
+                inp.data.ptr,
+                res.data.ptr,
+                inp.nbytes,
+                res.nbytes,
                 dtype_to_mscclpp_dtype(dtype_str),
                 execution_plan,
                 stream.ptr,
                 packet_type,
             )
-            executor_funcs.append(func)
-    else:
-        executor_func = lambda stream: executor.execute(
-            mscclpp_group.my_rank,
-            input_buf.data.ptr,
-            result_buf.data.ptr,
-            input_buf.nbytes,
-            result_buf.nbytes,
-            dtype_to_mscclpp_dtype(dtype_str),
-            execution_plan,
-            stream.ptr,
-            packet_type,
         )
+        for inp, res in zip(input_bufs, result_bufs)
+    ]
 
     mscclpp_group.barrier()
     bench_correctness(
         collective,
-        input_buf,
-        result_buf,
-        test_buf,
+        input_bufs,
+        result_bufs,
+        test_bufs,
         dtype_str,
         mscclpp_group.my_rank,
         mscclpp_group.nranks,
         n_iters,
-        executor_funcs if sendrecv_mode else executor_func,
+        executor_funcs,
         split_mask=split_mask,
     )
 
     mscclpp_group.barrier()
-    execution_time = bench_time(n_iters, n_graph_iters, executor_funcs if sendrecv_mode else executor_func)
+    execution_time = bench_time(n_iters, n_graph_iters, executor_funcs)
     if npkit_dump_dir is not None:
         npkit.dump(npkit_dump_dir)
         npkit.shutdown()
 
-    result_nbytes = result_buf[0].nbytes if sendrecv_mode else result_buf.nbytes
+    result_nbytes = result_bufs[0].nbytes
     print(
         f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
         f"data size: {result_nbytes} bytes data type: {dtype_str} "

From bde8d454a6b5fc808b63be1a0194407756aae523 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 27 May 2026 04:06:04 +0000
Subject: [PATCH 132/132] WIP

---
 python/mscclpp/language/tests/multi_node/send_recv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/mscclpp/language/tests/multi_node/send_recv.py b/python/mscclpp/language/tests/multi_node/send_recv.py
index a3e543e8c..0e898f952 100644
--- a/python/mscclpp/language/tests/multi_node/send_recv.py
+++ b/python/mscclpp/language/tests/multi_node/send_recv.py
@@ -87,7 +87,7 @@ def send_recv(name, nnodes, gpus_per_node, split_mask, instances):
 parser.add_argument("--name", type=str, help="name of the program")
 parser.add_argument("--nnodes", type=int, default=1, help="number of nodes")
 parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node")
-parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x3, help="split mask (e.g. 0x3)")
+parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x0, help="split mask (e.g. 0x3)")
 parser.add_argument("--instances", type=int, default=4, help="number of instances")
 
 args = parser.parse_args()