NVIDIA
diff --git a/‎CHANGELOG.md‎
Lines changed: 22 additions & 1 deletion b/‎CHANGELOG.md‎
Lines changed: 22 additions & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 13 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 264 additions & 15 deletions b/‎README.md‎
Lines changed: 264 additions & 15 deletions
diff --git a/‎cli/nvloom_cli.cpp‎
Lines changed: 17 additions & 5 deletions b/‎cli/nvloom_cli.cpp‎
Lines changed: 17 additions & 5 deletions
@@ -1,7 +1,28 @@
 # Changelog
 
+## [1.1.0] - 2025-05-22
+
+### Added
+
+- Heatmap plotter
+- Support for CUDA Error Log Management
+- Retry mechanism for CUDA multicast allocations
+- Nvloom_cli argument to set the number of samples in gpu-to-rack testcases
+- Nvloom_cli now prints its version, the git commit it was built from, and the specified buffer size
+- Nvloom_cli now prints units when reporting results
+- Native compilation for sm_103 on CUDA 12.9 toolkits
+
+### Changed
+
+- Expanded README.md
+- Rack-to-rack testcases now come in both unidirectional and bidirectional variants, and the bidirectional rack-to-rack testcases are symmetry-optimized.
+
+### Fixed
+
+- Bug where requesting allocations over 4 GiB would fail with CUDA_OUT_OF_MEMORY
+
 ## [1.0.0] - 2025-03-17
 
 ### Added
 
-Initial release
+- Initial release
@@ -42,13 +42,24 @@ set(src
     library/kernels.cu
 )
 
+execute_process(
+    COMMAND git describe --always --tags
+    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+    OUTPUT_VARIABLE GIT_COMMIT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
 add_executable(nvloom_cli ${src})
 target_include_directories(nvloom_cli PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CMAKE_CURRENT_SOURCE_DIR}/library .)
 target_link_libraries(nvloom_cli cuda nvidia-ml Boost::program_options)
 
-set_target_properties(nvloom_cli PROPERTIES CUDA_ARCHITECTURES "86;90;100")
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
+    set_target_properties(nvloom_cli PROPERTIES CUDA_ARCHITECTURES "86;90;100;103")
+else()
+    set_target_properties(nvloom_cli PROPERTIES CUDA_ARCHITECTURES "86;90;100")
+endif()
 
 find_package(MPI REQUIRED)
 include_directories(SYSTEM ${MPI_INCLUDE_PATH})
 target_link_libraries(nvloom_cli MPI::MPI_CXX)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMULTINODE")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGIT_COMMIT=\\\"\"${GIT_COMMIT}\"\\\"")
@@ -24,7 +24,13 @@
 #include <iostream>
 #include <memory>
 
+#define NVLOOM_VERSION "1.1.0"
+#ifndef GIT_COMMIT
+#define GIT_COMMIT "unknown"
+#endif
+
 bool richOutput = false;
+int gpuToRackSamples = 5;
 
 int run_program(int argc, char **argv) {
     boost::program_options::options_description opts("nvloom CLI");
@@ -43,7 +49,8 @@ int run_program(int argc, char **argv) {
         ("suite,s", boost::program_options::value<std::vector<std::string>>(&suitesToRun)->multitoken(), suitesOptionDescription.c_str())
         ("listTestcases,l", boost::program_options::bool_switch(&listTestcases)->default_value(listTestcases), "List testcases")
         ("richOutput,r", boost::program_options::bool_switch(&richOutput)->default_value(richOutput), "Rich output")
-        ("allocatorStrategy,a", boost::program_options::value<std::string>(&allocatorStrategyString)->default_value("reuse"), "Allocator strategy: choose between unique and reuse");
+        ("allocatorStrategy,a", boost::program_options::value<std::string>(&allocatorStrategyString)->default_value("reuse"), "Allocator strategy: choose between unique and reuse")
+        ("gpuToRackSamples", boost::program_options::value<int>(&gpuToRackSamples)->default_value(gpuToRackSamples), "Number of per-rack samples to use in gpu_to_rack testcases")
         ;
 
     boost::program_options::variables_map vm;
@@ -62,6 +69,9 @@ int run_program(int argc, char **argv) {
         return 0;
     }
 
+    OUTPUT << "nvloom_cli " << NVLOOM_VERSION << std::endl;
+    OUTPUT << "git commit: " << GIT_COMMIT << std::endl;
+
     AllocatorStrategy allocatorStrategy;
     if (allocatorStrategyString == "reuse"){
         allocatorStrategy = ALLOCATOR_STRATEGY_REUSE;
@@ -77,11 +87,13 @@ int run_program(int argc, char **argv) {
 
     size_t bufferSizeInB = (size_t) bufferSizeInMiB * 1024 * 1024;
 
+    OUTPUT << "Buffer size: " << bufferSizeInMiB << " MiB" << std::endl;
+
     auto [testcases, suites] = buildTestcases(allocatorStrategy);
 
     if (listTestcases) {
         OUTPUT << "Available testcases: " << std::endl;
-        for (auto const& [testcaseName, testcaseFunction] : testcases) { 
+        for (auto const& [testcaseName, testcaseFunction] : testcases) {
             OUTPUT << testcaseName << std::endl;
         }
         return 0;
@@ -96,13 +108,13 @@ int run_program(int argc, char **argv) {
     for (auto suite: suitesToRun) {
         if (suites.count(suite) == 0) {
             std::cerr << "No such suite as \"" << suite << "\"\n";
-            return 1; 
+            return 1;
         }
         for (auto testcase: suites[suite]) {
             testcasesToRunSet.insert(testcase);
         }
     }
-    
+
     for (auto testcase: testcasesToRun) {
         if (testcases.count(testcase) == 0) {
             std::cerr << "No such testcase as \"" << testcase << "\"\n";
@@ -123,7 +135,7 @@ int run_program(int argc, char **argv) {
         testcases[testcase]->filterRun(bufferSizeInB);
         clearAllocationPools();
         auto endTime = std::chrono::high_resolution_clock::now();
-        OUTPUT << "ExecutionTime[s] " << testcase << " " << std::chrono::duration<double>(endTime - startTime).count() << std::endl;
+        OUTPUT << "ExecutionTime " << testcase << " " << std::chrono::duration<double>(endTime - startTime).count() << " s" << std::endl;
         OUTPUT << "Done " << testcase << std::endl;
         OUTPUT << std::endl;
     }