Commit bc09838

1.2 release

Parent: 4052bf9

20 files changed: +761 / -179 lines

CHANGELOG.md

Lines changed: 12 additions & 0 deletions

@@ -1,5 +1,17 @@
 # Changelog
 
+## [1.2.0] - 2025-07-21
+
+### Added
+- Multicast reductions benchmarks
+- Option to specify iteration count (-i/--iterations)
+- Option to repeat a testcase for a specified number of iterations (-c/--repeat)
+- Option to repeat a testcase for a specified number of seconds (-d/--duration)
+- CUDA Stream Ordered Memory Allocator was added as a new allocator option (-a cudapool)
+
+### Changed
+- Caching multicast allocations is now much faster, thanks to a multicast-specific memory pool
+
 ## [1.1.0] - 2025-05-22
 
 ### Added

CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 #[[
-SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: Apache-2.0
 
 Licensed under the Apache License, Version 2.0 (the "License");

README.md

Lines changed: 43 additions & 0 deletions

@@ -174,6 +174,28 @@ The default is 5 samples per rack.
 
 When `--richOutput` is enabled, all sample measurements will be shown. Otherwise, only the median value across the samples is reported.
 
+## --iterations
+The `--iterations` option controls how many copy operations are performed within each measurement, not including the initial warmup iteration.
+
+The default is 16.
+
+Controlled with the `-i` or `--iterations` option.
+
+## --repeat
+The `--repeat` option controls how many times each testcase is executed in a single run. By default, each testcase is run once. You can use this option to repeat the same testcase multiple times.
+
+Controlled with the `-c` or `--repeat` option.
+
+For example, `-c 10` will run each selected testcase 10 times in a row.
+
+## --duration
+
+The `--duration` option allows you to specify how long (in seconds) each testcase should be repeated. The testcase will be executed repeatedly until the specified duration has elapsed.
+
+Controlled with the `-d` or `--duration` option.
+
+**Note:** You cannot specify both `--duration` and `--repeat` at the same time; only one of these options can be used per run. If neither is specified, each testcase will run once (the default).
+
 # Heatmap plotter
 
 `plot_heatmaps.py` included in the `nvloom_cli` directory produces heatmaps for each testcase of a given `nvloom_cli` output.
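The three options documented in this hunk combine with the existing suite selection. A hedged usage sketch (suite names taken from the CLI's own suite list; how the binary is launched is unchanged from the rest of the README): passing `-s pairwise -i 32 -c 10` to `nvloom_cli` runs every pairwise testcase 10 times in a row with 32 copy iterations per measurement, while `-s multicast -d 60` re-runs each multicast testcase until 60 seconds have elapsed. Remember that `-c` and `-d` are mutually exclusive.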
@@ -351,3 +373,24 @@ Bandwidth of the "continuous arrow" copy is reported, however it causes `NUM_GPU
 Multicast_all_to_all measures bandwidth of every single GPU broadcasting to every single GPU at the same time. In essence, it's `NUM_GPU` of `multicast_one_to_all` running simultaneously.
 
 Sum of all "continuous arrow" bandwidth is reported.
+
+### Multicast_one_to_all_red: multimem.red
+
+Each measurement in multicast_one_to_all_red performs an addition of data from a regular "device" buffer on the source GPU to a multicast allocation that's allocated on all GPUs in the job. The multimem.red PTX instruction is used for this reduction. For more information, see [Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-multimem).
+
+![Diagram of multicast multimem.red traffic pattern](docs/multicast_red.png)
+
+### Multicast_all_to_all_red: multimem.red
+
+Multicast_all_to_all_red measures bandwidth of every single GPU adding data (reducing) to every single GPU at the same time. In essence, it's `NUM_GPU` of `multicast_one_to_all_red` running simultaneously.
+
+### Multicast_all_to_one_red: multimem.ld_reduce
+
+Each measurement in multicast_all_to_one_red performs a sum of data residing on all GPUs and saves the result to local memory. The multimem.ld_reduce PTX instruction is used for this reduction. For more information, see [Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-multimem).
+
+![Diagram of multicast multimem.ld_reduce traffic pattern](docs/multicast_ld_reduce.png)
+
+### Multicast_all_to_all_ld_reduce: multimem.ld_reduce
+
+Multicast_all_to_all_ld_reduce measures bandwidth of every single GPU reducing data from every single GPU at the same time. In essence, it's `NUM_GPU` of `multicast_all_to_one_red` running simultaneously.
+

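The reduction testcases above are defined by the two PTX instructions named in their headings. The fragment below is not nvloom code; it is a minimal sketch, assuming a float buffer already mapped through the CUDA multicast (cuMulticast*) driver APIs (the `mc_ptr` parameter is hypothetical) and an sm_90-class GPU, of how multimem.red and multimem.ld_reduce are typically emitted from CUDA C++ inline PTX:

```cuda
#include <cstddef>

// Sketch only: mc_ptr must point into a multicast mapping created with the
// CUDA multicast driver APIs; a plain device pointer will not work.

// multimem.red: add src[i] into the multicast address, so the addition lands
// on the corresponding element on every GPU backing the mapping
// (the multicast_one_to_all_red pattern).
__global__ void multicastRedAdd(float *mc_ptr, const float *src, size_t n) {
    size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (i < n) {
        asm volatile("multimem.red.relaxed.sys.global.add.f32 [%0], %1;"
                     :: "l"(mc_ptr + i), "f"(src[i]) : "memory");
    }
}

// multimem.ld_reduce: read the element from every GPU backing the mapping,
// sum the values, and store the result into local memory
// (the multicast_all_to_one_red pattern).
__global__ void multicastLdReduceAdd(float *dst, const float *mc_ptr, size_t n) {
    size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (i < n) {
        float v;
        asm volatile("multimem.ld_reduce.relaxed.sys.global.add.f32 %0, [%1];"
                     : "=f"(v) : "l"(mc_ptr + i) : "memory");
        dst[i] = v;
    }
}
```

nvloom's actual kernels, element types, and memory-ordering qualifiers may differ; the sketch only captures the data flow the diagrams describe: multimem.red pushes one GPU's data out to all replicas, and multimem.ld_reduce pulls all replicas in and reduces them locally.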
cli/nvloom_cli.cpp

Lines changed: 66 additions & 11 deletions

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,13 +24,32 @@
 #include <iostream>
 #include <memory>
 
-#define NVLOOM_VERSION "1.1.0"
+#define NVLOOM_VERSION "1.2.0"
 #ifndef GIT_COMMIT
 #define GIT_COMMIT "unknown"
 #endif
 
 bool richOutput = false;
 int gpuToRackSamples = 5;
+int iterations = NvLoom::getDefaultIterationCount();
+
+bool shouldContinue(boost::program_options::variables_map &vm, int iteration, std::chrono::time_point<std::chrono::high_resolution_clock> startTime) {
+    if (vm["repeat"].defaulted() && vm["duration"].defaulted()) {
+        return false;
+    }
+
+    if (!vm["repeat"].defaulted()) {
+        return iteration + 1 < vm["repeat"].as<int>();
+    }
+
+    if (!vm["duration"].defaulted()) {
+        auto duration = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - startTime).count();
+        return duration < vm["duration"].as<int>();
+    }
+
+    ASSERT(0);
+    return false;
+}
 
 int run_program(int argc, char **argv) {
     boost::program_options::options_description opts("nvloom CLI");
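In the shouldContinue helper added here: with both `--repeat` and `--duration` left at their defaults it returns false, so each testcase runs exactly once; with `--repeat N` it keeps returning true while `iteration + 1 < N`, giving N runs in total; with `--duration S` it keeps returning true until S seconds of wall-clock time have elapsed since the loop started. The trailing ASSERT(0) is a defensive guard; the three branches already cover every combination of the two options.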
@@ -40,6 +59,8 @@ int run_program(int argc, char **argv) {
 
     bool listTestcases = false;
     int bufferSizeInMiB = 512;
+    int repeat = 1;
+    int duration = -1;
 
     std::string suitesOptionDescription("Suite(s) to run (by name): all-to-one, egm, fabric-stress, gpu-to-rack, multicast, pairwise, rack-to-rack");
     opts.add_options()
@@ -49,8 +70,11 @@ int run_program(int argc, char **argv) {
         ("suite,s", boost::program_options::value<std::vector<std::string>>(&suitesToRun)->multitoken(), suitesOptionDescription.c_str())
         ("listTestcases,l", boost::program_options::bool_switch(&listTestcases)->default_value(listTestcases), "List testcases")
         ("richOutput,r", boost::program_options::bool_switch(&richOutput)->default_value(richOutput), "Rich output")
-        ("allocatorStrategy,a", boost::program_options::value<std::string>(&allocatorStrategyString)->default_value("reuse"), "Allocator strategy: choose between unique and reuse")
+        ("allocatorStrategy,a", boost::program_options::value<std::string>(&allocatorStrategyString)->default_value("reuse"), "Allocator strategy: choose between unique, reuse and cudapool")
         ("gpuToRackSamples", boost::program_options::value<int>(&gpuToRackSamples)->default_value(gpuToRackSamples), "Number of per-rack samples to use in gpu_to_rack testcases")
+        ("iterations,i", boost::program_options::value<int>(&iterations)->default_value(iterations), "Number of copy iterations within the testcase to run, not including the warmup iteration")
+        ("repeat,c", boost::program_options::value<int>(&repeat)->default_value(repeat), "Number of times to repeat each testcase")
+        ("duration,d", boost::program_options::value<int>(&duration)->default_value(duration), "Duration of each testcase in seconds")
     ;
 
     boost::program_options::variables_map vm;
@@ -69,6 +93,11 @@
         return 0;
     }
 
+    if (!vm["repeat"].defaulted() && !vm["duration"].defaulted()) {
+        std::cerr << "Cannot specify both repeat and duration\n";
+        return 1;
+    }
+
     OUTPUT << "nvloom_cli " << NVLOOM_VERSION << std::endl;
     OUTPUT << "git commit: " << GIT_COMMIT << std::endl;
 
@@ -77,6 +106,8 @@ int run_program(int argc, char **argv) {
         allocatorStrategy = ALLOCATOR_STRATEGY_REUSE;
     } else if (allocatorStrategyString == "unique") {
         allocatorStrategy = ALLOCATOR_STRATEGY_UNIQUE;
+    } else if (allocatorStrategyString == "cudapool") {
+        allocatorStrategy = ALLOCATOR_STRATEGY_CUDA_POOLS;
     } else {
         std::cerr << "Unknown value for the allocatorStrategy argument: " << allocatorStrategyString << "\n";
         OUTPUT << opts << "\n";
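The new cudapool branch only selects `ALLOCATOR_STRATEGY_CUDA_POOLS`; the allocator itself lives in the nvloom library and is not part of this diff. For reference, a minimal sketch of the CUDA Stream Ordered Memory Allocator API that the `-a cudapool` name refers to (device index, release-threshold choice, and buffer size are illustrative, not nvloom's internals; error checking omitted):

```cuda
#include <cuda_runtime.h>
#include <cstdint>

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Keep freed memory cached in the device's default memory pool instead of
    // returning it to the OS, so later cudaMallocAsync calls can reuse it.
    cudaMemPool_t pool;
    cudaDeviceGetDefaultMemPool(&pool, /*device=*/0);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

    // Allocation and free are stream-ordered: they execute in order with the
    // other work queued on `stream`, without a device-wide synchronization.
    void *buf = nullptr;
    size_t bytes = 512ull << 20;  // 512 MiB, matching the CLI's default buffer size
    cudaMallocAsync(&buf, bytes, stream);
    cudaMemsetAsync(buf, 0, bytes, stream);
    cudaFreeAsync(buf, stream);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    return 0;
}
```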
@@ -89,6 +120,8 @@ int run_program(int argc, char **argv) {
 
     OUTPUT << "Buffer size: " << bufferSizeInMiB << " MiB" << std::endl;
 
+    OUTPUT << "Iteration count: " << iterations << std::endl;
+
     auto [testcases, suites] = buildTestcases(allocatorStrategy);
 
     if (listTestcases) {
@@ -130,14 +163,36 @@ int run_program(int argc, char **argv) {
     }
 
     for (auto testcase : testcasesToRunSet) {
-        OUTPUT << "Running " << testcase << std::endl;
-        auto startTime = std::chrono::high_resolution_clock::now();
-        testcases[testcase]->filterRun(bufferSizeInB);
-        clearAllocationPools();
-        auto endTime = std::chrono::high_resolution_clock::now();
-        OUTPUT << "ExecutionTime " << testcase << " " << std::chrono::duration<double>(endTime - startTime).count() << " s" << std::endl;
-        OUTPUT << "Done " << testcase << std::endl;
-        OUTPUT << std::endl;
+        int iterationCount = 0;
+        auto loopStartTime = std::chrono::high_resolution_clock::now();
+        while (true) {
+            std::string testcaseName = testcase;
+            if (!vm["repeat"].defaulted() || !vm["duration"].defaulted()) {
+                testcaseName += "_iter_" + std::to_string(iterationCount);
+            }
+
+            OUTPUT << "Running " << testcaseName << std::endl;
+            auto startTime = std::chrono::high_resolution_clock::now();
+            testcases[testcase]->filterRun(bufferSizeInB);
+            auto endTime = std::chrono::high_resolution_clock::now();
+
+            bool shouldContinueIteration = shouldContinue(vm, iterationCount, loopStartTime);
+            if (!shouldContinueIteration) {
+                // We're only clearing the pools on last iteration of the loop
+                // But we still want to include the time it took to clear the pools in the output
+                clearAllocationPools();
+            }
+
+            OUTPUT << "ExecutionTime " << testcaseName << " " << std::chrono::duration<double>(endTime - startTime).count() << " s" << std::endl;
+            OUTPUT << "Done " << testcaseName << std::endl;
+            OUTPUT << std::endl;
+
+            if (!shouldContinueIteration) {
+                break;
+            }
+
+            iterationCount++;
+        }
     }
 
     return 0;
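A note for log consumers: when `--repeat` or `--duration` is active, every run of a testcase is reported under a per-iteration name of the form `<testcase>_iter_<N>` (starting at `_iter_0`), while a default single run keeps the bare testcase name; in either case the allocation pools are cleared only once, on the final iteration.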
