microsoft
diff --git a/‎include/onnxruntime/core/common/profiler_common.h‎
Lines changed: 4 additions & 4 deletions b/‎include/onnxruntime/core/common/profiler_common.h‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎include/onnxruntime/core/framework/run_options.h‎
Lines changed: 8 additions & 0 deletions b/‎include/onnxruntime/core/framework/run_options.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎include/onnxruntime/core/session/onnxruntime_c_api.h‎
Lines changed: 22 additions & 0 deletions b/‎include/onnxruntime/core/session/onnxruntime_c_api.h‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎include/onnxruntime/core/session/onnxruntime_cxx_api.h‎
Lines changed: 13 additions & 0 deletions b/‎include/onnxruntime/core/session/onnxruntime_cxx_api.h‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎include/onnxruntime/core/session/onnxruntime_cxx_inline.h‎
Lines changed: 10 additions & 0 deletions b/‎include/onnxruntime/core/session/onnxruntime_cxx_inline.h‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎onnxruntime/core/common/profiler.cc‎
Lines changed: 7 additions & 6 deletions b/‎onnxruntime/core/common/profiler.cc‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎onnxruntime/core/common/profiler.h‎
Lines changed: 1 addition & 1 deletion b/‎onnxruntime/core/common/profiler.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎onnxruntime/core/framework/run_options.cc‎
Lines changed: 13 additions & 0 deletions b/‎onnxruntime/core/framework/run_options.cc‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎onnxruntime/core/framework/sequential_executor.cc‎
Lines changed: 76 additions & 36 deletions b/‎onnxruntime/core/framework/sequential_executor.cc‎
Lines changed: 76 additions & 36 deletions
diff --git a/‎onnxruntime/core/framework/sequential_executor.h‎
Lines changed: 2 additions & 1 deletion b/‎onnxruntime/core/framework/sequential_executor.h‎
Lines changed: 2 additions & 1 deletion
@@ -4,9 +4,9 @@
 #pragma once
 
 #include "core/common/common.h"
+#include "core/common/inlined_containers.h"
 
 #include <string>
-#include <unordered_map>
 
 namespace onnxruntime {
 namespace profiling {
@@ -35,7 +35,7 @@ struct EventRecord {
               std::string&& event_name,
               long long time_stamp,
               long long duration,
-              std::unordered_map<std::string, std::string>&& event_args)
+              InlinedHashMap<std::string, std::string>&& event_args)
       : cat(category),
         pid(process_id),
         tid(thread_id),
@@ -50,7 +50,7 @@ struct EventRecord {
               const std::string& event_name,
               long long time_stamp,
               long long duration,
-              const std::unordered_map<std::string, std::string>& event_args)
+              const InlinedHashMap<std::string, std::string>& event_args)
       : cat(category),
         pid(process_id),
         tid(thread_id),
@@ -70,7 +70,7 @@ struct EventRecord {
   std::string name{};
   long long ts = 0;
   long long dur = 0;
-  std::unordered_map<std::string, std::string> args{};
+  InlinedHashMap<std::string, std::string> args{};
 };
 
 using Events = std::vector<EventRecord>;
 
@@ -35,6 +35,14 @@ struct OrtRunOptions {
   // So it is possible that only some of the nodes are executed.
   bool only_execute_path_to_fetches = false;
 
+  // Set to 'true' to enable profiling for this run.
+  bool enable_profiling = false;
+
+  // File prefix for profiling result for this run.
+  // The actual filename will be: <profile_file_prefix>_<timestamp>.json
+  // Only used when enable_profiling is true.
+  std::basic_string<ORTCHAR_T> profile_file_prefix = ORT_TSTR("onnxruntime_run_profile");
+
 #ifdef ENABLE_TRAINING
   // Used by onnxruntime::training::TrainingSession. This class is now deprecated.
   // Delete training_mode when TrainingSession is deleted.
 
@@ -7195,6 +7195,28 @@ struct OrtApi {
    * \since 1.24
    */
   ORT_API_T(void, RunOptionsSetSyncStream, _Inout_ OrtRunOptions* options, _In_ OrtSyncStream* sync_stream);
+
+  /** \brief Enable profiling for this run
+   *
+   * \param[in] options
+   * \param[in] profile_file_prefix The prefix for the profile file. The actual filename will be:
+   *                                <profile_file_prefix>_<timestamp>.json
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.25.
+   */
+  ORT_API2_STATUS(RunOptionsEnableProfiling, _Inout_ OrtRunOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
+
+  /** \brief Disable profiling for this run
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.25.
+   */
+  ORT_API2_STATUS(RunOptionsDisableProfiling, _Inout_ OrtRunOptions* options);
 };
 
 /*
 
@@ -1382,6 +1382,19 @@ struct RunOptions : detail::Base<OrtRunOptions> {
    * \param stream The OrtSyncStream to associate with these run options. May be nullptr to clear.
    */
   RunOptions& SetSyncStream(OrtSyncStream* stream);
+
+  /** \brief Enable profiling for this run
+   *
+   * Wraps OrtApi::RunOptionsEnableProfiling
+   * \param profile_file_prefix The prefix for the profile file
+   */
+  RunOptions& EnableProfiling(const ORTCHAR_T* profile_file_prefix);
+
+  /** \brief Disable profiling for this run
+   *
+   * Wraps OrtApi::RunOptionsDisableProfiling
+   */
+  RunOptions& DisableProfiling();
 };
 
 namespace detail {
 
@@ -1077,6 +1077,16 @@ inline RunOptions& RunOptions::SetSyncStream(OrtSyncStream* stream) {
   return *this;
 }
 
+inline RunOptions& RunOptions::EnableProfiling(const ORTCHAR_T* profile_file_prefix) {
+  ThrowOnError(GetApi().RunOptionsEnableProfiling(p_, profile_file_prefix));
+  return *this;
+}
+
+inline RunOptions& RunOptions::DisableProfiling() {
+  ThrowOnError(GetApi().RunOptionsDisableProfiling(p_));
+  return *this;
+}
+
 inline ModelCompilationOptions::ModelCompilationOptions(const Env& env, const SessionOptions& session_options) {
   ThrowOnError(GetCompileApi().CreateModelCompilationOptionsFromSessionOptions(env, session_options, &this->p_));
 }
 
@@ -71,16 +71,17 @@ template void Profiler::StartProfiling<char>(const std::basic_string<char>& file
 template void Profiler::StartProfiling<wchar_t>(const std::basic_string<wchar_t>& file_name);
 #endif
 
-void Profiler::EndTimeAndRecordEvent(EventCategory category,
-                                     const std::string& event_name,
-                                     const TimePoint& start_time,
-                                     const std::initializer_list<std::pair<std::string, std::string>>& event_args,
-                                     bool /*sync_gpu*/) {
+void Profiler::EndTimeAndRecordEvent(
+    EventCategory category,
+    const std::string& event_name,
+    const TimePoint& start_time,
+    InlinedHashMap<std::string, std::string> event_args,
+    bool /*sync_gpu*/) {
   long long dur = TimeDiffMicroSeconds(start_time);
   long long ts = TimeDiffMicroSeconds(profiling_start_time_, start_time);
 
   EventRecord event(category, logging::GetProcessId(),
-                    logging::GetThreadId(), event_name, ts, dur, {event_args.begin(), event_args.end()});
+                    logging::GetThreadId(), event_name, ts, dur, std::move(event_args));
   if (profile_with_logger_) {
     custom_logger_->SendProfileEvent(event);
   } else {
 
@@ -77,7 +77,7 @@ class Profiler {
   void EndTimeAndRecordEvent(EventCategory category,
                              const std::string& event_name,
                              const TimePoint& start_time,
-                             const std::initializer_list<std::pair<std::string, std::string>>& event_args = {},
+                             InlinedHashMap<std::string, std::string> event_args = {},
                              bool sync_gpu = false);
 
   /*
 
@@ -84,3 +84,16 @@ ORT_API_STATUS_IMPL(OrtApis::RunOptionsAddActiveLoraAdapter, _Inout_ OrtRunOptio
   return nullptr;
   API_IMPL_END
 }
+
+ORT_API_STATUS_IMPL(OrtApis::RunOptionsEnableProfiling, _Inout_ OrtRunOptions* options,
+                    _In_ const ORTCHAR_T* profile_file_prefix) {
+  options->enable_profiling = true;
+  options->profile_file_prefix = profile_file_prefix;
+  return nullptr;
+}
+
+ORT_API_STATUS_IMPL(OrtApis::RunOptionsDisableProfiling, _Inout_ OrtRunOptions* options) {
+  options->enable_profiling = false;
+  options->profile_file_prefix.clear();
+  return nullptr;
+}
@@ -8,6 +8,7 @@
 #include <vector>
 #include <sstream>
 #include "core/common/common.h"
+#include "core/common/inlined_containers.h"
 #include "core/common/logging/logging.h"
 #include "core/framework/allocation_planner.h"
 #include "core/framework/execution_frame.h"
@@ -155,8 +156,8 @@ std::string ComposeSeriesName(const GraphViewer& graph_viewer) {
 class SessionScope {
  public:
   friend class KernelScope;
-  SessionScope(const SessionState& session_state, const ExecutionFrame& frame)
-      : session_state_(session_state)
+  SessionScope(const SessionState& session_state, const ExecutionFrame& frame, profiling::Profiler* run_profiler)
+      : session_state_(session_state), run_profiler_(run_profiler)
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
         ,
         frame_(frame)
@@ -173,13 +174,10 @@ class SessionScope {
 #endif
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
         ,
-        dump_context_{
-            session_state_.GetGraphExecutionCounter(), 0}
+        dump_context_{session_state_.GetGraphExecutionCounter(), 0}
 #endif
   {
-    if (session_state_.Profiler().IsEnabled()) {
-      session_start_ = session_state.Profiler().Start();
-    }
+    session_start_ = StartProfilingIfEnabled();
 
     auto& logger = session_state_.Logger();
     VLOGS(logger, 0) << "Begin execution";
@@ -225,9 +223,8 @@ class SessionScope {
     }
 #endif
 
-    if (session_state_.Profiler().IsEnabled()) {
-      session_state_.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "SequentialExecutor::Execute", session_start_);
-    }
+    StopProfilingIfEnabled(profiling::SESSION_EVENT, "SequentialExecutor::Execute", session_start_);
+
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
     auto& logger = session_state_.Logger();
     for (auto i : frame_.GetStaticMemorySizeInfo()) {
@@ -252,8 +249,45 @@ class SessionScope {
   }
 #endif
 
+  bool IsRunProfilingEnabled() const {
+    return run_profiler_ && run_profiler_->IsEnabled();
+  }
+
+  void StopProfilingIfEnabled(profiling::EventCategory category,
+                              const std::string& event_name,
+                              const TimePoint& start_time,
+                              InlinedHashMap<std::string, std::string> event_args = {}) {
+    const bool session_profiling_enabled = session_state_.Profiler().IsEnabled();
+    const bool run_profiling_enabled = IsRunProfilingEnabled();
+
+    if (session_profiling_enabled) {
+      session_state_.Profiler().EndTimeAndRecordEvent(category,
+                                                      event_name,
+                                                      start_time,
+                                                      std::move(event_args));
+    } else if (run_profiling_enabled) {
+      run_profiler_->EndTimeAndRecordEvent(category,
+                                           event_name,
+                                           start_time,
+                                           std::move(event_args));
+    }
+  }
+
+  TimePoint StartProfilingIfEnabled() {
+    const bool session_profiling_enabled = session_state_.Profiler().IsEnabled();
+    const bool run_profiling_enabled = IsRunProfilingEnabled();
+
+    if (session_profiling_enabled) {
+      return session_state_.Profiler().Start();
+    } else if (run_profiling_enabled) {
+      return run_profiler_->Start();
+    }
+    return TimePoint{};
+  }
+
  private:
   const SessionState& session_state_;
+  profiling::Profiler* run_profiler_;
   TimePoint session_start_;
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
   const ExecutionFrame& frame_;
@@ -340,16 +374,17 @@ class KernelScope {
     utils::DumpNodeInputs(dump_context_, kernel_context_, kernel_.Node(), session_state_, session_scope_.dump_analysis_);
 #endif
 
-#ifdef ENABLE_NVTX_PROFILE
-    node_compute_range_.Begin();
-#endif
+    const bool session_profiling_enabled = session_state_.Profiler().IsEnabled();
+    const bool run_profiling_enabled = session_scope_.IsRunProfilingEnabled();
 
-    if (session_state_.Profiler().IsEnabled()) {
+    if (session_profiling_enabled || run_profiling_enabled) {
       auto& node = kernel.Node();
       node_name_ = node.Name().empty() ? MakeString(node.OpType(), "_", node.Index()) : node.Name();
       concurrency::ThreadPool::StartProfiling(session_state_.GetThreadPool());
       VLOGS(session_state_.Logger(), 1) << "Computing kernel: " << node_name_;
-      kernel_begin_time_ = session_state_.Profiler().Start();
+
+      kernel_begin_time_ = session_scope_.StartProfilingIfEnabled();
+
       CalculateTotalInputSizes(&kernel_context, &kernel_,
                                input_activation_sizes_, input_parameter_sizes_,
                                node_name_, input_type_shape_);
@@ -363,26 +398,30 @@ class KernelScope {
     node_compute_range_.End();
 #endif
 
-    if (session_state_.Profiler().IsEnabled()) {
-      auto& profiler = session_state_.Profiler();
+    const bool session_profiling_enabled = session_state_.Profiler().IsEnabled();
+    const bool run_profiling_enabled = session_scope_.IsRunProfilingEnabled();
+
+    if (session_profiling_enabled || run_profiling_enabled) {
       std::string output_type_shape_;
       CalculateTotalOutputSizes(&kernel_context_, total_output_sizes_, node_name_, output_type_shape_);
-      profiler.EndTimeAndRecordEvent(profiling::NODE_EVENT,
-                                     node_name_ + "_kernel_time",
-                                     kernel_begin_time_,
-                                     // Log additional operation args / info.
-                                     {
-                                         {"op_name", kernel_.KernelDef().OpName()},
-                                         {"provider", kernel_.KernelDef().Provider()},
-                                         {"node_index", std::to_string(kernel_.Node().Index())},
-                                         {"activation_size", std::to_string(input_activation_sizes_)},
-                                         {"parameter_size", std::to_string(input_parameter_sizes_)},
-                                         {"output_size", std::to_string(total_output_sizes_)},
-                                         {"input_type_shape", input_type_shape_},
-                                         {"output_type_shape", output_type_shape_},
-                                         {"thread_scheduling_stats",
-                                          concurrency::ThreadPool::StopProfiling(session_state_.GetThreadPool())},
-                                     });
+
+      InlinedHashMap<std::string, std::string> event_args = {
+          {"op_name", kernel_.KernelDef().OpName()},
+          {"provider", kernel_.KernelDef().Provider()},
+          {"node_index", std::to_string(kernel_.Node().Index())},
+          {"activation_size", std::to_string(input_activation_sizes_)},
+          {"parameter_size", std::to_string(input_parameter_sizes_)},
+          {"output_size", std::to_string(total_output_sizes_)},
+          {"input_type_shape", input_type_shape_},
+          {"output_type_shape", output_type_shape_},
+          {"thread_scheduling_stats",
+           concurrency::ThreadPool::StopProfiling(session_state_.GetThreadPool())},
+      };
+
+      session_scope_.StopProfilingIfEnabled(profiling::NODE_EVENT,
+                                            node_name_ + "_kernel_time",
+                                            kernel_begin_time_,
+                                            std::move(event_args));
     }
 
 #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
@@ -588,7 +627,8 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
 #endif
                                    const bool& terminate_flag,
                                    const bool only_execute_path_to_fetches,
-                                   bool single_thread_mode) {
+                                   bool single_thread_mode,
+                                   profiling::Profiler* run_profiler) {
   auto* execution_plan = session_state.GetExecutionPlan();
   VLOGS(logger, 0) << "Number of streams: " << execution_plan->execution_plan.size();
   int32_t valid_streams = 0;
@@ -631,7 +671,7 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
   ORT_UNUSED_PARAMETER(only_execute_path_to_fetches);
 #endif
 
-  SessionScope session_scope(session_state, ctx.GetExecutionFrame());
+  SessionScope session_scope(session_state, ctx.GetExecutionFrame(), run_profiler);
 
   auto* tp = single_thread_mode ? nullptr : session_state.GetInterOpThreadPool();
 
@@ -692,7 +732,7 @@ onnxruntime::Status PartialExecuteThePlan(const SessionState& session_state, gsl
 
   ctx.SetCurrentRange(&state.GetProgramRegions(session_state));
 
-  SessionScope session_scope(session_state, ctx.GetExecutionFrame());
+  SessionScope session_scope(session_state, ctx.GetExecutionFrame(), nullptr);
 
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
   // Only flush memory info for the 2nd partial graph execution (since ORTModule runs this function twice).
 
@@ -46,7 +46,8 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
 #endif
                                    const bool& terminate_flag,
                                    const bool only_execute_path_to_fetches,
-                                   bool single_thread_mode);
+                                   bool single_thread_mode,
+                                   profiling::Profiler* run_profiler = nullptr);
 
 #ifdef ENABLE_TRAINING
 onnxruntime::Status PartialExecuteThePlan(const SessionState& session_state, gsl::span<const int> feed_mlvalue_idxs,