openxla
diff --git a/‎xprof/convert/event_time_fraction_analyzer_processor.cc‎
Lines changed: 18 additions & 5 deletions b/‎xprof/convert/event_time_fraction_analyzer_processor.cc‎
Lines changed: 18 additions & 5 deletions
diff --git a/‎xprof/convert/smart_suggestion/BUILD‎
Lines changed: 31 additions & 0 deletions b/‎xprof/convert/smart_suggestion/BUILD‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎xprof/convert/smart_suggestion/all_rules.h‎
Lines changed: 2 additions & 0 deletions b/‎xprof/convert/smart_suggestion/all_rules.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎xprof/convert/smart_suggestion/constants.h‎
Lines changed: 14 additions & 3 deletions b/‎xprof/convert/smart_suggestion/constants.h‎
Lines changed: 14 additions & 3 deletions
diff --git a/‎xprof/convert/smart_suggestion/host_cpu_bound_rule.h‎
Lines changed: 171 additions & 0 deletions b/‎xprof/convert/smart_suggestion/host_cpu_bound_rule.h‎
Lines changed: 171 additions & 0 deletions
@@ -48,12 +48,17 @@ using ::tensorflow::profiler::ToolOptions;
 using ::tensorflow::profiler::XSpace;
 
 std::vector<std::string> GetTargetEventNames(const ToolOptions& options) {
+  std::vector<std::string> result;  // Names merged from all matching keys.
   for (const auto& [key, value] : options) {
-    if (key == "event_name" && std::holds_alternative<std::string>(value)) {
-      return absl::StrSplit(std::get<std::string>(value), ',');
+    if ((key == "event_name" || key == "tpu_event_name" ||
+         key == "cpu_event_name") &&
+        std::holds_alternative<std::string>(value)) {
+      std::vector<std::string> split =  // Comma-separated; empties dropped.
+          absl::StrSplit(std::get<std::string>(value), ',', absl::SkipEmpty());
+      result.insert(result.end(), split.begin(), split.end());
     }
   }
-  return {};
+  return result;  // Empty if no key matched or every value was empty.
 }
 
 std::unique_ptr<ProfileProcessor> CreateEventTimeFractionAnalyzerProcessor(
@@ -104,10 +109,18 @@ absl::StatusOr<std::string> EventTimeFractionAnalyzerProcessor::Map(
   PreprocessSingleHostXSpace(&xspace_copy, /*step_grouping=*/true,
                              /*derived_timeline=*/true);
   std::vector<std::string> target_event_names = GetTargetEventNames(options_);
-  TF_ASSIGN_OR_RETURN(EventTimeFractionAnalyzerResults results,
+  TF_ASSIGN_OR_RETURN(EventTimeFractionAnalyzerResults tpu_results,
                       ConvertXSpaceToEventTimeFractionAnalyzerResults(
                           xspace_copy, target_event_names));
-  return results.SerializeAsString();
+  TF_ASSIGN_OR_RETURN(EventTimeFractionAnalyzerResults host_results,
+                      ConvertXSpaceToHostEventTimeFractionAnalyzerResults(
+                          xspace_copy, target_event_names));
+
+  EventTimeFractionAnalyzerResults combined_results;
+  AccumulateEventTimeFractionAnalyzerResults(tpu_results, combined_results);
+  AccumulateEventTimeFractionAnalyzerResults(host_results, combined_results);
+
+  return combined_results.SerializeAsString();
 }
 
 absl::StatusOr<std::string> EventTimeFractionAnalyzerProcessor::Map(
 
@@ -138,6 +138,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "host_cpu_bound_rule",
+    hdrs = ["host_cpu_bound_rule.h"],  # Header-only target: no srcs.
+    deps = [
+        ":constants",
+        ":signal_provider",
+        ":smart_suggestion_rule",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@org_xprof//plugin/xprof/protobuf:smart_suggestion_proto_cc",
+    ],
+)
+
 cc_library(
     name = "memory_bound_rule",
     hdrs = ["memory_bound_rule.h"],
@@ -275,6 +290,7 @@ cc_library(
         ":data_shuffle_bound_rule",
         ":data_transfer_bound_rule",
         ":debug_print_rule",
+        ":host_cpu_bound_rule",
         ":host_processing_bound_rule",
         ":input_bound_rule",
         ":memory_bound_rule",
@@ -357,6 +373,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "host_cpu_bound_rule_test",
+    srcs = ["host_cpu_bound_rule_test.cc"],
+    deps = [
+        ":host_cpu_bound_rule",  # Library under test.
+        ":mock_tool_data_provider",
+        ":signal_provider",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_googletest//:gtest_main",
+        "@org_xprof//plugin/xprof/protobuf:event_time_fraction_analyzer_proto_cc",
+        "@org_xprof//plugin/xprof/protobuf:smart_suggestion_proto_cc",
+    ],
+)
+
 cc_test(
     name = "host_processing_bound_rule_test",
     srcs = ["host_processing_bound_rule_test.cc"],
 
@@ -22,6 +22,7 @@ limitations under the License.
 #include "xprof/convert/smart_suggestion/data_shuffle_bound_rule.h"
 #include "xprof/convert/smart_suggestion/data_transfer_bound_rule.h"
 #include "xprof/convert/smart_suggestion/debug_print_rule.h"
+#include "xprof/convert/smart_suggestion/host_cpu_bound_rule.h"
 #include "xprof/convert/smart_suggestion/host_processing_bound_rule.h"
 #include "xprof/convert/smart_suggestion/input_bound_rule.h"
 #include "xprof/convert/smart_suggestion/memory_bound_rule.h"
@@ -41,6 +42,7 @@ inline void RegisterAllRules(SmartSuggestionRuleFactory* f) {
   f->Register<DataShuffleBoundRule>();
   f->Register<DataTransferBoundRule>();
   f->Register<DebugPrintRule>();
+  f->Register<HostCPUBoundRule>();
   f->Register<HostProcessingBoundRule>();
   f->Register<InputBoundRule>();
   f->Register<MemoryBoundRule>();
 
@@ -72,6 +72,14 @@ inline constexpr double kSpecialOpBoundThresholdInPercent = 10;
 // than this threshold, it is considered a bottleneck.
 inline constexpr double kDebugPrintBoundThresholdInPercent = 5;
 
+// If the percentage of step time that is due to the infeed op is higher
+// than this threshold, it is considered a bottleneck.
+inline constexpr double kInfeedOpBoundThresholdInPercent = 10.0;
+
+// If the percentage of trace time that is due to enqueue_device is higher
+// than this threshold, it is considered a bottleneck.
+inline constexpr double kEnqueueDeviceBoundThresholdInPercent = 30.0;
+
 // If the percentage of async-done time is higher than this threshold, it is
 // considered a bottleneck.
 inline constexpr double kAsyncDoneThresholdInPercent = 10;
@@ -84,9 +92,12 @@ inline constexpr double kMemoryUtilizationHighThreshold = 50;
 inline constexpr double kZeroEpsilon = 1e-6;
 
 // The events to be parsed by EventTimeFractionAnalyzer for smart suggestion.
-// Currently it includes barrier-cores and debug_print.
-inline constexpr char kEventTimeFractionAnalyzerEvents[] =
-    "barrier-cores,debug_print";
+// Device (TPU): barrier-cores, debug_print, infeed. Host (CPU): EnqueueDevice.
+inline constexpr char kDeviceTpuEventTimeFractionAnalyzerEvents[] =
+    "barrier-cores,debug_print,infeed";
+
+inline constexpr char kHostCpuEventTimeFractionAnalyzerEvents[] =
+    "EnqueueDevice";
 
 }  // namespace profiler
 }  // namespace tensorflow
 
@@ -0,0 +1,182 @@
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_
+#define THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_
+
+#include <algorithm>
+#include <map>
+#include <optional>
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "xprof/convert/smart_suggestion/constants.h"
+#include "xprof/convert/smart_suggestion/signal_provider.h"
+#include "xprof/convert/smart_suggestion/smart_suggestion_rule.h"
+#include "plugin/xprof/protobuf/smart_suggestion.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Event names queried from the per-host event time fraction signals.
+// `inline` (as in constants.h) gives every translation unit that includes
+// this header one shared definition.
+inline constexpr char kInfeedOpName[] = "infeed";
+inline constexpr char kEnqueueDeviceOpName[] = "EnqueueDevice";
+
+// Rule to detect host cpu bound bottlenecks (combining TPU infeed and
+// CPU EnqueueDevice).
+class HostCPUBoundRule : public SmartSuggestionRule {
+ public:
+  // Returns true when at least one host is at or above both
+  // kInfeedOpBoundThresholdInPercent (infeed) and
+  // kEnqueueDeviceBoundThresholdInPercent (EnqueueDevice). Returns false
+  // when either per-host signal is unavailable or empty.
+  bool MeetsConditions(const SignalProvider& signal_provider) const override {
+    const auto infeed_stats =
+        signal_provider.GetPerHostAvgEventTimePercent(kInfeedOpName);
+    const auto enqueue_stats =
+        signal_provider.GetPerHostAvgEventTimePercent(kEnqueueDeviceOpName);
+
+    if (!infeed_stats.ok() || infeed_stats->empty() || !enqueue_stats.ok() ||
+        enqueue_stats->empty()) {
+      return false;
+    }
+
+    // Index EnqueueDevice percentages by hostname: one lookup per infeed
+    // host. The map is only probed, never iterated, so its unspecified
+    // iteration order cannot affect the result.
+    absl::flat_hash_map<std::string, double> enqueue_percent_by_host;
+    for (const auto& host_stat : *enqueue_stats) {
+      // emplace() keeps the first entry seen for a hostname.
+      enqueue_percent_by_host.emplace(host_stat.first, host_stat.second);
+    }
+
+    for (const auto& host_stat : *infeed_stats) {
+      if (host_stat.second >= kInfeedOpBoundThresholdInPercent) {
+        const auto it = enqueue_percent_by_host.find(host_stat.first);
+        if (it != enqueue_percent_by_host.end() &&
+            it->second >= kEnqueueDeviceBoundThresholdInPercent) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  // Builds the HTML suggestion text. For each signal, hosts at or above the
+  // threshold are listed sorted by hostname, so a given profile always
+  // renders identically; headline figures are the per-signal maxima.
+  absl::StatusOr<std::optional<SmartSuggestion>> GenerateSuggestion(
+      const SignalProvider& signal_provider) const override {
+    SmartSuggestion suggestion;
+    suggestion.set_rule_name("HostCPUBoundRule");
+
+    // Ordered map: the rendered host lists come out sorted by hostname.
+    std::map<std::string, double> high_infeed_hosts;
+    double max_infeed_percent = 0.0;
+    const auto infeed_stats =
+        signal_provider.GetPerHostAvgEventTimePercent(kInfeedOpName);
+    if (infeed_stats.ok()) {
+      for (const auto& host_stat : *infeed_stats) {
+        if (host_stat.second >= kInfeedOpBoundThresholdInPercent) {
+          high_infeed_hosts.emplace(host_stat.first, host_stat.second);
+          max_infeed_percent =
+              std::max<double>(max_infeed_percent, host_stat.second);
+        }
+      }
+    }
+
+    std::map<std::string, double> high_enqueue_hosts;
+    double max_enqueue_percent = 0.0;
+    const auto enqueue_stats =
+        signal_provider.GetPerHostAvgEventTimePercent(kEnqueueDeviceOpName);
+    if (enqueue_stats.ok()) {
+      for (const auto& host_stat : *enqueue_stats) {
+        if (host_stat.second >= kEnqueueDeviceBoundThresholdInPercent) {
+          high_enqueue_hosts.emplace(host_stat.first, host_stat.second);
+          max_enqueue_percent =
+              std::max<double>(max_enqueue_percent, host_stat.second);
+        }
+      }
+    }
+
+    std::string infeed_hosts_list_html = "<ul>";
+    for (const auto& [hostname, avg_percent] : high_infeed_hosts) {
+      absl::StrAppend(&infeed_hosts_list_html, "<li>Host <b>", hostname,
+                      "</b> average infeed time fraction: <b>",
+                      absl::StrFormat("%.1f", avg_percent), "%</b></li>");
+    }
+    absl::StrAppend(&infeed_hosts_list_html, "</ul>");
+
+    std::string enqueue_hosts_list_html = "<ul>";
+    for (const auto& [hostname, avg_percent] : high_enqueue_hosts) {
+      absl::StrAppend(&enqueue_hosts_list_html, "<li>Host <b>", hostname,
+                      "</b> average enqueue_device time fraction: <b>",
+                      absl::StrFormat("%.1f", avg_percent), "%</b></li>");
+    }
+    absl::StrAppend(&enqueue_hosts_list_html, "</ul>");
+
+    std::string suggestion_text = absl::StrCat(
+        "<p>Your program is likely bottlenecked by EnqueueDevice ops on host "
+        "CPU, an average of up to <b>",
+        absl::StrFormat("%.1f%%", max_enqueue_percent),
+        "</b> of time is spent on these operations. In addition, infeed op "
+        "on TensorCore consumes an average of up to <b>",
+        absl::StrFormat("%.1f%%", max_infeed_percent),
+        "</b> of step time correlated with the host CPU bottleneck.</p>",
+        "<p>Please consider the following optimizations:</p><ul>");
+
+    absl::StrAppend(
+        &suggestion_text,
+        "<li><b>Increase Host GCU:</b> provide more CPU resources to your "
+        "TPU worker tasks.</li>");
+
+    absl::StrAppend(
+        &suggestion_text,
+        "<li><b>(SparseCore)</b> Consider improving the input pipeline and "
+        "apply SparseCore-specific offloads, e.g., embedding data formatting "
+        "offload, gather offload.</li>");
+
+    absl::StrAppend(
+        &suggestion_text,
+        "<li><b>Overlap Computation:</b> Confirm that "
+        "<code>pipeline_execution_with_tensor_core</code> is set to "
+        "<code>True</code> in your embedding layer configuration. This "
+        "allows the TensorCore to potentially overlap its computation with "
+        "the SparseCore operations for the next step.</li>");
+
+    absl::StrAppend(
+        &suggestion_text,
+        "<li><b>Host Breakdown (Infeed):</b> The following hosts have high "
+        "device infeed time fractions:",
+        infeed_hosts_list_html,
+        "</li><li><b>Host Breakdown (EnqueueDevice):</b> The following hosts "
+        "have high host enqueue time fractions:",
+        enqueue_hosts_list_html, "</li></ul>");
+
+    suggestion.set_suggestion_text(suggestion_text);
+    return suggestion;
+  }
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_