Skip to content

Commit 5c3cec2

Browse files
Profiler Team authored and copybara-github committed
Implemented host CPU bound rule for SS, triggered when any host's average infeed time fraction is over kInfeedOpBoundThresholdInPercent (set to 10% by default) AND average EnqueueDevice time fraction is over kEnqueueDeviceBoundThresholdInPercent (set to 30% by default).
PiperOrigin-RevId: 899442583
1 parent 719f4e2 commit 5c3cec2

9 files changed

+493
-21
lines changed

xprof/convert/event_time_fraction_analyzer_processor.cc

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,17 @@ using ::tensorflow::profiler::ToolOptions;
4848
using ::tensorflow::profiler::XSpace;
4949

5050
std::vector<std::string> GetTargetEventNames(const ToolOptions& options) {
51+
std::vector<std::string> result;
5152
for (const auto& [key, value] : options) {
52-
if (key == "event_name" && std::holds_alternative<std::string>(value)) {
53-
return absl::StrSplit(std::get<std::string>(value), ',');
53+
if ((key == "event_name" || key == "tpu_event_name" ||
54+
key == "cpu_event_name") &&
55+
std::holds_alternative<std::string>(value)) {
56+
std::vector<std::string> split =
57+
absl::StrSplit(std::get<std::string>(value), ',', absl::SkipEmpty());
58+
result.insert(result.end(), split.begin(), split.end());
5459
}
5560
}
56-
return {};
61+
return result;
5762
}
5863

5964
std::unique_ptr<ProfileProcessor> CreateEventTimeFractionAnalyzerProcessor(
@@ -104,10 +109,18 @@ absl::StatusOr<std::string> EventTimeFractionAnalyzerProcessor::Map(
104109
PreprocessSingleHostXSpace(&xspace_copy, /*step_grouping=*/true,
105110
/*derived_timeline=*/true);
106111
std::vector<std::string> target_event_names = GetTargetEventNames(options_);
107-
TF_ASSIGN_OR_RETURN(EventTimeFractionAnalyzerResults results,
112+
TF_ASSIGN_OR_RETURN(EventTimeFractionAnalyzerResults tpu_results,
108113
ConvertXSpaceToEventTimeFractionAnalyzerResults(
109114
xspace_copy, target_event_names));
110-
return results.SerializeAsString();
115+
TF_ASSIGN_OR_RETURN(EventTimeFractionAnalyzerResults host_results,
116+
ConvertXSpaceToHostEventTimeFractionAnalyzerResults(
117+
xspace_copy, target_event_names));
118+
119+
EventTimeFractionAnalyzerResults combined_results;
120+
AccumulateEventTimeFractionAnalyzerResults(tpu_results, combined_results);
121+
AccumulateEventTimeFractionAnalyzerResults(host_results, combined_results);
122+
123+
return combined_results.SerializeAsString();
111124
}
112125

113126
absl::StatusOr<std::string> EventTimeFractionAnalyzerProcessor::Map(

xprof/convert/smart_suggestion/BUILD

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,21 @@ cc_library(
138138
],
139139
)
140140

141+
cc_library(
142+
name = "host_cpu_bound_rule",
143+
hdrs = ["host_cpu_bound_rule.h"],
144+
deps = [
145+
":constants",
146+
":signal_provider",
147+
":smart_suggestion_rule",
148+
"@com_google_absl//absl/container:flat_hash_map",
149+
"@com_google_absl//absl/status:statusor",
150+
"@com_google_absl//absl/strings",
151+
"@com_google_absl//absl/strings:str_format",
152+
"@org_xprof//plugin/xprof/protobuf:smart_suggestion_proto_cc",
153+
],
154+
)
155+
141156
cc_library(
142157
name = "memory_bound_rule",
143158
hdrs = ["memory_bound_rule.h"],
@@ -275,6 +290,7 @@ cc_library(
275290
":data_shuffle_bound_rule",
276291
":data_transfer_bound_rule",
277292
":debug_print_rule",
293+
":host_cpu_bound_rule",
278294
":host_processing_bound_rule",
279295
":input_bound_rule",
280296
":memory_bound_rule",
@@ -357,6 +373,21 @@ cc_test(
357373
],
358374
)
359375

376+
cc_test(
377+
name = "host_cpu_bound_rule_test",
378+
srcs = ["host_cpu_bound_rule_test.cc"],
379+
deps = [
380+
":host_cpu_bound_rule",
381+
":mock_tool_data_provider",
382+
":signal_provider",
383+
"@com_google_absl//absl/status",
384+
"@com_google_absl//absl/status:statusor",
385+
"@com_google_googletest//:gtest_main",
386+
"@org_xprof//plugin/xprof/protobuf:event_time_fraction_analyzer_proto_cc",
387+
"@org_xprof//plugin/xprof/protobuf:smart_suggestion_proto_cc",
388+
],
389+
)
390+
360391
cc_test(
361392
name = "host_processing_bound_rule_test",
362393
srcs = ["host_processing_bound_rule_test.cc"],

xprof/convert/smart_suggestion/all_rules.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ limitations under the License.
2222
#include "xprof/convert/smart_suggestion/data_shuffle_bound_rule.h"
2323
#include "xprof/convert/smart_suggestion/data_transfer_bound_rule.h"
2424
#include "xprof/convert/smart_suggestion/debug_print_rule.h"
25+
#include "xprof/convert/smart_suggestion/host_cpu_bound_rule.h"
2526
#include "xprof/convert/smart_suggestion/host_processing_bound_rule.h"
2627
#include "xprof/convert/smart_suggestion/input_bound_rule.h"
2728
#include "xprof/convert/smart_suggestion/memory_bound_rule.h"
@@ -41,6 +42,7 @@ inline void RegisterAllRules(SmartSuggestionRuleFactory* f) {
4142
f->Register<DataShuffleBoundRule>();
4243
f->Register<DataTransferBoundRule>();
4344
f->Register<DebugPrintRule>();
45+
f->Register<HostCPUBoundRule>();
4446
f->Register<HostProcessingBoundRule>();
4547
f->Register<InputBoundRule>();
4648
f->Register<MemoryBoundRule>();

xprof/convert/smart_suggestion/constants.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,14 @@ inline constexpr double kSpecialOpBoundThresholdInPercent = 10;
7272
// than this threshold, it is considered a bottleneck.
7373
inline constexpr double kDebugPrintBoundThresholdInPercent = 5;
7474

75+
// If the percentage of step time that is due to the infeed op is higher
76+
// than this threshold, it is considered a bottleneck.
77+
inline constexpr double kInfeedOpBoundThresholdInPercent = 10.0;
78+
79+
// If the percentage of trace time that is due to enqueue_device is higher
80+
// than this threshold, it is considered a bottleneck.
81+
inline constexpr double kEnqueueDeviceBoundThresholdInPercent = 30.0;
82+
7583
// If the percentage of async-done time is higher than this threshold, it is
7684
// considered a bottleneck.
7785
inline constexpr double kAsyncDoneThresholdInPercent = 10;
@@ -84,9 +92,12 @@ inline constexpr double kMemoryUtilizationHighThreshold = 50;
8492
inline constexpr double kZeroEpsilon = 1e-6;
8593

8694
// The events to be parsed by EventTimeFractionAnalyzer for smart suggestion.
87-
// Currently it includes barrier-cores and debug_print.
88-
inline constexpr char kEventTimeFractionAnalyzerEvents[] =
89-
"barrier-cores,debug_print";
95+
// The device (TPU) list currently includes barrier-cores, debug_print, and
// infeed; the host (CPU) list currently includes EnqueueDevice.
96+
inline constexpr char kDeviceTpuEventTimeFractionAnalyzerEvents[] =
97+
"barrier-cores,debug_print,infeed";
98+
99+
inline constexpr char kHostCpuEventTimeFractionAnalyzerEvents[] =
100+
"EnqueueDevice";
90101

91102
} // namespace profiler
92103
} // namespace tensorflow
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#ifndef THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_
17+
#define THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_
18+
19+
#include <algorithm>
20+
#include <optional>
21+
#include <string>
22+
23+
#include "absl/container/flat_hash_map.h"
24+
#include "absl/status/statusor.h"
25+
#include "absl/strings/str_cat.h"
26+
#include "absl/strings/str_format.h"
27+
#include "xprof/convert/smart_suggestion/constants.h"
28+
#include "xprof/convert/smart_suggestion/signal_provider.h"
29+
#include "xprof/convert/smart_suggestion/smart_suggestion_rule.h"
30+
#include "plugin/xprof/protobuf/smart_suggestion.pb.h"
31+
32+
namespace tensorflow {
33+
namespace profiler {
34+
35+
// Event names that HostCPUBoundRule passes to
// SignalProvider::GetPerHostAvgEventTimePercent. They are declared `inline`
// (matching constants.h) so that every translation unit including this header
// refers to a single definition instead of a per-TU internal-linkage copy.
inline constexpr char kInfeedOpName[] = "infeed";
inline constexpr char kEnqueueDeviceOpName[] = "EnqueueDevice";
37+
38+
// Rule to detect host cpu bound bottlenecks (combining TPU infeed and
39+
// CPU EnqueueDevice).
40+
class HostCPUBoundRule : public SmartSuggestionRule {
41+
public:
42+
bool MeetsConditions(const SignalProvider& signal_provider) const override {
43+
auto infeed_stats =
44+
signal_provider.GetPerHostAvgEventTimePercent(kInfeedOpName);
45+
auto enqueue_stats =
46+
signal_provider.GetPerHostAvgEventTimePercent(kEnqueueDeviceOpName);
47+
48+
if (!infeed_stats.ok() || infeed_stats->empty() || !enqueue_stats.ok() ||
49+
enqueue_stats->empty()) {
50+
return false;
51+
}
52+
53+
for (const auto& host_stat : *infeed_stats) {
54+
const std::string& hostname = host_stat.first;
55+
double infeed_percent = host_stat.second;
56+
57+
auto it =
58+
std::find_if(enqueue_stats->begin(), enqueue_stats->end(),
59+
[&hostname](const std::pair<std::string, double>& p) {
60+
return p.first == hostname;
61+
});
62+
if (it != enqueue_stats->end()) {
63+
double enqueue_percent = it->second;
64+
if (infeed_percent >= kInfeedOpBoundThresholdInPercent &&
65+
enqueue_percent >= kEnqueueDeviceBoundThresholdInPercent) {
66+
return true;
67+
}
68+
}
69+
}
70+
71+
return false;
72+
}
73+
74+
absl::StatusOr<std::optional<SmartSuggestion>> GenerateSuggestion(
75+
const SignalProvider& signal_provider) const override {
76+
SmartSuggestion suggestion;
77+
suggestion.set_rule_name("HostCPUBoundRule");
78+
79+
absl::flat_hash_map<std::string, double> high_infeed_hosts;
80+
double max_infeed_percent = 0.0;
81+
auto infeed_stats =
82+
signal_provider.GetPerHostAvgEventTimePercent(kInfeedOpName);
83+
if (infeed_stats.ok()) {
84+
for (const auto& host_stat : *infeed_stats) {
85+
if (host_stat.second >= kInfeedOpBoundThresholdInPercent) {
86+
high_infeed_hosts.insert(host_stat);
87+
if (host_stat.second > max_infeed_percent) {
88+
max_infeed_percent = host_stat.second;
89+
}
90+
}
91+
}
92+
}
93+
94+
absl::flat_hash_map<std::string, double> high_enqueue_hosts;
95+
double max_enqueue_percent = 0.0;
96+
auto enqueue_stats =
97+
signal_provider.GetPerHostAvgEventTimePercent(kEnqueueDeviceOpName);
98+
if (enqueue_stats.ok()) {
99+
for (const auto& host_stat : *enqueue_stats) {
100+
if (host_stat.second >= kEnqueueDeviceBoundThresholdInPercent) {
101+
high_enqueue_hosts.insert(host_stat);
102+
if (host_stat.second > max_enqueue_percent) {
103+
max_enqueue_percent = host_stat.second;
104+
}
105+
}
106+
}
107+
}
108+
109+
std::string infeed_hosts_list_html = "<ul>";
110+
for (const auto& [hostname, avg_percent] : high_infeed_hosts) {
111+
absl::StrAppend(&infeed_hosts_list_html, "<li>Host <b>", hostname,
112+
"</b> average infeed time fraction: <b>",
113+
absl::StrFormat("%.1f", avg_percent), "%</b></li>");
114+
}
115+
absl::StrAppend(&infeed_hosts_list_html, "</ul>");
116+
117+
std::string enqueue_hosts_list_html = "<ul>";
118+
for (const auto& [hostname, avg_percent] : high_enqueue_hosts) {
119+
absl::StrAppend(&enqueue_hosts_list_html, "<li>Host <b>", hostname,
120+
"</b> average enqueue_device time fraction: <b>",
121+
absl::StrFormat("%.1f", avg_percent), "%</b></li>");
122+
}
123+
absl::StrAppend(&enqueue_hosts_list_html, "</ul>");
124+
125+
std::string suggestion_text = absl::StrCat(
126+
"<p>Your program is likely bottlenecked by EnqueueDevice ops on host "
127+
"CPU, an average of up to <b>",
128+
absl::StrFormat("%.1f%%", max_enqueue_percent),
129+
"</b> of time is spent on these operations. In addition, infeed op "
130+
"on TensorCore consumes an average of up to <b>",
131+
absl::StrFormat("%.1f%%", max_infeed_percent),
132+
"</b> of step time correlated with the host CPU bottleneck.</p>",
133+
"<p>Please consider the following optimizations:</p><ul>");
134+
135+
absl::StrAppend(
136+
&suggestion_text,
137+
"<li><b>Increase Host GCU:</b> provide more CPU resources to your "
138+
"TPU worker tasks.</li>");
139+
140+
absl::StrAppend(
141+
&suggestion_text,
142+
"<li><b>(SparseCore)</b> Consider improving the input pipeline and "
143+
"apply SparseCore-specific offloads, e.g., embedding data formatting "
144+
"offload, gather offload.</li>");
145+
146+
absl::StrAppend(
147+
&suggestion_text,
148+
"<li><b>Overlap Computation:</b> Confirm that "
149+
"<code>pipeline_execution_with_tensor_core</code> is set to "
150+
"<code>True</code> in your embedding layer configuration. This "
151+
"allows the TensorCore to potentially overlap its computation with "
152+
"the SparseCore operations for the next step.</li>");
153+
154+
absl::StrAppend(
155+
&suggestion_text,
156+
"<li><b>Host Breakdown (Infeed):</b> The following hosts have high "
157+
"device infeed time fractions:",
158+
infeed_hosts_list_html,
159+
"</li><li><b>Host Breakdown (EnqueueDevice):</b> The following hosts "
160+
"have high host enqueue time fractions:",
161+
enqueue_hosts_list_html, "</li></ul>");
162+
163+
suggestion.set_suggestion_text(suggestion_text);
164+
return suggestion;
165+
}
166+
};
167+
168+
} // namespace profiler
169+
} // namespace tensorflow
170+
171+
#endif // THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_

0 commit comments

Comments
 (0)