|
| 1 | +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. |
| 2 | +
|
| 3 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +you may not use this file except in compliance with the License. |
| 5 | +You may obtain a copy of the License at |
| 6 | +
|
| 7 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +
|
| 9 | +Unless required by applicable law or agreed to in writing, software |
| 10 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +See the License for the specific language governing permissions and |
| 13 | +limitations under the License. |
| 14 | +==============================================================================*/ |
| 15 | + |
| 16 | +#ifndef THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_ |
| 17 | +#define THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_ |
| 18 | + |
#include <algorithm>
#include <map>
#include <optional>
#include <string>

#include "absl/container/flat_hash_map.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "xprof/convert/smart_suggestion/constants.h"
#include "xprof/convert/smart_suggestion/signal_provider.h"
#include "xprof/convert/smart_suggestion/smart_suggestion_rule.h"
#include "plugin/xprof/protobuf/smart_suggestion.pb.h"
| 31 | + |
| 32 | +namespace tensorflow { |
| 33 | +namespace profiler { |
| 34 | + |
// Event names passed to SignalProvider::GetPerHostAvgEventTimePercent.
// Declared `inline` so that every translation unit including this header
// shares one definition: the arrays are odr-used by the inline member
// functions below, and a plain namespace-scope `constexpr` array would have
// internal linkage (one distinct object per translation unit).
inline constexpr char kInfeedOpName[] = "infeed";
inline constexpr char kEnqueueDeviceOpName[] = "EnqueueDevice";
| 37 | + |
| 38 | +// Rule to detect host cpu bound bottlenecks (combining TPU infeed and |
| 39 | +// CPU EnqueueDevice). |
| 40 | +class HostCPUBoundRule : public SmartSuggestionRule { |
| 41 | + public: |
| 42 | + bool MeetsConditions(const SignalProvider& signal_provider) const override { |
| 43 | + auto infeed_stats = |
| 44 | + signal_provider.GetPerHostAvgEventTimePercent(kInfeedOpName); |
| 45 | + auto enqueue_stats = |
| 46 | + signal_provider.GetPerHostAvgEventTimePercent(kEnqueueDeviceOpName); |
| 47 | + |
| 48 | + if (!infeed_stats.ok() || infeed_stats->empty() || !enqueue_stats.ok() || |
| 49 | + enqueue_stats->empty()) { |
| 50 | + return false; |
| 51 | + } |
| 52 | + |
| 53 | + for (const auto& host_stat : *infeed_stats) { |
| 54 | + const std::string& hostname = host_stat.first; |
| 55 | + double infeed_percent = host_stat.second; |
| 56 | + |
| 57 | + auto it = |
| 58 | + std::find_if(enqueue_stats->begin(), enqueue_stats->end(), |
| 59 | + [&hostname](const std::pair<std::string, double>& p) { |
| 60 | + return p.first == hostname; |
| 61 | + }); |
| 62 | + if (it != enqueue_stats->end()) { |
| 63 | + double enqueue_percent = it->second; |
| 64 | + if (infeed_percent >= kInfeedOpBoundThresholdInPercent && |
| 65 | + enqueue_percent >= kEnqueueDeviceBoundThresholdInPercent) { |
| 66 | + return true; |
| 67 | + } |
| 68 | + } |
| 69 | + } |
| 70 | + |
| 71 | + return false; |
| 72 | + } |
| 73 | + |
| 74 | + absl::StatusOr<std::optional<SmartSuggestion>> GenerateSuggestion( |
| 75 | + const SignalProvider& signal_provider) const override { |
| 76 | + SmartSuggestion suggestion; |
| 77 | + suggestion.set_rule_name("HostCPUBoundRule"); |
| 78 | + |
| 79 | + absl::flat_hash_map<std::string, double> high_infeed_hosts; |
| 80 | + double max_infeed_percent = 0.0; |
| 81 | + auto infeed_stats = |
| 82 | + signal_provider.GetPerHostAvgEventTimePercent(kInfeedOpName); |
| 83 | + if (infeed_stats.ok()) { |
| 84 | + for (const auto& host_stat : *infeed_stats) { |
| 85 | + if (host_stat.second >= kInfeedOpBoundThresholdInPercent) { |
| 86 | + high_infeed_hosts.insert(host_stat); |
| 87 | + if (host_stat.second > max_infeed_percent) { |
| 88 | + max_infeed_percent = host_stat.second; |
| 89 | + } |
| 90 | + } |
| 91 | + } |
| 92 | + } |
| 93 | + |
| 94 | + absl::flat_hash_map<std::string, double> high_enqueue_hosts; |
| 95 | + double max_enqueue_percent = 0.0; |
| 96 | + auto enqueue_stats = |
| 97 | + signal_provider.GetPerHostAvgEventTimePercent(kEnqueueDeviceOpName); |
| 98 | + if (enqueue_stats.ok()) { |
| 99 | + for (const auto& host_stat : *enqueue_stats) { |
| 100 | + if (host_stat.second >= kEnqueueDeviceBoundThresholdInPercent) { |
| 101 | + high_enqueue_hosts.insert(host_stat); |
| 102 | + if (host_stat.second > max_enqueue_percent) { |
| 103 | + max_enqueue_percent = host_stat.second; |
| 104 | + } |
| 105 | + } |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + std::string infeed_hosts_list_html = "<ul>"; |
| 110 | + for (const auto& [hostname, avg_percent] : high_infeed_hosts) { |
| 111 | + absl::StrAppend(&infeed_hosts_list_html, "<li>Host <b>", hostname, |
| 112 | + "</b> average infeed time fraction: <b>", |
| 113 | + absl::StrFormat("%.1f", avg_percent), "%</b></li>"); |
| 114 | + } |
| 115 | + absl::StrAppend(&infeed_hosts_list_html, "</ul>"); |
| 116 | + |
| 117 | + std::string enqueue_hosts_list_html = "<ul>"; |
| 118 | + for (const auto& [hostname, avg_percent] : high_enqueue_hosts) { |
| 119 | + absl::StrAppend(&enqueue_hosts_list_html, "<li>Host <b>", hostname, |
| 120 | + "</b> average enqueue_device time fraction: <b>", |
| 121 | + absl::StrFormat("%.1f", avg_percent), "%</b></li>"); |
| 122 | + } |
| 123 | + absl::StrAppend(&enqueue_hosts_list_html, "</ul>"); |
| 124 | + |
| 125 | + std::string suggestion_text = absl::StrCat( |
| 126 | + "<p>Your program is likely bottlenecked by EnqueueDevice ops on host " |
| 127 | + "CPU, an average of up to <b>", |
| 128 | + absl::StrFormat("%.1f%%", max_enqueue_percent), |
| 129 | + "</b> of time is spent on these operations. In addition, infeed op " |
| 130 | + "on TensorCore consumes an average of up to <b>", |
| 131 | + absl::StrFormat("%.1f%%", max_infeed_percent), |
| 132 | + "</b> of step time correlated with the host CPU bottleneck.</p>", |
| 133 | + "<p>Please consider the following optimizations:</p><ul>"); |
| 134 | + |
| 135 | + absl::StrAppend( |
| 136 | + &suggestion_text, |
| 137 | + "<li><b>Increase Host GCU:</b> provide more CPU resources to your " |
| 138 | + "TPU worker tasks.</li>"); |
| 139 | + |
| 140 | + absl::StrAppend( |
| 141 | + &suggestion_text, |
| 142 | + "<li><b>(SparseCore)</b> Consider improving the input pipeline and " |
| 143 | + "apply SparseCore-specific offloads, e.g., embedding data formatting " |
| 144 | + "offload, gather offload.</li>"); |
| 145 | + |
| 146 | + absl::StrAppend( |
| 147 | + &suggestion_text, |
| 148 | + "<li><b>Overlap Computation:</b> Confirm that " |
| 149 | + "<code>pipeline_execution_with_tensor_core</code> is set to " |
| 150 | + "<code>True</code> in your embedding layer configuration. This " |
| 151 | + "allows the TensorCore to potentially overlap its computation with " |
| 152 | + "the SparseCore operations for the next step.</li>"); |
| 153 | + |
| 154 | + absl::StrAppend( |
| 155 | + &suggestion_text, |
| 156 | + "<li><b>Host Breakdown (Infeed):</b> The following hosts have high " |
| 157 | + "device infeed time fractions:", |
| 158 | + infeed_hosts_list_html, |
| 159 | + "</li><li><b>Host Breakdown (EnqueueDevice):</b> The following hosts " |
| 160 | + "have high host enqueue time fractions:", |
| 161 | + enqueue_hosts_list_html, "</li></ul>"); |
| 162 | + |
| 163 | + suggestion.set_suggestion_text(suggestion_text); |
| 164 | + return suggestion; |
| 165 | + } |
| 166 | +}; |
| 167 | + |
| 168 | +} // namespace profiler |
| 169 | +} // namespace tensorflow |
| 170 | + |
| 171 | +#endif // THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_HOST_CPU_BOUND_RULE_H_ |
0 commit comments