PR #40917: [xla:gpu] Print HLO/Executable fingerprints when loading from AOT result

ezhulenev · Google-ML-Automation · commit 27f0295e4cdb · 2026-04-16T17:24:52.000-07:00
Imported from GitHub PR #40917. This helps with debugging non-determinism at scale by making sure we actually load and execute the same thing! Copybara import of the project: -- fdcddf6 by Eugene Zhulenev <ezhulenev@openxla.org>: [xla:gpu] Print HLO/Executable fingerprints when loading from AOT result. Merging this change closes #40917. FUTURE_COPYBARA_INTEGRATE_REVIEW=#40917 from ezhulenev:fingerpint-aot-result-when-load fdcddf6 PiperOrigin-RevId: 900705039
diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD
@@ -2186,6 +2186,8 @@ cc_library(
         ":gpu_executable",
         ":gpu_executable_proto_cc",
         "//xla:debug_options_flags",
+        "//xla:printer",
+        "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/pjrt:compiled_memory_stats",
         "//xla/service:buffer_assignment",
@@ -2195,20 +2197,22 @@ cc_library(
         "//xla/stream_executor:kernel_symbol_registry",
         "//xla/stream_executor:platform",
         "//xla/stream_executor/abi:executable_abi_version",
-        "//xla/tsl/platform:errors",
+        "//xla/tsl/lib/strings:proto_serialization",
+        "//xla/tsl/platform:logging",
         "//xla/tsl/platform:status_macros",
-        "//xla/tsl/platform:statusor",
         "//xla/util/split_proto:split_gpu_executable_writer",
         "//xla/util/split_proto:split_proto_reader",
         "@com_google_absl//absl/base:nullability",
         "@com_google_absl//absl/functional:overload",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_protobuf//:arena",
         "@riegeli//riegeli/bytes:reader",
         "@riegeli//riegeli/bytes:string_writer",
+        "@tsl//tsl/platform:fingerprint",
     ],
 )
 
diff --git a/xla/service/gpu/gpu_aot_compilation_result.cc b/xla/service/gpu/gpu_aot_compilation_result.cc
@@ -24,34 +24,55 @@ limitations under the License.
 #include "absl/functional/overload.h"
 #include "absl/memory/memory.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "xla/tsl/platform/status_macros.h"
 #include "google/protobuf/arena.h"
 #include "riegeli/bytes/string_writer.h"
 #include "xla/debug_options_flags.h"
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_print_options.h"
 #include "xla/pjrt/compiled_memory_stats.h"
+#include "xla/printer.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/executable.h"
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/gpu/gpu_executable.pb.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/kernel_symbol_registry.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/statusor.h"
+#include "xla/tsl/lib/strings/proto_serialization.h"
+#include "xla/tsl/platform/logging.h"
 #include "xla/util/split_proto/split_gpu_executable_writer.h"
 #include "xla/util/split_proto/split_proto_reader.h"
+#include "xla/xla.pb.h"
+#include "tsl/platform/fingerprint.h"
 
 namespace xla::gpu {
 
+static absl::StatusOr<std::pair<std::unique_ptr<HloModule>, tsl::Fprint128>>
+ParseHloModuleAndFingerprint(const HloModuleProtoWithConfig& proto) {  // Builds the module from `proto` and fingerprints its printed form.
+  ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                   HloModule::CreateFromProtoWithConfig(proto));
+  HighwayHashPrinter printer;  // Receives the printed module; queried for the fingerprint below.
+  module->Print(&printer, HloPrintOptions::Canonical()
+                              .set_print_backend_config(true)  // Backend configs are included in the printed text.
+                              .set_sort_backend_config(true));  // Presumably keeps backend-config ordering stable -- TODO confirm.
+  return std::make_pair(std::move(module), printer.ToFingerprint128());  // {module, 128-bit fingerprint of the printed module}
+}
+
 absl::StatusOr<std::unique_ptr<GpuAotCompilationResult>>
 GpuAotCompilationResult::FromProto(GpuExecutableProto executable_proto) {  // Builds a result from an in-memory proto and records HLO/executable fingerprints.
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      HloModule::CreateFromProtoWithConfig(
-                          executable_proto.hlo_module_with_config()));
+  tsl::Fprint128 executable_fingerprint = {  // 128-bit value assembled from two 64-bit proto hashes.
+      tsl::DeterministicProtoHash64(executable_proto),  // NOTE(review): the proto is hashed twice per load; the only visible consumer is a VLOG(1) -- confirm the cost is acceptable.
+      tsl::DeterministicProtoHash64(executable_proto, /*seed=*/1)};  // Second hash uses seed 1.
+  ASSIGN_OR_RETURN(
+      auto module_and_fingerprint,
+      ParseHloModuleAndFingerprint(executable_proto.hlo_module_with_config()));
+  auto& [module, hlo_fingerprint] = module_and_fingerprint;  // Bound by reference so `module` can be moved out below.
   return absl::WrapUnique(new GpuAotCompilationResult(
-      std::move(executable_proto), std::move(module)));
+      std::move(executable_proto), std::move(module), hlo_fingerprint,  // The proto is moved only after both fingerprints were computed from it.
+      executable_fingerprint));
 }
 
 absl::StatusOr<std::unique_ptr<GpuAotCompilationResult>>
@@ -61,20 +82,24 @@ GpuAotCompilationResult::FromSerialized(
   GpuExecutableProto* executable_proto =
       google::protobuf::Arena::Create<GpuExecutableProto>(arena.get());
 
-  TF_RETURN_IF_ERROR(ReadSplitProto(std::move(reader), *executable_proto));
+  RETURN_IF_ERROR(ReadSplitProto(std::move(reader), *executable_proto));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      HloModule::CreateFromProtoWithConfig(
-                          executable_proto->hlo_module_with_config()));
-  return absl::WrapUnique(
-      new GpuAotCompilationResult(internal::ArenaAllocatedGpuExecutableProto(
-                                      std::move(arena), executable_proto),
-                                  std::move(module)));
+  tsl::Fprint128 executable_fingerprint = {
+      tsl::DeterministicProtoHash64(*executable_proto),
+      tsl::DeterministicProtoHash64(*executable_proto, /*seed=*/1)};
+  ASSIGN_OR_RETURN(
+      auto module_and_fingerprint,
+      ParseHloModuleAndFingerprint(executable_proto->hlo_module_with_config()));
+  auto& [module, hlo_fingerprint] = module_and_fingerprint;
+  return absl::WrapUnique(new GpuAotCompilationResult(
+      internal::ArenaAllocatedGpuExecutableProto(std::move(arena),
+                                                 executable_proto),
+      std::move(module), hlo_fingerprint, executable_fingerprint));
 }
 
 absl::StatusOr<std::string> GpuAotCompilationResult::SerializeAsString() const {
   std::string serialized;
-  TF_RETURN_IF_ERROR(WriteSplitGpuExecutable(
+  RETURN_IF_ERROR(WriteSplitGpuExecutable(
       GetExecutableProto(),
       std::make_unique<riegeli::StringWriter<>>(&serialized)));
   return serialized;
@@ -89,6 +114,15 @@ GpuAotCompilationResult::LoadExecutable(
         stream_executor::KernelSymbolRegistry::GetGlobalInstance();
     return registry.FindSymbol(symbol_name, platform_id);
   };
+
+  VLOG(1) << absl::StrFormat(
+      "GpuAotCompilationResult::LoadExecutable: module=%s "
+      "num_instructions=%d hlo_fingerprint=%016x%016x "
+      "executable_fingerprint=%016x%016x",
+      hlo_module_->name(), hlo_module_->instruction_count(),
+      hlo_fingerprint_.low64, hlo_fingerprint_.high64,
+      executable_fingerprint_.low64, executable_fingerprint_.high64);
+
   return GpuExecutable::FromProto(GetExecutableProto(), device_description,
                                   platform_id->ToName(),
                                   GetDebugOptionsFromFlags(), symbol_resolver);
diff --git a/xla/service/gpu/gpu_aot_compilation_result.h b/xla/service/gpu/gpu_aot_compilation_result.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include "xla/stream_executor/abi/executable_abi_version.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/platform.h"
+#include "tsl/platform/fingerprint.h"
 
 namespace xla::gpu {
 
@@ -98,13 +99,18 @@ class GpuAotCompilationResult : public CompiledModule {
       std::variant<internal::ArenaAllocatedGpuExecutableProto,
                    GpuExecutableProto>
           gpu_executable_proto,
-      std::shared_ptr<HloModule> hlo_module)
+      std::shared_ptr<HloModule> hlo_module, tsl::Fprint128 hlo_fingerprint,
+      tsl::Fprint128 executable_fingerprint)
       : gpu_executable_proto_(std::move(gpu_executable_proto)),
-        hlo_module_(std::move(hlo_module)) {}
+        hlo_module_(std::move(hlo_module)),
+        hlo_fingerprint_(hlo_fingerprint),
+        executable_fingerprint_(executable_fingerprint) {}
 
   std::variant<internal::ArenaAllocatedGpuExecutableProto, GpuExecutableProto>
       gpu_executable_proto_;
   std::shared_ptr<HloModule> hlo_module_;
+  tsl::Fprint128 hlo_fingerprint_;
+  tsl::Fprint128 executable_fingerprint_;
 };
 
 }  // namespace xla::gpu
diff --git a/xla/service/gpu/gpu_aot_compilation_result_test.cc b/xla/service/gpu/gpu_aot_compilation_result_test.cc
@@ -188,15 +188,15 @@ class GpuAotCompilationResultTest : public ::testing::Test {
 };
 
 TEST_F(GpuAotCompilationResultTest, CreateAndSerialize) {
-  TF_ASSERT_OK_AND_ASSIGN(GpuExecutableProto reference_executable,
-                          CreateGpuExecutableProto());
+  ASSERT_OK_AND_ASSIGN(GpuExecutableProto reference_executable,
+                       CreateGpuExecutableProto());
 
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<GpuAotCompilationResult> result,
       GpuAotCompilationResult::FromProto(reference_executable));
 
-  TF_ASSERT_OK_AND_ASSIGN(std::string serialized_result,
-                          result->SerializeAsString());
+  ASSERT_OK_AND_ASSIGN(std::string serialized_result,
+                       result->SerializeAsString());
 
   GpuExecutableProto deserialized_executable;
   ASSERT_OK(ReadSplitProto(
@@ -214,14 +214,14 @@ TEST_F(GpuAotCompilationResultTest, CreateAndSerialize) {
 }
 
 TEST_F(GpuAotCompilationResultTest, LoadExecutable) {
-  TF_ASSERT_OK_AND_ASSIGN(GpuExecutableProto reference_executable,
-                          CreateGpuExecutableProto());
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(GpuExecutableProto reference_executable,
+                       CreateGpuExecutableProto());
+  ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<GpuAotCompilationResult> result,
       GpuAotCompilationResult::FromProto(reference_executable));
 
   {
-    TF_ASSERT_OK_AND_ASSIGN(
+    ASSERT_OK_AND_ASSIGN(
         stream_executor::ExecutableAbiVersion executable_abi_version,
         result->GetExecutableAbiVersion());
     EXPECT_EQ(executable_abi_version.platform_name(), "CUDA");
@@ -233,12 +233,12 @@ TEST_F(GpuAotCompilationResultTest, LoadExecutable) {
 
   EnsureCudaSymbolIsRegistered();
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> executable,
-                          std::move(*result).LoadExecutable(
-                              platform_.id(), GetDeviceDescription()));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> executable,
+                       std::move(*result).LoadExecutable(
+                           platform_.id(), GetDeviceDescription()));
 
   {
-    TF_ASSERT_OK_AND_ASSIGN(
+    ASSERT_OK_AND_ASSIGN(
         stream_executor::ExecutableAbiVersion executable_abi_version,
         executable->GetExecutableAbiVersion());
     EXPECT_EQ(executable_abi_version.platform_name(), "CUDA");
@@ -251,8 +251,8 @@ TEST_F(GpuAotCompilationResultTest, LoadExecutable) {
   auto* gpu_executable = dynamic_cast<GpuExecutable*>(executable.get());
   ASSERT_NE(gpu_executable, nullptr) << "Executable is not a GpuExecutable.";
 
-  TF_ASSERT_OK_AND_ASSIGN(GpuExecutableProto executable_proto,
-                          gpu_executable->ToProto());
+  ASSERT_OK_AND_ASSIGN(GpuExecutableProto executable_proto,
+                       gpu_executable->ToProto());
   // HLO module is re-created from proto, and will have a new ID, so we clear
   // it for comparison purposes.
   executable_proto.mutable_hlo_module_with_config()