
Commit 4ac7cfd

review feedback

1 parent 42d912f

4 files changed: +15 -16 lines

onnxruntime/contrib_ops/cpu/utils/debug_macros.h

Lines changed: 4 additions & 4 deletions
@@ -46,11 +46,11 @@
 #define DUMP_TENSOR_D(...)
 #endif

-#if (defined(__GNUC__) || defined(__clang__)) && !defined(NDEBUG)
-#define DEBUG_PRINTF(fmt, ...) \
+#if (defined(__GNUC__) || defined(__clang__)) && (DUMP_TENSOR_LEVEL > 0)
+#define DUMP_PRINTF(fmt, ...) \
   std::printf("[DEBUG] " fmt "\n", ##__VA_ARGS__)
 #else
-#define DEBUG_PRINTF(fmt, ...) \
-  do { \
+#define DUMP_PRINTF(fmt, ...) \
+  do { \
   } while (0)
 #endif
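
For reference, a minimal standalone sketch of the pattern this hunk lands on: DUMP_PRINTF is now gated on DUMP_TENSOR_LEVEL rather than NDEBUG, and the disabled branch expands to an empty do/while so call sites still parse as single statements. Everything outside the two macro definitions below is illustration only.

#include <cstdio>

#ifndef DUMP_TENSOR_LEVEL
#define DUMP_TENSOR_LEVEL 1  // assumption: set by the build in the real tree
#endif

#if (defined(__GNUC__) || defined(__clang__)) && (DUMP_TENSOR_LEVEL > 0)
// ##__VA_ARGS__ is a GNU extension that drops the trailing comma when the
// argument list is empty, which is why the #if also checks for GCC/Clang.
#define DUMP_PRINTF(fmt, ...) std::printf("[DEBUG] " fmt "\n", ##__VA_ARGS__)
#else
// do { } while (0) makes the disabled macro a single statement, so
// `if (cond) DUMP_PRINTF("x");` keeps its meaning with the trailing semicolon.
#define DUMP_PRINTF(fmt, ...) \
  do {                        \
  } while (0)
#endif

int main() {
  DUMP_PRINTF("tensor level is %d", DUMP_TENSOR_LEVEL);
  DUMP_PRINTF("no extra arguments");  // compiles in both branches
  return 0;
}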

onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc

Lines changed: 2 additions & 1 deletion
@@ -26,7 +26,8 @@ namespace cuda {

 namespace {
 // Map string attribute to quantization type enum
-KVQuantizationType StringToKVQuantizationType(const std::string& s) {
+KVQuantizationType StringToKVQuantizationType(std::string s) {
+  std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::toupper(c); });
   if (s == "NONE") {
     return KVQuantizationType::NONE;
   }
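
The signature change takes the string by value so the function can uppercase its own copy in place, making the attribute comparison case-insensitive. A standalone sketch of the same idiom, with a hypothetical two-value enum standing in for the real KVQuantizationType:

#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>

enum class KVQuantizationType { NONE, INT8 };  // hypothetical subset for illustration

KVQuantizationType StringToKVQuantizationType(std::string s) {
  // The unsigned char cast matters: passing a negative char value to
  // std::toupper is undefined behavior, so normalize through unsigned char.
  std::transform(s.begin(), s.end(), s.begin(),
                 [](unsigned char c) { return std::toupper(c); });
  if (s == "NONE") {
    return KVQuantizationType::NONE;
  }
  return KVQuantizationType::INT8;  // assumption: fallback used only in this sketch
}

int main() {
  // "none", "None", and "NONE" all map to the same value after the transform.
  std::cout << (StringToKVQuantizationType("none") == KVQuantizationType::NONE) << '\n';  // prints 1
}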

onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu

Lines changed: 9 additions & 9 deletions
@@ -554,13 +554,13 @@ Status LaunchGetSequenceLengths(
 }

 // Trace function for debugging
-#define ORT_GQA_TRACE(func_name) \
-  DEBUG_PRINTF("[GQA %s] is_packed_qkv: %d, is_first_prompt: %d, is_subsequent_prompt: %d, past_present_share_buffer: %d", \
-               func_name, \
-               static_cast<int>(parameters.is_packed_qkv), \
-               static_cast<int>(parameters.is_first_prompt), \
-               static_cast<int>(parameters.is_subsequent_prompt), \
-               static_cast<int>(parameters.past_present_share_buffer));
+#define ORT_GQA_TRACE(func_name) \
+  DUMP_PRINTF("[GQA %s] is_packed_qkv: %d, is_first_prompt: %d, is_subsequent_prompt: %d, past_present_share_buffer: %d", \
+              func_name, \
+              static_cast<int>(parameters.is_packed_qkv), \
+              static_cast<int>(parameters.is_first_prompt), \
+              static_cast<int>(parameters.is_subsequent_prompt), \
+              static_cast<int>(parameters.past_present_share_buffer));

 ////////// Kernels (supports right padding but not left padding)
 // Use flash attention for all workloads (rotary, kv append, attention, etc.). No extra kernel is used in this path.
@@ -706,8 +706,8 @@ Status FlashDecoding(

   bool past_bsnh = past_kv_format == AttentionQkvFormat::Q_K_V_BSNH;

-  DEBUG_PRINTF("[FlashDecoding] key=%p, value=%p, present_key=%p, present_value=%p, seqlens_k=%p, is_packed_qkv=%d",
-               key, value, present_key, present_value, seqlens_k, static_cast<int>(parameters.is_packed_qkv));
+  DUMP_PRINTF("[FlashDecoding] key=%p, value=%p, present_key=%p, present_value=%p, seqlens_k=%p, is_packed_qkv=%d",
+              key, value, present_key, present_value, seqlens_k, static_cast<int>(parameters.is_packed_qkv));

   ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache(
       device_prop, stream, query, present_key, present_value, key, value, data.output,
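
Both hunks in this file are the mechanical rename from DEBUG_PRINTF to DUMP_PRINTF. One detail worth noting is that ORT_GQA_TRACE expands against a local variable named `parameters`, so it only compiles inside functions that declare one. A trimmed-down sketch of that pattern (the struct and fields below are stand-ins, not the real GroupQueryAttentionParameters):

#include <cstdio>

#define DUMP_PRINTF(fmt, ...) std::printf("[DEBUG] " fmt "\n", ##__VA_ARGS__)

struct Params {  // hypothetical stand-in for the real parameters struct
  bool is_packed_qkv;
  bool is_first_prompt;
};

// The backslash continuations keep the whole expansion on one logical line;
// the macro silently picks up whatever `parameters` is in scope at the call site.
#define ORT_GQA_TRACE(func_name)                                 \
  DUMP_PRINTF("[GQA %s] is_packed_qkv: %d, is_first_prompt: %d", \
              func_name,                                         \
              static_cast<int>(parameters.is_packed_qkv),        \
              static_cast<int>(parameters.is_first_prompt))

int main() {
  Params parameters{true, false};
  ORT_GQA_TRACE("main");  // prints: [DEBUG] [GQA main] is_packed_qkv: 1, is_first_prompt: 0
  return 0;
}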

onnxruntime/contrib_ops/cuda/bert/xqa/barriers.cuh

Lines changed: 0 additions & 2 deletions
@@ -356,9 +356,7 @@ class MBarrier // rename this to MBarrier
     } else {
       float sleepDuration = 0.125F;
       while (!func()) {
-        // if (sleepDuration > 1) {
         __nanosleep(uint32_t(sleepDuration));
-        // }
         sleepDuration = sleepDuration * 1.25F + 0.F;
       }
     }
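
The deletion removes a commented-out guard, so every iteration of the spin loop now calls __nanosleep, with geometric backoff (x1.25 per miss). __nanosleep is a CUDA device intrinsic (sm_70+), so the sketch below is a host-side analogue of the same backoff shape, not the device code:

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>

int main() {
  int polls = 0;
  auto func = [&] { return ++polls >= 8; };  // hypothetical predicate, true on the 8th poll

  float sleepDuration = 0.125F;  // same starting value as the diff
  while (!func()) {
    // Truncation to uint32_t means the first few iterations sleep 0 ns;
    // the wait only becomes real once growth pushes sleepDuration past 1.
    std::this_thread::sleep_for(std::chrono::nanoseconds(uint32_t(sleepDuration)));
    sleepDuration = sleepDuration * 1.25F;  // geometric backoff, as in the loop above
  }
  std::printf("predicate satisfied after %d polls\n", polls);
  return 0;
}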
