Remove logic where the CUDA archs env var is overwritten

Kyle1668 · Kyle1668 · commit 7b6de799c360 · 2025-07-23T11:30:24.000Z
diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py
@@ -31,31 +31,33 @@
 # leading to recompilation of fused kernels. Set it to empty string
 # to avoid recompilation and assign arch flags explicitly in
 # extra_cuda_cflags below
-os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+
+# TODO: Remove this
+# os.environ["TORCH_CUDA_ARCH_LIST"] = ""
 
 
 def load(neox_args=None):
     print("\n" + "="*80)
     print("FUSED KERNELS: Starting fused kernel loading process...")
     print("="*80)
     start_time = time.time()
-    
+
     # Check if cuda 11 is installed for compute capability 8.0
     cc_flag = []
     if torch.version.hip is None:
         print(f"FUSED KERNELS: Detected PyTorch with CUDA support")
         print(f"FUSED KERNELS: CUDA_HOME = {cpp_extension.CUDA_HOME}")
-        
+
         raw_output, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(
             cpp_extension.CUDA_HOME
         )
         print(f"FUSED KERNELS: Detected CUDA version {bare_metal_major}.{bare_metal_minor}")
-        
+
         if int(bare_metal_major) >= 11:
             cc_flag.append("-gencode")
             cc_flag.append("arch=compute_80,code=sm_80")
             print(f"FUSED KERNELS: Added compute capability 8.0 (A100)")
-            
+
             if int(bare_metal_minor) >= 1:
                 cc_flag.append("-gencode")
                 cc_flag.append("arch=compute_86,code=sm_86")
@@ -97,7 +99,7 @@ def _cpp_extention_load_helper(
                 + extra_cuda_flags
                 + cc_flag
             )
-        
+
         # Check if kernel is already built
         kernel_path = buildpath / name
         if os.path.exists(kernel_path) and any(f.endswith('.so') for f in os.listdir(kernel_path) if os.path.isfile(os.path.join(kernel_path, f))):
@@ -107,23 +109,23 @@ def _cpp_extention_load_helper(
             print(f"FUSED KERNELS: {name} needs to be built")
             print(f"FUSED KERNELS: This will take 30-60 seconds...")
             print(f"FUSED KERNELS: Building with flags: {extra_cuda_cflags}")
-        
+
         sys.stdout.flush()  # Force flush to ensure messages appear
-        
+
         try:
             print(f"FUSED KERNELS: Calling cpp_extension.load for {name}...")
             build_start = time.time()
-            
+
             # Monkey-patch the ninja build to add progress messages
             original_build = cpp_extension._write_ninja_file_and_build_library
             def build_with_progress(*args, **kwargs):
                 print(f"FUSED KERNELS: JIT compiling {name} with ninja...")
                 print(f"FUSED KERNELS: This involves compiling CUDA kernels - please be patient...")
                 sys.stdout.flush()
                 return original_build(*args, **kwargs)
-            
+
             cpp_extension._write_ninja_file_and_build_library = build_with_progress
-            
+
             try:
                 loaded_module = cpp_extension.load(
                     name=name,
@@ -139,15 +141,15 @@ def build_with_progress(*args, **kwargs):
             finally:
                 # Restore original function
                 cpp_extension._write_ninja_file_and_build_library = original_build
-            
+
             build_time = time.time() - build_start
             print(f"FUSED KERNELS: Successfully loaded {name} in {build_time:.2f} seconds")
             return loaded_module
-            
+
         except Exception as e:
             print(f"\nFUSED KERNELS ERROR: Failed to build/load {name}")
             print(f"FUSED KERNELS ERROR: {str(e)}")
-            
+
             # Check for common issues
             if "Permission denied" in str(e) or "cannot create directory" in str(e):
                 print(f"FUSED KERNELS ERROR: This might be a file permission issue.")
@@ -160,7 +162,7 @@ def build_with_progress(*args, **kwargs):
             elif "nvcc not found" in str(e) or "CUDA_HOME" in str(e):
                 print(f"FUSED KERNELS ERROR: CUDA installation issue detected.")
                 print(f"FUSED KERNELS ERROR: Make sure CUDA is properly installed and CUDA_HOME is set.")
-            
+
             print(f"FUSED KERNELS ERROR: Full build directory path: {buildpath}")
             raise
 
@@ -188,7 +190,7 @@ def build_with_progress(*args, **kwargs):
 
     print("\nFUSED KERNELS: Building/loading 3 fused kernels...")
     print("-"*60)
-    
+
     # Upper triangular softmax.
     print("\n[1/3] Building scaled_upper_triang_masked_softmax_cuda...")
     sources = [
@@ -201,7 +203,7 @@ def build_with_progress(*args, **kwargs):
         extra_cuda_flags,
         extra_include_paths,
     )
-    
+
     # Masked softmax.
     print("\n[2/3] Building scaled_masked_softmax_cuda...")
     sources = [
@@ -211,7 +213,7 @@ def build_with_progress(*args, **kwargs):
     scaled_masked_softmax_cuda = _cpp_extention_load_helper(
         "scaled_masked_softmax_cuda", sources, extra_cuda_flags, extra_include_paths
     )
-    
+
     # fused rope
     print("\n[3/3] Building fused_rotary_positional_embedding...")
     sources = [
@@ -224,7 +226,7 @@ def build_with_progress(*args, **kwargs):
         extra_cuda_flags,
         extra_include_paths,
     )
-    
+
     total_time = time.time() - start_time
     print("\n" + "="*80)
     print(f"FUSED KERNELS: All kernels loaded successfully!")
@@ -275,15 +277,15 @@ def load_fused_kernels():
     try:
         import scaled_upper_triang_masked_softmax_cuda
         print("FUSED KERNELS: ✓ scaled_upper_triang_masked_softmax_cuda imported successfully")
-        
+
         import scaled_masked_softmax_cuda
         print("FUSED KERNELS: ✓ scaled_masked_softmax_cuda imported successfully")
-        
+
         import fused_rotary_positional_embedding
         print("FUSED KERNELS: ✓ fused_rotary_positional_embedding imported successfully")
-        
+
         print("FUSED KERNELS: All fused kernels are available and ready to use!")
-        
+
     except (ImportError, ModuleNotFoundError) as e:
         print("\n" + "!"*100)
         print("FUSED KERNELS ERROR: Failed to import fused kernels!")