fix: Remove orphaned doc comments from normalization files

BiomeOS Developer · BiomeOS Developer · commit e84fcbf50d0e · 2026-01-16T10:40:14.000-05:00
diff --git a/showcase/gpu-universal/ml-inference/src/wgpu/normalization/groupnorm.rs b/showcase/gpu-universal/ml-inference/src/wgpu/normalization/groupnorm.rs
@@ -268,7 +268,5 @@ impl WgpuExecutor {
         self.read_buffer(&staging_buffer, total_size).await
     }
 
-    /// Execute Instance Normalization
-    ///
-    /// Normalizes each instance (batch sample) independently across spatial dimensions.
+}
 }
diff --git a/showcase/gpu-universal/ml-inference/src/wgpu/normalization/instance_norm.rs b/showcase/gpu-universal/ml-inference/src/wgpu/normalization/instance_norm.rs
@@ -167,13 +167,5 @@ impl WgpuExecutor {
         self.read_buffer(&staging_buffer, total_size).await
     }
 
-    /// Execute RMS Normalization
-    ///
-    /// Simpler alternative to LayerNorm used in modern transformers.
-    /// RMSNorm(x) = x / sqrt(mean(x²) + epsilon) * gamma
-    ///
-    /// No mean subtraction, only RMS scaling - faster and simpler than LayerNorm.
-    /// Used in: LLaMA, GPT-NeoX, T5, modern large language models.
-    ///
-    /// Deep Debt: Runtime dimensions, learnable scale parameters.
+}
 }
diff --git a/showcase/gpu-universal/ml-inference/src/wgpu/normalization/rms_norm.rs b/showcase/gpu-universal/ml-inference/src/wgpu/normalization/rms_norm.rs
@@ -143,26 +143,6 @@ impl WgpuExecutor {
         self.read_buffer(&staging_buffer, total_size).await
     }
 
-    /// Execute Fused LayerNorm: SINGLE-PASS layer normalization
-    ///
-    /// **BREAKTHROUGH OPTIMIZATION**: Combines all 3 passes into ONE kernel launch!
-    ///
-    /// Previous (3-pass):
-    ///   - Pass 1: Compute partial stats → launch overhead + sync
-    ///   - Pass 2: Finalize stats       → launch overhead + sync
-    ///   - Pass 3: Normalize            → launch overhead + sync
-    ///   - Total: 3x launch overhead + 2x global sync
-    ///
-    /// Fused (1-pass):
-    ///   - Single kernel launch with Welford's algorithm in shared memory
-    ///   - Immediate normalization (no intermediate global memory)
-    ///   - Grid-stride loop for large inputs
-    ///   - Total: 1x launch overhead + 0x global sync
-    ///
-    /// **Expected Speedup**: 8-12x for LLaMA-scale (118ms → 10-15ms)
-    ///
-    /// **Memory Pattern**: Streaming (one read, one write, no intermediate buffers)
-    ///
-    /// Formula: output = (input - mean) / sqrt(variance + epsilon) * gamma + beta
+}
 }
 }

Original file line number	Diff line number	Diff line change
`@@ -268,7 +268,5 @@ impl WgpuExecutor {`
`268`	`268`	`self.read_buffer(&staging_buffer, total_size).await`
`269`	`269`	`}`
`270`	`270`
`271`		`- /// Execute Instance Normalization`
`272`		`- ///`
`273`		`- /// Normalizes each instance (batch sample) independently across spatial dimensions.`
	`271`	`+}`
`274`	`272`	`}`
Original file line number	Diff line number	Diff line change
`@@ -167,13 +167,5 @@ impl WgpuExecutor {`
`167`	`167`	`self.read_buffer(&staging_buffer, total_size).await`
`168`	`168`	`}`
`169`	`169`
`170`		`- /// Execute RMS Normalization`
`171`		`- ///`
`172`		`- /// Simpler alternative to LayerNorm used in modern transformers.`
`173`		`- /// RMSNorm(x) = x / sqrt(mean(x²) + epsilon) * gamma`
`174`		`- ///`
`175`		`- /// No mean subtraction, only RMS scaling - faster and simpler than LayerNorm.`
`176`		`- /// Used in: LLaMA, GPT-NeoX, T5, modern large language models.`
`177`		`- ///`
`178`		`- /// Deep Debt: Runtime dimensions, learnable scale parameters.`
	`170`	`+}`
`179`	`171`	`}`