DotSlash-A
diff --git a/python/sglang/jit_kernel/csrc/diffusion/timestep_embedding.cuh b/python/sglang/jit_kernel/csrc/diffusion/timestep_embedding.cuh
Lines changed: 0 additions & 173 deletions
diff --git a/python/sglang/jit_kernel/timestep_embedding.py b/python/sglang/jit_kernel/timestep_embedding.py
Lines changed: 0 additions & 44 deletions
diff --git a/python/sglang/multimodal_gen/runtime/layers/visual_embedding.py b/python/sglang/multimodal_gen/runtime/layers/visual_embedding.py
Lines changed: 4 additions & 5 deletions
diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/sgl-kernel/csrc/common_extension.cc b/sgl-kernel/csrc/common_extension.cc
Lines changed: 13 additions & 0 deletions
@@ -19,12 +19,10 @@
 )
 
 try:
-    from sglang.jit_kernel.timestep_embedding import (
-        timestep_embedding as timestep_embedding_cuda,
-    )
+    from sgl_kernel.elementwise import timestep_embedding as timestep_embedding_cuda
 except Exception as _e:
     # Fallback to diffusers implementation so downstream code can still run
-    # even if `jit_kernel` is not available.
+    # even if `sgl_kernel` is not installed/available.
     timestep_embedding_cuda = _get_timestep_embedding
 
 from sglang.multimodal_gen.runtime.layers.activation import get_act_fn
@@ -88,13 +86,14 @@ def forward(self, x):
 
 class Timesteps(_Timesteps):
     def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
-        return timestep_embedding_cuda(
+        t_emb = timestep_embedding_cuda(
             timesteps,
             self.num_channels,
             flip_sin_to_cos=self.flip_sin_to_cos,
             downscale_freq_shift=self.downscale_freq_shift,
             scale=self.scale,
         )
+        return t_emb
 
 
 class CombinedTimestepGuidanceTextProjEmbeddings(
 
@@ -282,6 +282,7 @@ set(SOURCES
     "csrc/elementwise/rope.cu"
     "csrc/elementwise/pos_enc.cu"
     "csrc/elementwise/topk.cu"
+    "csrc/sgl_diffusion/elementwise/timestep_embedding.cu"
     "csrc/expert_specialization/es_fp8_blockwise.cu"
     "csrc/expert_specialization/es_sm100_mxfp8_blockscaled.cu"
     "csrc/expert_specialization/es_sm100_mxfp8_blockscaled_group_quant.cu"
 
@@ -609,6 +609,19 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
 
   m.def("fast_hadamard_transform_40N(Tensor x, float scale) -> Tensor");
   m.impl("fast_hadamard_transform_40N", torch::kCUDA, &fast_hadamard_transform_40N);
+
+  /*
+   * From csrc/sgl_diffusion/elementwise
+   */
+  m.def(
+      "timestep_embedding(Tensor input,"
+      "Tensor output,"
+      "int dim,"
+      "bool flip_sin_to_cos,"
+      "float downscale_freq_shift,"
+      "float scale,"
+      "int max_period) -> Tensor");
+  m.impl("timestep_embedding", torch::kCUDA, &timestep_embedding);
 }
 
 REGISTER_EXTENSION(common_ops)