[diffusion] chore: improve memory usage on consumer-level GPU (#18997)

mickqian · web-flow · commit d73f06f09149 · 2026-02-19T21:59:49.000+08:00
diff --git a/python/sglang/multimodal_gen/runtime/loader/utils.py b/python/sglang/multimodal_gen/runtime/loader/utils.py
@@ -148,13 +148,15 @@ def _list_safetensors_files(model_path: str) -> list[str]:
     return sorted(glob.glob(os.path.join(str(model_path), "*.safetensors")))
 
 
+BYTES_PER_GB = 1024**3
+
+
 def get_memory_usage_of_component(module) -> float | None:
     """
     returned value is in GB, rounded to 2 decimal digits
     """
     if not isinstance(module, nn.Module):
         return None
-    BYTES_PER_GB = 1024**3
     if hasattr(module, "get_memory_footprint"):
         usage = module.get_memory_footprint() / BYTES_PER_GB
     else:
diff --git a/python/sglang/multimodal_gen/runtime/server_args.py b/python/sglang/multimodal_gen/runtime/server_args.py
@@ -26,6 +26,7 @@
 from sglang.multimodal_gen.runtime.layers.quantization.configs.nunchaku_config import (
     NunchakuConfig,
 )
+from sglang.multimodal_gen.runtime.loader.utils import BYTES_PER_GB
 from sglang.multimodal_gen.runtime.platforms import (
     AttentionBackendEnum,
     current_platform,
@@ -411,7 +412,18 @@ def _adjust_quant_config(self):
             )
 
     def _adjust_offload(self):
-        if self.pipeline_config.task_type.is_image_gen():
+        # TODO: let each platform decide this; the 30 GB low-memory threshold is hard-coded here
+        if current_platform.get_device_total_memory() / BYTES_PER_GB < 30:
+            logger.info("Enabling all offloading for GPU with low device memory")
+            if self.dit_cpu_offload is None:
+                self.dit_cpu_offload = True
+            if self.text_encoder_cpu_offload is None:
+                self.text_encoder_cpu_offload = True
+            if self.image_encoder_cpu_offload is None:
+                self.image_encoder_cpu_offload = True
+            if self.vae_cpu_offload is None:
+                self.vae_cpu_offload = True
+        elif self.pipeline_config.task_type.is_image_gen():
             logger.info(
                 "Disabling some offloading (except dit, text_encoder) for image generation model"
             )
@@ -1086,7 +1098,7 @@ def _validate_offload(self):
                 )
                 self.use_fsdp_inference = False
 
-            if self.dit_cpu_offload:
+            if self.dit_cpu_offload is None:
                 logger.warning(
                     "dit_layerwise_offload is enabled, automatically disabling dit_cpu_offload."
                 )