Description
Offloading compiled pipelines (or at least their weights) back and forth between the GPU and the CPU is very useful when serving multiple models on a single device with limited VRAM but plenty of RAM. This doesn't seem to be possible right now:
```python
import torch
import random
import time
import os
from pathlib import Path

# Must be set before onediff/oneflow are imported
os.environ["ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"] = "1"

from diffusers_extensions.deep_cache import StableDiffusionXLPipeline
from onediff.infer_compiler import oneflow_compile
from onediff.schedulers import EulerDiscreteScheduler

GRAPHS_DIR = Path("/data/oneflow-cache/graphs/global-sdxl/1/onediff-ce-0.12.1/")


def load_compiled_graphs(pipe):
    """Compile the pipeline's heavy components and load pre-built graphs."""
    if pipe.vae.dtype == torch.float16 and pipe.vae.config.force_upcast:
        pipe.upcast_vae()
    for component_name in ["unet", "fast_unet", "vae"]:
        component = getattr(pipe, component_name)
        component = oneflow_compile(component)
        target_file = GRAPHS_DIR / f"{component_name}-compiled.of"
        print(f"Loading compiled graph for {component_name}")
        component.load_graph(target_file)
        setattr(pipe, component_name, component)


scheduler = EulerDiscreteScheduler.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    subfolder="scheduler",
)

# Initialize the first model
pipe_1 = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    scheduler=scheduler,
    variant="fp16",
)
pipe_1.to("cuda")
load_compiled_graphs(pipe_1)

# Initialize the second model
pipe_2 = StableDiffusionXLPipeline.from_pretrained(
    "dataautogpt3/OpenDalleV1.1",
    torch_dtype=torch.float16,
    scheduler=scheduler,
)
pipe_2.to("cuda")
load_compiled_graphs(pipe_2)

# Initialize the third model
pipe_3 = StableDiffusionXLPipeline.from_pretrained(
    "stablediffusionapi/dreamshaper-xl",
    torch_dtype=torch.float16,
    scheduler=scheduler,
)
pipe_3.to("cuda")
load_compiled_graphs(pipe_3)

# Move all models to the CPU
pipes = [pipe_1, pipe_2, pipe_3]
for pipe in pipes:
    pipe.to("cpu")

while True:
    # Choose a random model and move it to the GPU
    t0 = time.perf_counter()
    model = random.choice(pipes)
    model.to("cuda")
    t1 = time.perf_counter()
    print(f"Loading model ({model}) took", t1 - t0, "seconds")

    # Generate an image
    image = model(
        prompt="A cute cat",
        num_inference_steps=25,
    ).images[0]

    # Move the model back to the CPU
    t0 = time.perf_counter()
    model.to("cpu")
    t1 = time.perf_counter()
    print("Unloading model took", t1 - t0, "seconds")
```
Running this fails on the very first attempt to move a compiled pipeline back to the CPU (the `pipe.to("cpu")` in the loop over `pipes`, before the `while` loop even starts):

```
RuntimeError: After graph built, the device of graph can't be modified, current device: cuda:0, target device: cpu
```
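
For what it's worth, I believe this reduces to any module wrapped with `oneflow_compile`, independent of DeepCache or `load_graph`. Here is a minimal, untested sketch of what I'd expect to hit the same restriction (it assumes `oneflow_compile` accepts a plain `torch.nn.Module` the same way it accepts a UNet):

```python
import torch
from onediff.infer_compiler import oneflow_compile

# A tiny module should be enough to trigger the device restriction
module = torch.nn.Linear(8, 8).half().to("cuda")
compiled = oneflow_compile(module)

# Run once so the OneFlow graph actually gets built
with torch.no_grad():
    compiled(torch.randn(1, 8, dtype=torch.float16, device="cuda"))

# Expected to raise:
# RuntimeError: After graph built, the device of graph can't be modified, ...
compiled.to("cpu")
```

If that's correct, a fix or supported workaround (e.g., a way to offload the underlying weights while keeping the built graph, or rebuilding the graph on the target device) would ideally live at the `oneflow_compile` wrapper level rather than in each pipeline.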