Description
Offloading compiled pipelines (or at least their weights) back and forth between the GPU and the CPU is very useful when serving multiple models on a single device with limited VRAM but plenty of RAM. This doesn't seem to be possible right now:
```python
import torch
import random
import time
import os
from pathlib import Path

# Must be set before onediff/oneflow are imported
os.environ["ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"] = "1"

from diffusers_extensions.deep_cache import StableDiffusionXLPipeline
from onediff.infer_compiler import oneflow_compile
from onediff.schedulers import EulerDiscreteScheduler

GRAPHS_DIR = Path("/data/oneflow-cache/graphs/global-sdxl/1/onediff-ce-0.12.1/")


def load_compiled_graphs(pipe):
    """Compile the pipeline's heavy components and load pre-built graphs."""
    if pipe.vae.dtype == torch.float16 and pipe.vae.config.force_upcast:
        pipe.upcast_vae()
    for component_name in ["unet", "fast_unet", "vae"]:
        component = getattr(pipe, component_name)
        component = oneflow_compile(component)
        target_file = GRAPHS_DIR / f"{component_name}-compiled.of"
        print(f"Loading compiled graph for {component_name}")
        component.load_graph(target_file)
        setattr(pipe, component_name, component)


scheduler = EulerDiscreteScheduler.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    subfolder="scheduler",
)

# Initialize the first model
pipe_1 = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    scheduler=scheduler,
    variant="fp16",
)
pipe_1.to("cuda")
load_compiled_graphs(pipe_1)

# Initialize the second model
pipe_2 = StableDiffusionXLPipeline.from_pretrained(
    "dataautogpt3/OpenDalleV1.1",
    torch_dtype=torch.float16,
    scheduler=scheduler,
)
pipe_2.to("cuda")
load_compiled_graphs(pipe_2)

# Initialize the third model
pipe_3 = StableDiffusionXLPipeline.from_pretrained(
    "stablediffusionapi/dreamshaper-xl",
    torch_dtype=torch.float16,
    scheduler=scheduler,
)
pipe_3.to("cuda")
load_compiled_graphs(pipe_3)

# Move all models to the CPU
pipes = [pipe_1, pipe_2, pipe_3]
for pipe in pipes:
    pipe.to("cpu")

while True:
    # Choose a random model and move it to the GPU
    t0 = time.perf_counter()
    model = random.choice(pipes)
    model.to("cuda")
    t1 = time.perf_counter()
    print(f"Loading model ({model}) took", t1 - t0, "seconds")

    # Generate an image
    image = model(
        prompt="A cute cat",
        num_inference_steps=25,
    ).images[0]

    # Move the model back to the CPU
    t0 = time.perf_counter()
    model.to("cpu")
    t1 = time.perf_counter()
    print("Unloading model took", t1 - t0, "seconds")
```
Running this fails on the very first attempt to move a compiled pipeline back to the CPU (the `pipe.to("cpu")` in the loop over `pipes`, before the `while` loop even starts):

```
RuntimeError: After graph built, the device of graph can't be modified, current device: cuda:0, target device: cpu
```
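
For what it's worth, I believe this reduces to any module wrapped with `oneflow_compile`, independent of DeepCache or `load_graph`. Here is a minimal, untested sketch of what I'd expect to hit the same restriction (it assumes `oneflow_compile` accepts a plain `torch.nn.Module` the same way it accepts a UNet):

```python
import torch
from onediff.infer_compiler import oneflow_compile

# A tiny module should be enough to trigger the device restriction
module = torch.nn.Linear(8, 8).half().to("cuda")
compiled = oneflow_compile(module)

# Run once so the OneFlow graph actually gets built
with torch.no_grad():
    compiled(torch.randn(1, 8, dtype=torch.float16, device="cuda"))

# Expected to raise:
# RuntimeError: After graph built, the device of graph can't be modified, ...
compiled.to("cpu")
```

If that's correct, a fix or supported workaround (e.g., a way to offload the underlying weights while keeping the built graph, or rebuilding the graph on the target device) would ideally live at the `oneflow_compile` wrapper level rather than in each pipeline.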