# simple-qwen-image-layered-inference/app.py at main · creative-graphic-design/simple-qwen-image-layered-inference · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import io
from contextlib import asynccontextmanager
from typing import List, Optional, Union
from zipfile import ZIP_DEFLATED, ZipFile

import torch
from diffusers import QwenImageLayeredPipeline
from diffusers.utils import logging
from fastapi import FastAPI, File, Query, Request, UploadFile, status
from fastapi.responses import StreamingResponse
from PIL import Image

# Module-level logger, obtained through the `logging` helper imported from
# `diffusers.utils` (not the standard-library `logging` module).
# NOTE(review): the logger is keyed on the file path (`__file__`) rather than
# the conventional module name (`__name__`) — confirm this is intentional.
logger = logging.get_logger(__file__)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the Qwen-Image-Layered pipeline on startup and release it on shutdown.

    The pipeline is loaded once, moved to CUDA when available (CPU otherwise)
    in bfloat16, and stored on ``app.state.pipe`` so request handlers can reuse
    it. When the application shuts down, the references are dropped and cached
    CUDA memory is released.

    Args:
        app: The FastAPI application whose ``state`` receives the pipeline.

    Yields:
        Nothing; control returns to the application while it serves requests.
    """
    logger.info("Loading Qwen-Image-Layered pipeline...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Request bfloat16 at load time so the weights are not first materialised
    # in the loader's default dtype and only afterwards cast down.
    pipe = QwenImageLayeredPipeline.from_pretrained(
        "Qwen/Qwen-Image-Layered",
        torch_dtype=torch.bfloat16,
    )
    # Kept from the original so the final device/dtype of every component is
    # exactly what it was before the load-time dtype was introduced.
    pipe = pipe.to(device, torch.bfloat16)

    # NOTE(review): assumes the "flash_hub" attention backend requires a CUDA
    # device — confirm. On the CPU fallback the pipeline's default attention
    # backend is left in place instead.
    if device.type == "cuda":
        pipe.transformer.set_attention_backend("flash_hub")
    else:
        logger.warning(
            "CUDA is not available; skipping the 'flash_hub' attention backend."
        )

    app.state.pipe = pipe
    try:
        yield
    finally:
        # Drop both references so the model can be garbage-collected, then
        # release cached CUDA memory that is no longer occupied.
        del app.state.pipe
        del pipe
        if device.type == "cuda":
            torch.cuda.empty_cache()


# The FastAPI application instance. The `lifespan` context manager defined
# above is registered here; it is what places the loaded pipeline on
# `app.state.pipe`, which the `/decompose` handler reads.
app = FastAPI(
    title="Qwen Image Layered API",
    description=(
        "API for decomposing an input image into multiple semantic layers "
        "using the Qwen-Image-Layered diffusion model.\n\n"
        "The `/decompose` endpoint returns a ZIP file containing PNG images "
        "for each decomposed layer."
    ),
    lifespan=lifespan,
)


async def load_image_from_file(image_file: UploadFile) -> Image.Image:
    """Read an uploaded file and decode it into an RGBA PIL image.

    Args:
        image_file: The upload received by the endpoint.

    Returns:
        The fully decoded image, converted to ``RGBA`` mode.

    Raises:
        HTTPException: With status 400 when the upload is empty or cannot be
            decoded as an image.
    """
    # Imported locally because `HTTPException` is not among the module-level
    # fastapi imports.
    from fastapi import HTTPException

    data = await image_file.read()
    if not data:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Uploaded image file is empty.",
        )

    try:
        with Image.open(io.BytesIO(data)) as decoded:
            # `Image.open` only parses the header; `convert` forces the full
            # decode here, so corrupt or truncated files are rejected now
            # instead of failing later inside the pipeline.
            # NOTE(review): RGBA is chosen because the upstream
            # Qwen-Image-Layered usage example converts its input to RGBA
            # before calling the pipeline — confirm against the pipeline docs.
            return decoded.convert("RGBA")
    except (OSError, ValueError) as err:
        # PIL's UnidentifiedImageError is a subclass of OSError.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Uploaded file is not a valid image.",
        ) from err


def _zip_layer_batches(batches: List[List[Image.Image]]) -> bytes:
    """Encode every layer as PNG and pack them into an in-memory ZIP archive.

    With a single batch entry the files are named ``layer_{n}.png``. With
    several, each entry gets its own ``image_{m}/`` folder so that names
    cannot collide. Both ``m`` and ``n`` start from 1.
    """
    use_folders = len(batches) > 1
    zip_buf = io.BytesIO()
    with ZipFile(zip_buf, mode="w", compression=ZIP_DEFLATED) as zip_file:
        for batch_idx, layer_images in enumerate(batches, start=1):
            prefix = f"image_{batch_idx}/" if use_folders else ""
            for layer_idx, img in enumerate(layer_images, start=1):
                img_byte_arr = io.BytesIO()
                img.save(img_byte_arr, format="PNG")
                zip_file.writestr(
                    f"{prefix}layer_{layer_idx}.png", img_byte_arr.getvalue()
                )
    # The archive is complete only once the ZipFile has been closed above.
    return zip_buf.getvalue()


@app.post(
    "/decompose",
    summary="Decompose an image into layered outputs",
    responses={
        status.HTTP_200_OK: {
            "description": (
                "A ZIP file containing decomposed image layers.\n\n"
                "Each file is named `layer_{n}.png` where n starts from 1."
                " When more than one image is generated, files are grouped "
                "as `image_{m}/layer_{n}.png`."
            ),
            "content": {"application/zip": {}},
        }
    },
)
async def decompose_image(
    request: Request,
    image_file: UploadFile = File(
        description="Image file to decompose",
    ),
    prompt: Optional[Union[str, List[str]]] = Query(
        default=None,
        description="The prompt or prompts to guide the image generation.",
    ),
    negative_prompt: Optional[Union[str, List[str]]] = Query(
        default=None,
        description="The prompt or prompts not to guide the image generation.",
    ),
    true_cfg_scale: float = Query(
        default=4.0,
        description="Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is enabled by setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.",
    ),
    layers: Optional[int] = Query(
        default=4,
        description="Number of layers to decompose",
    ),
    num_inference_steps: int = Query(
        default=50,
        description="The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.",
    ),
    sigmas: Optional[List[float]] = Query(
        default=None,
        description="Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used.",
    ),
    guidance_scale: Optional[float] = Query(
        default=None,
        description='A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance where the guidance scale is applied during inference through noise prediction rescaling, guidance distilled models take the guidance scale directly as an input parameter during forward pass. Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. This parameter in the pipeline is there to support future guidance-distilled models when they come up. It is ignored when not using guidance distilled models. To enable traditional classifier-free guidance, please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should enable classifier-free guidance computations).',
    ),
    num_images_per_prompt: int = Query(
        default=1,
        description="The number of images to generate per prompt.",
    ),
    seed: int = Query(
        default=0,
        description="Random seed to use for image generation",
    ),
    max_sequence_length: int = Query(
        default=512,
        description="Maximum sequence length to use with the `prompt`.",
    ),
    resolution: int = Query(
        default=640,
        description="Using different bucket in (640, 1024) to determine the condition and output resolution",
    ),
    cfg_normalize: bool = Query(
        default=False,
        description="Whether to enable CFG normalization",
    ),
    use_en_prompt: bool = Query(
        default=False,
        description="Automatic caption language if user does not provide caption",
    ),
):
    """Decompose the uploaded image into layers and return them as a ZIP of PNGs.

    The upload is decoded, passed to the Qwen-Image-Layered pipeline together
    with the query parameters, and every returned layer is written as a PNG
    file into a ZIP archive that is sent back as an attachment.
    """
    image = await load_image_from_file(image_file)

    # Get the pipeline from the app state (placed there by `lifespan`).
    pipe = request.app.state.pipe
    assert isinstance(pipe, QwenImageLayeredPipeline)

    # Run the pipeline.
    # NOTE(review): this is a synchronous call inside an `async def` handler,
    # so the event loop is occupied until inference finishes. That also means
    # requests are processed one at a time — confirm this is the intent
    # before moving the call to a worker thread.
    output = pipe(
        image=image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        true_cfg_scale=true_cfg_scale,
        layers=layers,
        num_inference_steps=num_inference_steps,
        sigmas=sigmas,
        guidance_scale=guidance_scale,
        num_images_per_prompt=num_images_per_prompt,
        generator=torch.manual_seed(seed),
        max_sequence_length=max_sequence_length,
        resolution=resolution,
        cfg_normalize=cfg_normalize,
        use_en_prompt=use_en_prompt,
    )

    # `output.images` holds batch x layers of images. The batch can contain
    # more than one entry (several prompts, or `num_images_per_prompt > 1`),
    # so every entry is packed instead of unpacking exactly one.
    archive = _zip_layer_batches(list(output.images))

    # Send the archive as a single chunk: iterating a BytesIO directly yields
    # newline-delimited fragments, which splits binary data into many
    # arbitrary small pieces.
    return StreamingResponse(
        iter([archive]),
        media_type="application/zip",
        headers={"Content-Disposition": "attachment; filename=decomposed_layers.zip"},
    )