|
# Tiny randomly-initialized checkpoints: keep export + optimization tests fast
# while still exercising each pipeline architecture end to end.
TINY_MODELS = {
    "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch",
    "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl",
    "stable-diffusion-3": "optimum-internal-testing/tiny-random-stable-diffusion-3",
    "flux": "optimum-internal-testing/tiny-random-flux",
}
33 | 35 |
|
34 | 36 |
|
@@ -267,5 +269,151 @@ def test_optimize_sdxl_fp16(self): |
267 | 269 | self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) |
268 | 270 |
|
269 | 271 |
|
class TestSD3FluxOptimization(unittest.TestCase):
    """End-to-end optimization tests for the Stable Diffusion 3 and Flux pipelines."""

    def optimize_sd3_or_flux(
        self, model_name, export_onnx_dir, optimized_onnx_dir, expected_op_counters, is_float16, atol
    ):
        """Export a pipeline to ONNX, optimize it, and verify both fused-op counts and output parity.

        Args:
            model_name: Hugging Face model id to export.
            export_onnx_dir: directory receiving the unoptimized ONNX export.
            optimized_onnx_dir: directory receiving the optimized ONNX models.
            expected_op_counters: mapping of sub-model name -> {op type: expected count}.
            is_float16: when True, also convert the optimized graphs to fp16.
            atol: absolute tolerance when comparing generated images.
        """
        import torch

        from optimum.onnxruntime import ORTPipelineForText2Image

        # Start from a clean slate so a stale export cannot leak into this run.
        if os.path.exists(export_onnx_dir):
            shutil.rmtree(export_onnx_dir, ignore_errors=True)

        reference = ORTPipelineForText2Image.from_pretrained(
            model_name, export=True, provider="CUDAExecutionProvider"
        )
        if not os.path.exists(export_onnx_dir):
            reference.save_pretrained(export_onnx_dir)

        optimizer_args = [
            "--input",
            export_onnx_dir,
            "--output",
            optimized_onnx_dir,
            "--overwrite",
            "--disable_group_norm",
            "--disable_bias_splitgelu",
        ]
        if is_float16:
            optimizer_args.append("--float16")

        actual_op_counters = optimize_stable_diffusion(optimizer_args)

        # Every expected sub-model must be present, and each listed fused op must
        # appear with exactly the expected count.
        for submodel, expected_ops in expected_op_counters.items():
            self.assertTrue(submodel in actual_op_counters)
            for op_name, expected_count in expected_ops.items():
                self.assertTrue(op_name in actual_op_counters[submodel])
                self.assertEqual(actual_op_counters[submodel][op_name], expected_count)

        optimized = ORTPipelineForText2Image.from_pretrained(optimized_onnx_dir, provider="CUDAExecutionProvider")

        generation_kwargs = {
            "prompt": ["starry night by van gogh"],
            "num_inference_steps": 3,
            "num_images_per_prompt": 1,
            "height": 64,
            "width": 64,
            "output_type": "np",
        }

        # Seed both numpy and the torch generator so the two runs are comparable.
        seed = 123

        np.random.seed(seed)
        reference_outputs = reference(**generation_kwargs, generator=torch.Generator(device="cuda").manual_seed(seed))

        np.random.seed(seed)
        optimized_outputs = optimized(**generation_kwargs, generator=torch.Generator(device="cuda").manual_seed(seed))

        self.assertTrue(np.allclose(reference_outputs.images[0], optimized_outputs.images[0], atol=atol))

    @pytest.mark.slow
    def test_sd3(self):
        """Optimization of the Stable Diffusion 3 pipeline (fp32 and fp16)."""
        model_name = TINY_MODELS["stable-diffusion-3"]

        expected_op_counters = {
            "transformer": {
                "FastGelu": 3,
                "MultiHeadAttention": 2,
                "LayerNormalization": 8,
                "SimplifiedLayerNormalization": 0,
            },
            "vae_encoder": {"Attention": 0, "GroupNorm": 0, "SkipGroupNorm": 0, "NhwcConv": 17},
            "vae_decoder": {"Attention": 0, "GroupNorm": 0, "SkipGroupNorm": 0, "NhwcConv": 25},
            "text_encoder": {
                "Attention": 2,
                "Gelu": 0,
                "LayerNormalization": 1,
                "QuickGelu": 2,
                "SkipLayerNormalization": 4,
            },
            "text_encoder_2": {
                "Attention": 2,
                "Gelu": 0,
                "LayerNormalization": 1,
                "QuickGelu": 0,
                "SkipLayerNormalization": 4,
            },
            "text_encoder_3": {
                "Attention": 2,
                "MultiHeadAttention": 0,
                "Gelu": 0,
                "FastGelu": 2,
                "BiasGelu": 0,
                "GemmFastGelu": 0,
                "LayerNormalization": 0,
                "SimplifiedLayerNormalization": 2,
                "SkipLayerNormalization": 0,
                "SkipSimplifiedLayerNormalization": 3,
            },
        }

        export_onnx_dir = "tiny-random-stable-diffusion-3"
        # Run fp32 first, then fp16; each call re-exports into the same directory.
        for is_float16, suffix, atol in ((False, "fp32", 5e-3), (True, "fp16", 5e-2)):
            self.optimize_sd3_or_flux(
                model_name,
                export_onnx_dir,
                f"tiny-random-stable-diffusion-3-optimized-{suffix}",
                expected_op_counters,
                is_float16=is_float16,
                atol=atol,
            )

    @pytest.mark.slow
    def test_flux(self):
        """Optimization of the Flux pipeline (fp32 and fp16)."""
        model_name = TINY_MODELS["flux"]

        expected_op_counters = {
            "transformer": {
                "FastGelu": 3,
                "MultiHeadAttention": 2,
                "LayerNormalization": 6,
                "SimplifiedLayerNormalization": 6,
            },
            "vae_encoder": {"Attention": 0, "GroupNorm": 0, "SkipGroupNorm": 0, "NhwcConv": 8},
            "vae_decoder": {"Attention": 0, "GroupNorm": 0, "SkipGroupNorm": 0, "NhwcConv": 10},
            "text_encoder": {
                "Attention": 5,
                "Gelu": 0,
                "LayerNormalization": 1,
                "QuickGelu": 0,
                "SkipLayerNormalization": 10,
            },
            # The tiny flux uses clip, but FLUX.1-dev uses t5, so we skip op count verification for text_encoder_2.
            "text_encoder_2": {},
        }

        export_onnx_dir = "tiny-random-flux"
        # Run fp32 first, then fp16; each call re-exports into the same directory.
        for is_float16, suffix, atol in ((False, "fp32", 1e-3), (True, "fp16", 5e-2)):
            self.optimize_sd3_or_flux(
                model_name,
                export_onnx_dir,
                f"tiny-random-flux-optimized-{suffix}",
                expected_op_counters,
                is_float16=is_float16,
                atol=atol,
            )
| 417 | + |
# Allow running this test module directly with `python <file>` in addition to pytest.
if __name__ == "__main__":
    unittest.main()
0 commit comments