pytorch · zewenli98 · Jun 11, 2024 · Mar 19, 2025 · Mar 20, 2025
diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst
@@ -18,6 +18,7 @@ Model Zoo
 * :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile``
 * :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile``
 * :ref:`_torch_compile_gpt2`: Compiling a GPT2 model using ``torch.compile``
+* :ref:`torch_compile_phi4`: Compiling a Phi4 model from Hugging Face using ``torch.compile``
 * :ref:`_torch_export_gpt2`: Compiling a GPT2 model using AOT workflow (`ir=dynamo`)
 * :ref:`_torch_export_llama2`: Compiling a Llama2 model using AOT workflow (`ir=dynamo`)
 * :ref:`_torch_export_sam2`: Compiling SAM2 model using AOT workflow (`ir=dynamo`)

diff --git a/examples/dynamo/torch_compile_phi4.py b/examples/dynamo/torch_compile_phi4.py
@@ -0,0 +1,78 @@
+"""
+.. _torch_compile_phi4:
+
+Compiling Phi 4 model from Hugging Face using the Torch-TensorRT `torch.compile` Backend
+========================================================================================
+
+This script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a Phi 4 model from Hugging Face.
+"""
+
+# %%
+# Imports and Model Definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+import requests
+import torch
+import torch_tensorrt  # noqa: F401  (kept for its import side effects; never referenced by name below)
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+# %%
+# Load the pre-trained model weights from Hugging Face
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+model_id = "microsoft/Phi-4-multimodal-instruct"
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+model = (
+    AutoModelForCausalLM.from_pretrained(
+        model_id, trust_remote_code=True, torch_dtype="auto"
+    )
+    .eval()
+    .cuda()
+)
+
+# %%
+# Compile the model with torch.compile, using Torch-TensorRT backend
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Only ``model.forward`` is wrapped here; ``model.generate`` itself is not compiled.
+model.forward = torch.compile(
+    model.forward,
+    backend="tensorrt",
+    options={"debug": True, "min_block_size": 1, "use_python_runtime": True},
+)
+
+# %%
+# Write prompt and load image
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+user_prompt = "<|user|>\n"
+assistant_prompt = "<|assistant|>\n"
+prompt_suffix = "<|end|>\n"
+
+# single-image prompt
+prompt = f"{user_prompt}<|image_1|>\nWhat is shown in this image?{prompt_suffix}{assistant_prompt}"
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+print(f">>> Prompt\n{prompt}")
+
+# Bound the download with a timeout and fail on HTTP errors before decoding the image.
+image_response = requests.get(url, stream=True, timeout=30)
+image_response.raise_for_status()
+image = Image.open(image_response.raw)
+inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
+
+# %%
+# Inference
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+generate_ids = model.generate(
+    **inputs,
+    max_new_tokens=1000,
+    eos_token_id=processor.tokenizer.eos_token_id,
+)
+# Keep only the positions past the prompt length so the prompt is not decoded again.
+generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+response = processor.batch_decode(
+    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)[0]
+print(f">>> Response\n{response}")