
Commit 574ae57

Merge remote-tracking branch 'origin' into kylesayrs/remove-double-init
2 parents: a77bd0b + 29ddedb


100 files changed: +1771 additions, -632 deletions


.github/workflows/test-check-transformers.yaml

Lines changed: 33 additions & 1 deletion
@@ -15,9 +15,41 @@ env:
   CLEARML_API_SECRET_KEY: ${{ secrets.CLEARML_API_SECRET_KEY }}

 jobs:
+  detect-changes:
+    runs-on: ubuntu-latest
+
+    outputs:
+      changes-present: ${{ steps.changed-files.outputs.any_modified }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Get changed files
+        id: changed-files
+        uses: tj-actions/changed-files@v45
+        with:
+          files: |
+            **
+            !examples/**
+            !tests/e2e/**
+            !tests/lmeval/**
+            !tests/examples/**
+            !**/*.md
+            !.github/**
+            .github/workflows/test-check-transformers.yaml
+
+      - name: Log relevant output
+        run: |
+          echo "changes-present: ${{ steps.changed-files.outputs.any_modified }}"
+          echo "all modified files: ${{ steps.changed-files.outputs.all_modified_files }}"
+        shell: bash
+
   transformers-tests:
+    needs: [detect-changes]
     runs-on: gcp-k8s-vllm-l4-solo
-    if: contains(github.event.pull_request.labels.*.name, 'ready') || github.event_name == 'push'
+    if: (contains(github.event.pull_request.labels.*.name, 'ready') || github.event_name == 'push') && needs.detect-changes.outputs.changes-present == 'true'
     steps:
       - uses: actions/setup-python@v5
         with:

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -800,5 +800,6 @@ integrations/pytorch/pytorch_vision*
 nm_temp_test_logs/*
 sparse_logs/*
 wandb/
+timings/
 output_finetune/
 env_log.json

README.md

Lines changed: 2 additions & 3 deletions
@@ -56,10 +56,9 @@ Note that the model can be swapped for a local or remote HF-compatible checkpoin
 Quantization is applied by selecting an algorithm and calling the `oneshot` API.

 ```python
-from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-from llmcompressor.transformers import oneshot
-from transformers import AutoModelForCausalLM
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor import oneshot

 # Select quantization algorithm. In this case, we:
 # * apply SmoothQuant to make the activations easier to quantize
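Most of the hunks below make the same one-line change: `oneshot` is now imported from the top-level `llmcompressor` package instead of `llmcompressor.transformers`. A minimal sketch of how the updated README imports fit together; the model ID, dataset name, and modifier settings are illustrative placeholders, not values from this commit:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

# Placeholder recipe: SmoothQuant to smooth activations, then GPTQ weight quantization.
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder HF checkpoint
    dataset="open_platypus",                     # placeholder calibration dataset
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```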

examples/big_models_with_accelerate/cpu_offloading_fp8.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
-from llmcompressor.transformers import oneshot

 MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
 OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"

examples/big_models_with_accelerate/mult_gpus_int8_device_map.py

Lines changed: 1 addition & 1 deletion
@@ -2,9 +2,9 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

 MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

examples/big_models_with_accelerate/multi_gpu_int8.py

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot

 MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic"

examples/multimodal_audio/whisper_example.py

Lines changed: 1 addition & 1 deletion
@@ -2,8 +2,8 @@
 from datasets import load_dataset
 from transformers import WhisperProcessor

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration

 # Select model and load it.

examples/multimodal_vision/idefics3_example.py

Lines changed: 1 addition & 1 deletion
@@ -4,8 +4,8 @@
 from PIL import Image
 from transformers import AutoProcessor

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableIdefics3ForConditionalGeneration

 # Load model.

examples/multimodal_vision/llava_example.py

Lines changed: 1 addition & 1 deletion
@@ -3,8 +3,8 @@
 from PIL import Image
 from transformers import AutoProcessor

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration

 # Load model.

examples/multimodal_vision/mllama_example.py

Lines changed: 1 addition & 1 deletion
@@ -3,8 +3,8 @@
 from PIL import Image
 from transformers import AutoProcessor

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableMllamaForConditionalGeneration

 # Load model.

examples/multimodal_vision/phi3_vision_example.py

Lines changed: 1 addition & 1 deletion
@@ -5,8 +5,8 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoProcessor

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot

 # Load model.
 model_id = "microsoft/Phi-3-vision-128k-instruct"

examples/multimodal_vision/pixtral_example.py

Lines changed: 1 addition & 1 deletion
@@ -3,8 +3,8 @@
 from PIL import Image
 from transformers import AutoProcessor

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration

 # Load model.

examples/multimodal_vision/qwen2_vl_example.py

Lines changed: 1 addition & 1 deletion
@@ -6,8 +6,8 @@
 from qwen_vl_utils import process_vision_info
 from transformers import AutoProcessor

+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableQwen2VLForConditionalGeneration

 # Load model.
Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
+import base64
+from io import BytesIO
+
+import torch
+from datasets import load_dataset
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.tracing import (
+    TraceableQwen2_5_VLForConditionalGeneration,
+)
+
+# Load model.
+model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
+model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype="auto",
+)
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+# Oneshot arguments
+DATASET_ID = "lmms-lab/flickr30k"
+DATASET_SPLIT = {"calibration": "test[:512]"}
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+ds = ds.shuffle(seed=42)
+
+
+# Apply chat template and tokenize inputs.
+def preprocess_and_tokenize(example):
+    # preprocess
+    buffered = BytesIO()
+    example["image"].save(buffered, format="PNG")
+    encoded_image = base64.b64encode(buffered.getvalue())
+    encoded_image_text = encoded_image.decode("utf-8")
+    base64_qwen = f"data:image;base64,{encoded_image_text}"
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": base64_qwen},
+                {"type": "text", "text": "What does the image show?"},
+            ],
+        }
+    ]
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    # tokenize
+    return processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+    )
+
+
+ds = ds.map(preprocess_and_tokenize, remove_columns=ds["calibration"].column_names)
+
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+    assert len(batch) == 1
+    return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+
+# Recipe
+recipe = [
+    GPTQModifier(
+        targets="Linear",
+        scheme="W4A16",
+        sequential_targets=["Qwen2_5_VLDecoderLayer"],
+        ignore=["lm_head", "re:visual.*"],
+    ),
+]
+
+# Perform oneshot
+oneshot(
+    model=model,
+    tokenizer=model_id,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    trust_remote_code_model=True,
+    data_collator=data_collator,
+)
+
+# Confirm generations of the quantized model look sane.
+print("========== SAMPLE GENERATION ==============")
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
+            },
+            {"type": "text", "text": "Please describe the animal in this image\n"},
+        ],
+    }
+]
+prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(
+    text=[prompt],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=False,
+    max_length=MAX_SEQUENCE_LENGTH,
+    truncation=True,
+    return_tensors="pt",
+).to("cuda")
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
+print("==========================================")
+
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 6 additions & 4 deletions
@@ -33,6 +33,7 @@
 bf16 = False # using full precision for training
 lr_scheduler_type = "cosine"
 warmup_ratio = 0.1
+preprocessing_num_workers = 8

 # this will run the recipe stage by stage:
 # oneshot sparsification -> finetuning -> oneshot quantization
@@ -52,10 +53,11 @@
     learning_rate=learning_rate,
     lr_scheduler_type=lr_scheduler_type,
     warmup_ratio=warmup_ratio,
+    preprocessing_num_workers=preprocessing_num_workers,
 )
 logger.info(
-    "Note: llcompressor does not currently support running ",
-    "compressed models in the marlin-24 format. The model ",
-    "produced from this example can be run on vLLM with ",
-    "dtype=torch.float16",
+    "llmcompressor does not currently support running compressed models in the marlin24 format." # noqa
+)
+logger.info(
+    "The model produced from this example can be run on vLLM with dtype=torch.float16"
 )

examples/quantization_kv_cache/README.md

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ Configure and apply the FP8 quantization for weights, activations, and KV cache.
 Notice the new `kv_cache_scheme` section:

 ```python
-from llmcompressor.transformers import oneshot
+from llmcompressor import oneshot

 recipe = """
 quant_stage:
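For context, a trimmed-down sketch of how a KV-cache recipe is applied through the relocated `oneshot` entrypoint; the scheme fields and the model/dataset names below are assumptions patterned on the FP8 KV-cache examples in this commit, not the README's full recipe:

```python
from llmcompressor import oneshot

# Assumed minimal recipe: quantize only the KV cache to FP8 (per-tensor, static).
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            kv_cache_scheme:
                num_bits: 8
                type: float
                strategy: tensor
                dynamic: false
                symmetric: true
"""

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder checkpoint
    dataset="ultrachat_200k",                     # placeholder calibration dataset
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```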

examples/quantization_kv_cache/gemma2_fp8_kv_example.py

Lines changed: 5 additions & 1 deletion
@@ -1,7 +1,7 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

-from llmcompressor.transformers import oneshot
+from llmcompressor import oneshot

 # Select model and load it.
 MODEL_ID = "google/gemma-2-9b-it"
@@ -86,6 +86,10 @@ def process_and_tokenize(example):
     "Please use vLLM for inference with the quantized kv_cache.",
 )
 # Confirm generations of the quantized model look sane.
+
+# NOTE: transformers 4.49.0 results in a generation error with gemma2.
+# Consider either downgrading your transformers version to a previous version
+# or use vLLM for sample generation.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
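The NOTE added above points to vLLM as a workaround for the transformers 4.49.0 generation error. A hedged sketch of that route; the save directory name is assumed to follow the example's MODEL_ID-derived pattern and is not part of this commit:

```python
from vllm import LLM, SamplingParams

# Load the compressed checkpoint written by the example's save_pretrained call.
llm = LLM(model="gemma-2-9b-it-FP8-KV")  # assumed output directory name
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```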

examples/quantization_kv_cache/llama3_fp8_kv_example.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer

-from llmcompressor.transformers import oneshot
+from llmcompressor import oneshot

 # Select model and load it.
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

examples/quantization_kv_cache/phi3.5_fp8_kv_example.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

-from llmcompressor.transformers import oneshot
+from llmcompressor import oneshot

 # Select model and load it.
 # Phi-3.5 is a special case for KV cache quantization because it has

examples/quantization_w4a16/README.md

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ In our case, we will apply the default GPTQ recipe for `int4` (which uses static
 > See the `Recipes` documentation for more information on making complex recipes

 ```python
-from llmcompressor.transformers import oneshot
+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier

 # Configure the quantization algorithm to run.
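As a sketch of the W4A16 flow this README describes, using the relocated import; the model and dataset names are placeholders rather than values from this commit:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Default GPTQ int4 recipe: quantize Linear weights, skip the LM head.
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder checkpoint
    dataset="open_platypus",                      # placeholder calibration dataset
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```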

examples/quantization_w8a8_fp8/README.md

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which
 Since simple PTQ does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.

 ```python
-from llmcompressor.transformers import oneshot
+from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier

 # Configure the simple PTQ quantization
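And a corresponding sketch of the data-free FP8 dynamic flow with the updated import; the model ID is a placeholder:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# FP8_DYNAMIC: weights are quantized statically and activations dynamically per token,
# so no calibration dataset is passed to oneshot.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder checkpoint
    recipe=recipe,
)
```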
