
Commit 8369268

Generate speedup for inference (#2151)
1 parent e5d9a97

File tree

5 files changed: +73 additions, −19 deletions

- benchmarks/microbenchmarks/README.md
- benchmarks/microbenchmarks/benchmark_inference.py
- benchmarks/microbenchmarks/benchmark_runner.py
- benchmarks/microbenchmarks/test/benchmark_config.yml
- benchmarks/microbenchmarks/utils.py


benchmarks/microbenchmarks/README.md

Lines changed: 14 additions & 0 deletions
````diff
@@ -130,6 +130,18 @@ Currently, quantization string is in same format as the one being passed in llam
       max_power: 11
   ```
 
+- `small_sweep`: Generate a small sweep of shapes with increasing powers of 2 for M, K, N dimensions
+  - Parameters:
+    - `min_power`: Minimum power of 2 (default: 10, which is 1024)
+    - `max_power`: Maximum power of 2 (default: 14, which is 16,384)
+  - Note: This generates shapes where M <= K <= N (ensuring increasing order), which produces fewer combinations than the full sweep, and could be good to use for plots like heatmap
+  ```yaml
+  matrix_shapes:
+    - name: "small_sweep"
+      min_power: 10 # 2^10 = 1024
+      max_power: 15 # 2^15 = 32,768
+  ```
+
 - `sweep`: Generate a sweep of shapes with different powers of 2 for M, K, N dimensions
   - Parameters:
     - `min_power`: Minimum power of 2 (default: 8, which is 256)
@@ -142,6 +154,8 @@ Currently, quantization string is in same format as the one being passed in llam
       max_power: 9 # 2^9 = 512
   ```
 
+
+
 ## Output
 
 Results are saved to a CSV file in the specified output directory
````
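For intuition, here is a minimal standalone sketch (illustrative only, not the repo's implementation; the helper name `enumerate_small_sweep` is made up) of the shape set a `small_sweep` entry describes: only (M, K, N) triples with M <= K <= N are kept.

```python
# Sketch (not repo code): enumerate the (M, K, N) shapes a small_sweep entry
# expands to, keeping only combinations with M <= K <= N.
from itertools import combinations_with_replacement


def enumerate_small_sweep(min_power: int = 10, max_power: int = 14):
    powers = [2**p for p in range(min_power, max_power + 1)]
    # combinations_with_replacement over sorted sizes yields exactly the M <= K <= N triples
    return [list(shape) for shape in combinations_with_replacement(powers, 3)]


if __name__ == "__main__":
    print(enumerate_small_sweep(min_power=10, max_power=11))
    # [[1024, 1024, 1024], [1024, 1024, 2048], [1024, 2048, 2048], [2048, 2048, 2048]]
```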

benchmarks/microbenchmarks/benchmark_inference.py

Lines changed: 29 additions & 5 deletions
```diff
@@ -51,9 +51,28 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
         high_precision_dtype=config.high_precision_dtype,
         device=config.device,
     )
+    # Copy base model for quantizing
+    m_copy = deepcopy(base_model)
+
+    # Run benchmarks
+    result = BenchmarkResult(config=config)
+
+    # Store result in model for memory profiling
+    base_model._benchmark_result = result
+
+    # Run baseline benchmarking
+    base_model = base_model.eval().to(config.device)
+    if config.use_torch_compile:
+        print("Compiling baseline model....")
+        base_model = torch.compile(
+            base_model, mode=config.torch_compile_mode, fullgraph=True
+        )
+    # Benchmark time to run an inference call for baseline model
+    print("Benchmarking baseline inference.....")
+    result.baseline_inference_time_in_ms = model_inference_time_in_ms(
+        model=base_model, input_data=input_data
+    )
 
-    # Use quantize_ to apply each quantization function to the model
-    m_copy = deepcopy(base_model).eval().to(config.device)
     ao_base_config = string_to_config(
         config.quantization,
         config.sparsity,
@@ -79,24 +98,29 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
         pass  # No quantization or sparsity specified, do nothing
     else:
         print("Quantizing model....")
+        m_copy = m_copy.eval().to(config.device)
         quantize_(m_copy, ao_base_config)
 
     if config.use_torch_compile:
-        print("Compiling model....")
+        print("Compiling quantized model....")
         m_copy = torch.compile(
             m_copy, mode=config.torch_compile_mode, fullgraph=True
         )
 
-    # Run benchmarks
-    result = BenchmarkResult(config=config)
     # Store result in model for memory profiling
     m_copy._benchmark_result = result
 
     # Benchmark time to run an inference call for quantized model
+    print("Benchmarking quantized model.....")
     result.model_inference_time_in_ms = model_inference_time_in_ms(
         model=m_copy, input_data=input_data
     )
 
+    # Calculate speedup w.r.t. baseline
+    result.speedup = round(
+        result.baseline_inference_time_in_ms / result.model_inference_time_in_ms, 2
+    )
+
     # Run profiler if enabled
     if config.enable_profiler:
         print("Running profiler...")
```

benchmarks/microbenchmarks/benchmark_runner.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -76,6 +76,20 @@ def get_shapes_for_config(
             val2 = 2**power_of_2 + 2 ** (power_of_2 - 1)
             shapes.append((f"{name}_{idx * 2}", [val1, val1, val1]))
             shapes.append((f"{name}_{idx * 2 + 1}", [val2, val2, val2]))
+    elif name == "small_sweep":
+        # Generate a small sweep of shapes with increasing powers of 2 for M, K, N
+        min_p2 = shape_config.get("min_power", 10)  # 1024
+        max_p2 = shape_config.get("max_power", 14)  # 16,384
+        counter = 0
+        for M_p2 in range(min_p2, max_p2 + 1):
+            M = 2**M_p2
+            for K_p2 in range(min_p2, max_p2 + 1):
+                K = 2**K_p2
+                for N_p2 in range(min_p2, max_p2 + 1):
+                    N = 2**N_p2
+                    if M <= K <= N:  # Ensure increasing order
+                        shapes.append((f"{name}_{counter}", [M, K, N]))
+                        counter += 1
     elif name == "sweep":
         # Generate a sweep of shapes with different powers of 2 for M, K, N
         min_p2 = shape_config.get("min_power", 8)  # 256
@@ -202,7 +216,7 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None
         print("----------------------------------------")
         try:
             print(
-                f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}"
+                f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity} for {config.shape_name}: {config.m, config.k, config.n}"
             )
             result = run_inference(config)  # Pass the config object directly
             if result is not None:  # Only add successful results
```
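Because of the `M <= K <= N` filter, `small_sweep` effectively enumerates combinations with replacement of the power-of-2 sizes rather than the full M × K × N cross product. A quick standalone sketch of the resulting shape counts (the helper is illustrative, not repo code):

```python
# Sketch: count how many shapes small_sweep yields versus the unfiltered
# cross product over the same power-of-2 range.
import math


def small_sweep_count(min_power: int, max_power: int) -> int:
    n = max_power - min_power + 1  # number of distinct sizes in the range
    return math.comb(n + 2, 3)     # multisets of size 3 == triples with M <= K <= N


for lo, hi in [(10, 14), (14, 16)]:
    n = hi - lo + 1
    print(lo, hi, small_sweep_count(lo, hi), "vs", n**3)
# prints: 10 14 35 vs 125
#         14 16 10 vs 27
```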

benchmarks/microbenchmarks/test/benchmark_config.yml

Lines changed: 4 additions & 10 deletions
```diff
@@ -3,18 +3,15 @@ benchmark_mode: "inference"
 quantization_config_recipe_names: # Will run a baseline inference for model by default, without quantization for comparison
   - "int8wo"
   - "int8dq"
-  - "float8dq"
+  - "float8dq-tensor"
   - "float8wo"
 output_dir: "benchmarks/microbenchmarks/results"
 model_params:
   - name: "small_bf16_linear"
     matrix_shapes:
-      - name: "custom"
-        shapes: [
-          [1024, 1024, 1024], # [m, k, n]
-          [2048, 4096, 1024],
-          [4096, 4096, 1024]
-        ]
+      - name: "small_sweep"
+        min_power: 14
+        max_power: 16
     high_precision_dtype: "torch.bfloat16"
     use_torch_compile: true
     torch_compile_mode: "max-autotune"
@@ -60,9 +57,6 @@ model_params:
       - name: "pow2_extended" # Example of using extended power of 2 shapes
         min_power: 10 # 1024
        max_power: 11 # 2048
-      - name: "sweep" # Example of using sweep shapes (commented out as it generates many shapes)
-        min_power: 8 # 256
-        max_power: 9 # 512
     high_precision_dtype: "torch.bfloat16"
     use_torch_compile: true
     torch_compile_mode: "max-autotune"
```
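A small sketch of inspecting this config (assuming PyYAML is available; the expansion is re-derived here for illustration, not taken from the runner): the updated `small_sweep` entry with `min_power: 14` and `max_power: 16` covers the sizes 16,384, 32,768, and 65,536.

```python
# Sketch: read the benchmark config and list the sizes its small_sweep entry covers.
# Assumes PyYAML is installed; the expansion below is illustrative, not the repo's code.
import yaml

with open("benchmarks/microbenchmarks/test/benchmark_config.yml") as f:
    cfg = yaml.safe_load(f)

shape_cfg = cfg["model_params"][0]["matrix_shapes"][0]
sizes = [2**p for p in range(shape_cfg["min_power"], shape_cfg["max_power"] + 1)]
print(sizes)  # [16384, 32768, 65536] for min_power: 14, max_power: 16
```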

benchmarks/microbenchmarks/utils.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -124,7 +124,9 @@ def __init__(
     ):
         self.config = config
         self.output_dir = config.output_dir
+        self.baseline_inference_time_in_ms = 0.0
         self.model_inference_time_in_ms = 0.0
+        self.speedup = 0.0
         self.profiler_json_path: Optional[str] = None
         self.memory_profile_path: Optional[str] = None
         self.memory_visualization_path: Optional[str] = None
@@ -134,7 +136,9 @@ def to_dict(self) -> Dict[str, Any]:
         """Convert result to dictionary for main function"""
         result_dict = {
             **self.config.to_dict(),
+            "baseline_inference_time_in_ms": self.baseline_inference_time_in_ms,
             "model_inference_time_in_ms": self.model_inference_time_in_ms,
+            "speedup": self.speedup,
             "profiler_json_path": self.profiler_json_path,
             "memory_profile_path": self.memory_profile_path,
             "memory_visualization_path": self.memory_visualization_path,
@@ -299,7 +303,7 @@ def model_inference_time_in_ms(model, input_data):
         input_data: Input data for the model
 
     Returns:
-        float: Median inference time in microseconds
+        float: Median inference time in milliseconds
     """
     # First run to trigger any compilation/lazy initialization
 
@@ -315,8 +319,8 @@ def model_inference_time_in_ms(model, input_data):
     measurement = timer.timeit(number=100)
     res = measurement.mean
 
-    # Convert to microseconds
-    return res * 1e6
+    # Convert to milliseconds
+    return (res * 1e6) / 1000  # Convert microseconds to milliseconds
 
 
 def clean_caches():
@@ -386,7 +390,9 @@ def print_results(results: List[BenchmarkResult]):
             result.config.quantization or "baseline",
             result.config.sparsity or "none",
             f"{result.config.shape_name} ({result.config.m}, {result.config.k}, {result.config.n})",
+            f"{result.baseline_inference_time_in_ms:.2f}",
             f"{result.model_inference_time_in_ms:.2f}",
+            f"{result.speedup:.2f}x",
             str(result.config.enable_profiler),
         ]
@@ -398,7 +404,9 @@ def print_results(results: List[BenchmarkResult]):
             "Quantization",
             "Sparsity",
             "Shape",
+            "Baseline Inference Time (ms)",
             "Inference Time (ms)",
+            "Speedup",
             "Profiler Enabled",
         ]
```
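For context on the unit fix: `torch.utils.benchmark` reports `Measurement.mean` in seconds, so scaling by 1e3 yields milliseconds (the committed `(res * 1e6) / 1000` is equivalent). A minimal standalone sketch of that measurement pattern, assuming a toy model and input:

```python
# Sketch of the timing pattern used by model_inference_time_in_ms:
# Timer.timeit(...).mean is seconds per run, so multiply by 1e3 for milliseconds.
import torch
from torch.utils.benchmark import Timer

model = torch.nn.Linear(1024, 1024)  # toy stand-in for the benchmarked model
input_data = torch.randn(1, 1024)

with torch.no_grad():
    model(input_data)  # warm-up run to trigger any lazy initialization

timer = Timer(stmt="model(input_data)", globals={"model": model, "input_data": input_data})
mean_seconds = timer.timeit(number=100).mean  # mean seconds per call
print(f"{mean_seconds * 1e3:.2f} ms")         # same value as (mean_seconds * 1e6) / 1000
```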
