1
1
from typing import Any , Callable
2
2
3
3
import torch
4
+ from .utils import get_arch_name , write_json_result
4
5
6
+ _OUTPUT_JSON_PATH = "benchmark_results"
5
7
6
8
def setup_baseline ():
7
9
from torchao .quantization .utils import recommended_inductor_config_setter
@@ -11,6 +13,21 @@ def setup_baseline():
11
13
torch ._dynamo .config .cache_size_limit = 10000
12
14
13
15
16
def benchmark_and_write_json_result(model, args, kwargs, quantization, device):
    """Compile, benchmark, and record a model's inference latency.

    The model is compiled with ``torch.compile(mode="max-autotune")``,
    warmed up, timed, and the average latency is appended to the JSON
    results file at ``_OUTPUT_JSON_PATH``.

    Args:
        model: the ``torch.nn.Module`` to benchmark (compiled in place here).
        args: positional inputs forwarded to the model on each timed call.
        kwargs: keyword inputs forwarded to the model on each timed call.
        quantization: label for the quantization mode being measured
            (e.g. "noquant", "autoquant"); recorded as the "dtype" column.
        device: device label recorded in the result row (e.g. "cuda").
    """
    print(quantization + " run")
    # Imported lazily so this module does not hard-require torchao at import time.
    from torchao.utils import benchmark_model

    model = torch.compile(model, mode="max-autotune")
    # Warmup: 20 untimed iterations so compilation/autotuning cost is
    # excluded from the measured 100-iteration average.
    benchmark_model(model, 20, args, kwargs)
    elapsed_time = benchmark_model(model, 100, args, kwargs)
    print("elapsed_time: ", elapsed_time, " milliseconds")

    # torch.compile wraps the module; _orig_mod is the original user model.
    name = model._orig_mod.__class__.__name__
    headers = ["name", "dtype", "device", "arch", "metric", "actual", "target"]
    arch = get_arch_name()
    # "dtype" column carries the quantization label; no latency target is set.
    performance_result = [
        name,
        quantization,
        device,
        arch,
        "time_ms(avg)",
        elapsed_time,
        None,
    ]
    write_json_result(_OUTPUT_JSON_PATH, headers, performance_result)
30
+
14
31
def torchao_optimize_ctx (quantization : str ):
15
32
from torchao .quantization .quant_api import (
16
33
autoquant ,
@@ -20,10 +37,21 @@ def torchao_optimize_ctx(quantization: str):
20
37
quantize_ ,
21
38
)
22
39
from torchao .utils import unwrap_tensor_subclass
40
+ import torchao
23
41
24
42
def inner (model_iter_fn : Callable ):
25
43
def _torchao_apply (module : torch .nn .Module , example_inputs : Any ):
26
44
if getattr (module , "_quantized" , None ) is None :
45
+ if quantization == "noquant" :
46
+ if isinstance (example_inputs , dict ):
47
+ args = ()
48
+ kwargs = example_inputs
49
+ else :
50
+ args = example_inputs
51
+ kwargs = {}
52
+
53
+ benchmark_and_write_json_result (module , args , kwargs , "noquant" , "cuda" )
54
+
27
55
if quantization == "int8dynamic" :
28
56
quantize_ (
29
57
module ,
@@ -47,6 +75,16 @@ def _torchao_apply(module: torch.nn.Module, example_inputs: Any):
47
75
"NotAutoquantizable"
48
76
f"Found no autoquantizable layers in model { type (module )} , stopping autoquantized run"
49
77
)
78
+
79
+ if isinstance (example_inputs , dict ):
80
+ args = ()
81
+ kwargs = example_inputs
82
+ else :
83
+ args = example_inputs
84
+ kwargs = {}
85
+
86
+ torchao .quantization .utils .recommended_inductor_config_setter ()
87
+ benchmark_and_write_json_result (module , args , kwargs , "autoquant" , "cuda" )
50
88
else :
51
89
unwrap_tensor_subclass (module )
52
90
setattr (module , "_quantized" , True ) # noqa: B010
0 commit comments