benchmark.py
# -----------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# -----------------------------------------------------------------------------
from argparse import ArgumentParser
from copy import deepcopy

from olive.cli.base import (
    BaseOliveCLICommand,
    add_input_model_options,
    add_logging_options,
    add_save_config_file_options,
    add_shared_cache_options,
    add_telemetry_options,
    get_input_model_config,
    update_shared_cache_options,
)
from olive.common.utils import set_nested_dict_value
from olive.telemetry import action


class BenchmarkCommand(BaseOliveCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        sub_parser = parser.add_parser("benchmark", help="Evaluate the model using lm-eval.")

        # model options
        add_input_model_options(
            sub_parser, enable_hf=True, enable_hf_adapter=True, enable_pt=True, default_output_path="onnx-model"
        )

        # lm-eval options
        lmeval_group = sub_parser.add_argument_group("lm-eval evaluator options")
        lmeval_group.add_argument(
            "--tasks",
            type=str,
            required=True,
            nargs="*",
            help="List of tasks to evaluate on.",
        )
        lmeval_group.add_argument(
            "--device",
            type=str,
            default="cpu",
            choices=["cpu", "gpu"],
            help="Target device for evaluation.",
        )
        lmeval_group.add_argument(
            "--batch_size",
            type=int,
            default=1,
            help="Batch size.",
        )
        lmeval_group.add_argument(
            "--max_length",
            type=int,
            default=1024,
            help="Maximum length of input + output.",
        )
        lmeval_group.add_argument(
            "--limit",
            type=float,
            default=1,
            help="Number (or percentage of dataset) of samples to use for evaluation.",
        )
        lmeval_group.add_argument(
            "--backend",
            type=str,
            default="auto",
            choices=["auto", "ort", "ortgenai"],
            help="Backend for ONNX model evaluation. Use 'auto' to infer backend from model type.",
        )
        lmeval_group.add_argument(
            "--confirm_run_unsafe_code",
            action="store_true",
            default=False,
            help="Allow running tasks that execute model-generated code (e.g., MBPP, HumanEval).",
        )

        add_logging_options(sub_parser)
        add_save_config_file_options(sub_parser)
        add_shared_cache_options(sub_parser)
        add_telemetry_options(sub_parser)
        sub_parser.set_defaults(func=BenchmarkCommand)

    @action
    def run(self):
        return self._run_workflow()

    def _get_run_config(self, tempdir: str) -> dict:
        # start from the run-config template and overlay the CLI arguments
        config = deepcopy(TEMPLATE)

        input_model_config = get_input_model_config(self.args)
        assert input_model_config["type"].lower() in {
            "hfmodel",
            "pytorchmodel",
            "onnxmodel",
        }, "Only HfModel, PyTorchModel and OnnxModel are supported in benchmark command."

        if self.args.backend != "auto" and input_model_config["type"].lower() != "onnxmodel":
            raise ValueError("--backend is only supported for ONNX input models.")

        to_replace = [
            ("input_model", input_model_config),
            ("output_dir", self.args.output_path),
            ("log_severity_level", self.args.log_level),
            (("systems", "local_system", "accelerators", 0, "device"), self.args.device),
            (
                ("systems", "local_system", "accelerators", 0, "execution_providers"),
                ["CUDAExecutionProvider" if self.args.device == "gpu" else "CPUExecutionProvider"],
            ),
            (("evaluators", "evaluator", "tasks"), self.args.tasks),
            (("evaluators", "evaluator", "device"), self.args.device),
            (("evaluators", "evaluator", "batch_size"), self.args.batch_size),
            (("evaluators", "evaluator", "max_length"), self.args.max_length),
            (("evaluators", "evaluator", "limit"), self.args.limit),
            (
                ("evaluators", "evaluator", "model_class"),
                None if self.args.backend == "auto" else self.args.backend,
            ),
            (
                ("evaluators", "evaluator", "confirm_run_unsafe_code"),
                self.args.confirm_run_unsafe_code or None,
            ),
        ]
        # entries whose value is None are skipped so the template defaults are kept
        for keys, value in to_replace:
            if value is not None:
                set_nested_dict_value(config, keys, value)

        update_shared_cache_options(config, self.args)

        return config
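
# A rough sketch of how `_get_run_config` patches TEMPLATE below (assumption: the
# `set_nested_dict_value` helper walks nested dicts/lists by the keys/indices in a
# tuple path and assigns the final value in place):
#
#     cfg = {"evaluators": {"evaluator": {"batch_size": 16}}}
#     set_nested_dict_value(cfg, ("evaluators", "evaluator", "batch_size"), 4)
#     # cfg["evaluators"]["evaluator"]["batch_size"] is now 4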


TEMPLATE = {
    "systems": {
        "local_system": {
            "type": "LocalSystem",
            "accelerators": [{"device": "cpu", "execution_providers": ["CPUExecutionProvider"]}],
        }
    },
    "evaluators": {
        "evaluator": {
            "type": "LMEvaluator",
            "tasks": [],
            "batch_size": 16,
            "max_length": 1024,
            "device": "cpu",
            "limit": 64,
        }
    },
    "evaluator": "evaluator",
    "host": "local_system",
    "target": "local_system",
    "no_artifacts": True,
}
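
# Example invocation sketch (assumptions: the subcommand is exposed through the standard
# `olive` CLI entry point, and `-m/--model_name_or_path` is the model option registered
# by `add_input_model_options`; exact flag spellings may differ):
#
#     olive benchmark -m microsoft/Phi-3.5-mini-instruct \
#         --tasks hellaswag arc_easy --device cpu --batch_size 8 --limit 0.1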