from pydantic import (
    BaseModel,
    ConfigDict,
    constr,
    Field,
    PrivateAttr,
    field_validator,
)
from typing import List, Literal, Optional, Union

CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]
CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")]


class Metadata(BaseModel):
    """metadata model for config options"""

    include_in_config: Optional[bool] = Field(True)


class BaseConfigModel(BaseModel):
    """Base model for config models with added metadata"""

    _metadata: Metadata = PrivateAttr(Metadata())


class ConfigOverrideConfig(BaseConfigModel):
    """Model for overriding a provided config file."""

    # TODO: convert this to a pathlib.path?
    config: Optional[str] = Field(
        None, description=("Path to an overriding config.yml file")
    )

    _metadata: Metadata = PrivateAttr(Metadata(include_in_config=False))


class NetworkConfig(BaseConfigModel):
    """Options for networking"""

    host: Optional[str] = Field(
        "127.0.0.1",
        description=(
            "The IP to host on (default: 127.0.0.1).\n"
            "Use 0.0.0.0 to expose on all network adapters."
        ),
    )
    port: Optional[int] = Field(
        5000, description=("The port to host on (default: 5000).")
    )
    disable_auth: Optional[bool] = Field(
        False,
        description=(
            "Disable HTTP token authentication with requests.\n"
            "WARNING: This will make your instance vulnerable!\n"
            "Turn on this option if you are ONLY connecting from localhost."
        ),
    )
    disable_fetch_requests: Optional[bool] = Field(
        False,
        description=(
            "Disable fetching external content in response to requests, "
            "such as images from URLs."
        ),
    )
    send_tracebacks: Optional[bool] = Field(
        False,
        description=(
            "Send tracebacks over the API (default: False).\n"
            "NOTE: Only enable this for debug purposes."
        ),
    )
    api_servers: Optional[List[Literal["oai", "kobold"]]] = Field(
        ["OAI"],
        description=(
            'Select API servers to enable (default: ["OAI"]).\n'
            "Possible values: OAI, Kobold."
        ),
    )

    # Converts all strings in the api_servers list to lowercase
    # NOTE: Expand if more models need this validator
    @field_validator("api_servers", mode="before")
    def api_server_validator(cls, api_servers):
        return [server_name.lower() for server_name in api_servers]


# TODO: Migrate config.yml to have the log_ prefix
# This is a breaking change.
class LoggingConfig(BaseConfigModel):
    """Options for logging"""

    log_prompt: Optional[bool] = Field(
        False,
        description=("Enable prompt logging (default: False)."),
    )
    log_generation_params: Optional[bool] = Field(
        False,
        description=("Enable generation parameter logging (default: False)."),
    )
    log_requests: Optional[bool] = Field(
        False,
        description=(
            "Enable request logging (default: False).\n"
            "NOTE: Only use this for debugging!"
        ),
    )


class ModelConfig(BaseConfigModel):
    """
    Options for model overrides and loading
    Please read the comments to understand how arguments are handled
    between initial and API loads
    """

    # TODO: convert this to a pathlib.path?
    model_dir: str = Field(
        "models",
        description=(
            "Directory to look for models (default: models).\n"
            "Windows users, do NOT put this path in quotes!"
        ),
    )
    inline_model_loading: Optional[bool] = Field(
        False,
        description=(
            "Allow direct loading of models "
            "from a completion or chat completion request (default: False).\n"
            "This method of loading is strict by default.\n"
            "Enable dummy models to add exceptions for invalid model names."
        ),
    )
    use_dummy_models: Optional[bool] = Field(
        False,
        description=(
            "Sends dummy model names when the models endpoint is queried "
            "(default: False).\n"
            "Enable this if the client is looking for specific OAI models."
        ),
    )
    dummy_model_names: List[str] = Field(
        default=["gpt-3.5-turbo"],
        description=(
            "A list of fake model names that are sent via the /v1/models endpoint. "
            '(default: ["gpt-3.5-turbo"])\n'
            "Also used as bypasses for strict mode if inline_model_loading is true."
        ),
    )
    model_name: Optional[str] = Field(
        None,
        description=(
            "An initial model to load.\n"
            "Make sure the model is located in the model directory!\n"
            "REQUIRED: This must be filled out to load a model on startup."
        ),
    )
    use_as_default: List[str] = Field(
        default_factory=list,
        description=(
            "Names of args to use as a fallback for API load requests (default: []).\n"
            "For example, if you always want cache_mode to be Q4 "
            'instead of only on the initial model load, add "cache_mode" to this array.\n'
            "Example: ['max_seq_len', 'cache_mode']."
        ),
    )
    backend: Optional[str] = Field(
        None,
        description=(
            "Backend to use for this model (auto-detect if not specified).\n"
            "Options: exllamav2, exllamav3"
        ),
    )
    max_seq_len: Optional[int] = Field(
        None,
        description=(
            "Max sequence length (default: 4096).\n"
            "Set to -1 to fetch from the model's config.json."
        ),
        ge=-1,
    )
    cache_size: Optional[int] = Field(
        None,
        description=(
            "Size of the prompt cache to allocate (default: max_seq_len).\n"
            "Must be a multiple of 256 and can't be less than max_seq_len.\n"
            "For CFG, set this to 2 * max_seq_len."
        ),
        multiple_of=256,
        gt=0,
    )
    cache_mode: Optional[CACHE_TYPE] = Field(
        "FP16",
        description=(
            "Enable different cache modes for VRAM savings (default: FP16).\n"
            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
            "are integers from 2-8 (i.e. 8,8)."
        ),
    )
    tensor_parallel: Optional[bool] = Field(
        False,
        description=(
            "Load model with tensor parallelism (default: False).\n"
            "Falls back to autosplit if GPU split isn't provided.\n"
            "This ignores the gpu_split_auto value."
        ),
    )
    tensor_parallel_backend: Optional[str] = Field(
        "native",
        description=(
            "Sets a backend type for tensor parallelism (default: native).\n"
            "Options: native, nccl\n"
            "Native is recommended for PCIe GPUs.\n"
            "NCCL is recommended for NVLink."
        ),
    )
    gpu_split_auto: Optional[bool] = Field(
        True,
        description=(
            "Automatically allocate resources to GPUs (default: True).\n"
            "Not parsed for single GPU users."
        ),
    )
    autosplit_reserve: List[float] = Field(
        [96],
        description=(
            "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
            "Represented as an array of MB per GPU."
        ),
    )
    gpu_split: List[float] = Field(
        default_factory=list,
        description=(
            "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
            "Used with tensor parallelism."
        ),
    )
    rope_scale: Optional[float] = Field(
        1.0,
        description=(
            "Rope scale (default: 1.0).\n"
            "Same as compress_pos_emb.\n"
            "Use if the model was trained on long context with rope.\n"
            "Leave blank to pull the value from the model."
        ),
    )
    rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
        None,
        description=(
            "Rope alpha (default: None).\n"
            'Same as alpha_value. Set to "auto" to auto-calculate.\n'
            "Leaving this value blank will either pull from the model "
            "or auto-calculate."
        ),
    )
    chunk_size: Optional[int] = Field(
        2048,
        description=(
            "Chunk size for prompt ingestion (default: 2048).\n"
            "A lower value reduces VRAM usage but decreases ingestion speed.\n"
            "NOTE: Effects vary depending on the model.\n"
            "An ideal value is between 512 and 4096."
        ),
        gt=0,
    )
    output_chunking: Optional[bool] = Field(
        True,
        description=(
            "Use output chunking (default: True).\n"
            "Instead of allocating cache space for the entire completion at once, "
            "allocate in chunks as needed.\n"
            "Used by EXL3 models only."
        ),
    )
    max_batch_size: Optional[int] = Field(
        None,
        description=(
            "Set the maximum number of prompts to process at one time "
            "(default: None/Automatic).\n"
            "Automatically calculated if left blank.\n"
            "NOTE: Only available for Nvidia Ampere (30 series) and above GPUs."
        ),
        ge=1,
    )
    prompt_template: Optional[str] = Field(
        None,
        description=(
            "Set the prompt template for this model (default: None).\n"
            "If empty, attempts to look for the model's chat template.\n"
            "If a model contains multiple templates in its tokenizer_config.json,\n"
            "set prompt_template to the name of the template you want to use.\n"
            "NOTE: Only works with chat completion message lists!"
        ),
    )
    reasoning_parser: Optional[str] = Field(
        None,
        description=(
            "Reasoning parser key used to split output into reasoning/content.\n"
            "Compatible with vLLM parser naming (e.g. exaone4, deepseek_r1).\n"
            "If omitted, defaults to 'basic'."
        ),
    )
    enable_auto_tool_choice: Optional[bool] = Field(
        False,
        description=(
            "Enable auto tool choice for chat completions (default: False).\n"
            "Equivalent to vLLM's --enable-auto-tool-choice.\n"
            "Requires tool_call_parser to be set."
        ),
    )
    tool_call_parser: Optional[str] = Field(
        None,
        description=(
            "Tool parser key for model-generated tool call output.\n"
            "Equivalent to vLLM's --tool-call-parser.\n"
            "Built-in parser keys include: hermes, llama/llama3_json/llama4_json,\n"
            "openai, pythonic, qwen3_coder, qwen3_xml,\n"
            "deepseek_v3, deepseek_v31, deepseek_v32."
        ),
    )
    exclude_tools_when_tool_choice_none: Optional[bool] = Field(
        False,
        description=(
            "Exclude tool definitions from prompt when tool_choice='none'.\n"
            "Equivalent to vLLM's --exclude-tools-when-tool-choice-none."
        ),
    )
    vision: Optional[bool] = Field(
        False,
        description=(
            "Enables vision support if the model supports it (default: False)."
        ),
    )

    _metadata: Metadata = PrivateAttr(Metadata())
    model_config = ConfigDict(protected_namespaces=())


class DraftModelConfig(BaseConfigModel):
    """
    Options for draft models (speculative decoding)
    This will use more VRAM!
    """

    # TODO: convert this to a pathlib.path?
    draft_model_dir: Optional[str] = Field(
        "models",
        description=("Directory to look for draft models (default: models)."),
    )
    draft_model_name: Optional[str] = Field(
        None,
        description=(
            "An initial draft model to load.\n"
            "Ensure the model is in the model directory."
        ),
    )
    draft_rope_scale: Optional[float] = Field(
        1.0,
        description=(
            "Rope scale for draft models (default: 1.0).\n"
            "Same as compress_pos_emb.\n"
            "Use if the draft model was trained on long context with rope."
        ),
    )
    draft_rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
        None,
        description=(
            "Rope alpha for draft models (default: None).\n"
            'Same as alpha_value. Set to "auto" to auto-calculate.\n'
            "Leaving this value blank will either pull from the model "
            "or auto-calculate."
        ),
    )
    draft_cache_mode: Optional[CACHE_SIZES] = Field(
        "FP16",
        description=(
            "Cache mode for draft models to save VRAM (default: FP16).\n"
            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
        ),
    )
    draft_gpu_split: List[float] = Field(
        default_factory=list,
        description=(
            "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
            "If this isn't filled in, the draft model is autosplit."
        ),
    )


class SamplingConfig(BaseConfigModel):
    """Options for Sampling"""

    override_preset: Optional[str] = Field(
        None,
        description=(
            "Select a sampler override preset (default: None).\n"
            "Find this in the sampler-overrides folder.\n"
            "This overrides default fallbacks for sampler values "
            "that are passed to the API.\n"
            "NOTE: safe_defaults preset provides a fallback for frontends "
            "that do not pass sampling params.\n"
            "Remove it if not necessary."
        ),
    )


class LoraInstanceModel(BaseConfigModel):
    """Model representing an instance of a Lora."""

    name: Optional[str] = None
    scaling: float = Field(1.0, ge=0)


class LoraConfig(BaseConfigModel):
    """Options for Loras"""

    # TODO: convert this to a pathlib.path?
    lora_dir: Optional[str] = Field(
        "loras", description=("Directory to look for LoRAs (default: loras).")
    )
    loras: Optional[List[LoraInstanceModel]] = Field(
        None,
        description=(
            "List of LoRAs to load and associated scaling factors "
            "(default scale: 1.0).\n"
            "For the YAML file, add each entry as a YAML list:\n"
            "- name: lora1\n"
            "  scaling: 1.0"
        ),
    )


class EmbeddingsConfig(BaseConfigModel):
    """
    Options for embedding models and loading.
    NOTE: Embeddings requires the "extras" feature to be installed
    Install it via "pip install .[extras]"
    """

    # TODO: convert this to a pathlib.path?
    embedding_model_dir: Optional[str] = Field(
        "models",
        description=("Directory to look for embedding models (default: models)."),
    )
    embeddings_device: Optional[Literal["cpu", "auto", "cuda"]] = Field(
        "cpu",
        description=(
            "Device to load embedding models on (default: cpu).\n"
            "Possible values: cpu, auto, cuda.\n"
            "NOTE: It's recommended to load embedding models on the CPU.\n"
            "If using an AMD GPU, set this value to 'cuda'."
        ),
    )
    embedding_model_name: Optional[str] = Field(
        None,
        description=("An initial embedding model to load on the infinity backend."),
    )


class DeveloperConfig(BaseConfigModel):
    """Options for development and experimentation"""

    unsafe_launch: Optional[bool] = Field(
        False,
        description=(
            "Skip the Exllamav2 version check (default: False).\n"
            "WARNING: It's highly recommended to update your dependencies rather "
            "than enabling this flag."
        ),
    )
    disable_request_streaming: Optional[bool] = Field(
        False, description=("Disable API request streaming (default: False).")
    )
    realtime_process_priority: Optional[bool] = Field(
        False,
        description=(
            "Set the process to use a higher priority (default: False).\n"
            "For realtime process priority, run as administrator or sudo.\n"
            "Otherwise, the priority will be set to high."
        ),
    )


class TabbyConfigModel(BaseModel):
    """Base model for a TabbyConfig."""

    config: Optional[ConfigOverrideConfig] = Field(
        default_factory=ConfigOverrideConfig.model_construct
    )
    network: Optional[NetworkConfig] = Field(
        default_factory=NetworkConfig.model_construct
    )
    logging: Optional[LoggingConfig] = Field(
        default_factory=LoggingConfig.model_construct
    )
    model: Optional[ModelConfig] = Field(default_factory=ModelConfig.model_construct)
    draft_model: Optional[DraftModelConfig] = Field(
        default_factory=DraftModelConfig.model_construct
    )
    lora: Optional[LoraConfig] = Field(default_factory=LoraConfig.model_construct)
    embeddings: Optional[EmbeddingsConfig] = Field(
        default_factory=EmbeddingsConfig.model_construct
    )
    sampling: Optional[SamplingConfig] = Field(
        default_factory=SamplingConfig.model_construct
    )
    developer: Optional[DeveloperConfig] = Field(
        default_factory=DeveloperConfig.model_construct
    )

    model_config = ConfigDict(validate_assignment=True, protected_namespaces=())
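

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal example of how a parsed config
# dict could be validated against these models. The section values below are
# hypothetical examples, not defaults shipped with this project.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_config = {
        "network": {"host": "0.0.0.0", "port": 5000},
        "model": {"model_name": "my-model", "cache_mode": "Q4"},
    }

    # Each top-level section is validated by its corresponding BaseConfigModel
    # subclass; omitted sections fall back to their default_factory instances.
    cfg = TabbyConfigModel.model_validate(sample_config)
    print(cfg.network.port)       # 5000
    print(cfg.model.cache_mode)   # Q4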