FastDeploy/fastdeploy/config.py at d40e5a65e345d8b158b5edbd90a4f3de9981a992 · PaddlePaddle/FastDeploy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import annotations

import json
import os
from dataclasses import field
from enum import Enum
from typing import Any, Dict, Literal, Optional, Union

import paddle
import paddle.distributed as dist
from packaging.version import parse as parse_version
from paddleformers.transformers.configuration_utils import PretrainedConfig
from typing_extensions import assert_never

import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.transformer_utils.config import get_pooling_config
from fastdeploy.utils import (
    ceil_div,
    check_unified_ckpt,
    get_host_ip,
    get_logger,
    parse_ports,
)

logger = get_logger("config", "config.log")

TaskOption = Literal["auto", "generate", "embedding", "embed"]

RunnerType = Literal["generate", "pooling"]

RunnerOption = Literal["auto", "generate", "pooling"]

ConvertOption = Literal["auto", "none", "embed"]

ConvertType = Literal["none", "embed"]

_ResolvedTask = Literal["generate", "encode", "embed"]

# Model implementation backend options
ModelImpl = Literal["auto", "fastdeploy", "paddleformers"]

_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = {
    "generate": [],
    "pooling": ["embed", "reward"],
}

PREEMPTED_TOKEN_ID = -9

# Some model suffixes are based on auto classes from Transformers:
# https://huggingface.co/docs/transformers/en/model_doc/auto
# NOTE: Items higher on this list priority over lower ones
_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
    ("ForCausalLM", ("generate", "none")),
    ("ForConditionalGeneration", ("generate", "none")),
    ("ChatModel", ("generate", "none")),
    ("LMHeadModel", ("generate", "none")),
    ("ForTextEncoding", ("pooling", "embed")),
    ("EmbeddingModel", ("pooling", "embed")),
    ("ForSequenceClassification", ("pooling", "classify")),
    ("ForAudioClassification", ("pooling", "classify")),
    ("ForImageClassification", ("pooling", "classify")),
    ("ForVideoClassification", ("pooling", "classify")),
    ("ClassificationModel", ("pooling", "classify")),
    ("ForRewardModeling", ("pooling", "reward")),
    ("RewardModel", ("pooling", "reward")),
    # Let other `*Model`s take priority
    ("Model", ("pooling", "embed")),
]


def iter_architecture_defaults():
    yield from _SUFFIX_TO_DEFAULTS


def try_match_architecture_defaults(
    architecture: str,
    *,
    runner_type: Optional[RunnerType] = None,
    convert_type: Optional[ConvertType] = None,
):
    for suffix, (default_runner_type, default_convert_type) in iter_architecture_defaults():
        if (
            (runner_type is None or runner_type == default_runner_type)
            and (convert_type is None or convert_type == default_convert_type)
            and architecture.endswith(suffix)
        ):
            return suffix, (default_runner_type, default_convert_type)
    return None


class MoEPhase:
    """
    The generation phase of the moe.
    """

    def __init__(self, phase="prefill"):
        self._phase = phase

    @property
    def phase(self):
        return self._phase

    @phase.setter
    def phase(self, value):
        if value not in ["prefill", "decode"]:
            raise ValueError(f"The moe_phase is invalid, only support prefill and decode, but got {value}")
        else:
            self._phase = value


class ErnieArchitectures:
    """Helper class for ERNIE architecture check."""

    ARCHITECTURES = {
        "Ernie4_5ForCausalLM",  # 0.3B-PT
        "Ernie4_5_ForCausalLM",
        "Ernie4_5_MoeForCausalLM",
        "Ernie4_5_VLMoeForConditionalGeneration",
        "Ernie4_5_VLMoeForProcessRewardModel",
    }

    ERNIE5_MODELS = {
        "Ernie5ForCausalLM",
        "Ernie5MoeForCausalLM",
        "Ernie5MoEForRewardModel",
    }

    @classmethod
    def register_ernie_model_arch(cls, model_class):
        if model_class.name().startswith("Ernie") and model_class.name() not in cls.ARCHITECTURES:
            cls.ARCHITECTURES.add(model_class.name())

    @classmethod
    def contains_ernie_arch(cls, architectures):
        """Check if any ERNIE architecture is present in the given architectures."""
        return any(arch in architectures for arch in cls.ARCHITECTURES)

    @classmethod
    def is_ernie_arch(cls, architecture):
        """Check if the given architecture is an ERNIE architecture."""
        return architecture in cls.ARCHITECTURES

    @classmethod
    def is_ernie5_arch(cls, architectures):
        """Check if the given architecture is an ERNIE5 architecture."""
        return any(arch in architectures for arch in cls.ERNIE5_MODELS)


PRETRAINED_INIT_CONFIGURATION = {
    "top_p": 1.0,
    "temperature": 1.0,
    "rope_theta": 10000.0,
    "penalty_score": 1.0,
    "frequency_score": 0.0,
    "presence_score": 0.0,
    "min_length": 1,
    "num_key_value_heads": -1,
    "start_layer_index": 0,
    "moe_num_shared_experts": 0,
    "moe_layer_start_index": 0,
    "num_max_dispatch_tokens_per_rank": 128,
    "moe_use_aux_free": False,
    "vocab_size": -1,
    "hidden_dropout_prob": 0.0,
    "initializer_range": 0.02,
    "max_position_embeddings": 512,
    "quantization_config": None,
    "tie_word_embeddings": False,
    "rms_norm_eps": 1e-5,
    "moe_num_experts": None,
    "moe_layer_end_index": None,
}


class ModelConfig:
    """
    The configuration class to store the configuration of a `LLM`.
    """

    def __init__(
        self,
        args,
    ):
        self.model = ""
        self.is_quantized = False
        self.is_moe_quantized = False
        self.max_model_len = 0
        self.dtype = "bfloat16"
        self.enable_logprob = False
        self.max_logprobs = 20
        self.logprobs_mode = "raw_logprobs"
        self.redundant_experts_num = 0
        self.seed = 0
        self.quantization = None
        self.pad_token_id: int = -1
        self.eos_tokens_lens: int = 2
        self.lm_head_fp32: bool = False
        self.moe_gate_fp32: bool = False
        self.model_format = "auto"
        self.runner = "auto"
        self.convert = "auto"
        self.pooler_config: Optional["PoolerConfig"] = field(init=False)
        self.override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None
        self.revision = None
        self.prefix_layer_name = "layers"
        self.kv_cache_quant_scale_path = ""
        self.enable_entropy = False
        self.model_impl: ModelImpl = "auto"

        self.partial_rotary_factor: float = 1.0
        self.num_nextn_predict_layers = 0
        self.mm_max_tokens_per_item = None
        for key, value in args.items():
            if hasattr(self, key) and value != "None":
                setattr(self, key, value)

        assert self.model != ""
        pretrained_config, _ = PretrainedConfig.get_config_dict(self.model)
        self.pretrained_config = PretrainedConfig.from_dict(pretrained_config)

        # Some exported configs (e.g. Qwen3-VL) embed the text model's configuration under a `text_config` key.
        if "text_config" in pretrained_config and isinstance(pretrained_config["text_config"], dict):
            text_fg = pretrained_config.pop("text_config")
            for key, value in text_fg.items():
                if not hasattr(self, key):
                    setattr(self, key, value)

        # set attribute from pretrained_config
        for key, value in pretrained_config.items():
            setattr(self, key, value)
        # we need set default value when not exist
        for key, value in PRETRAINED_INIT_CONFIGURATION.items():
            if not hasattr(self, key):
                setattr(self, key, value)

        if not hasattr(self, "head_dim"):
            self.head_dim = self.hidden_size // self.num_attention_heads

        if hasattr(self, "vision_config"):
            self.vision_config = PretrainedConfig.from_dict(self.vision_config)

        # Align external multimodal rope_3d configuration
        if (
            hasattr(self, "rope_scaling")
            and isinstance(self.rope_scaling, dict)
            and "mrope_section" in self.rope_scaling
        ):
            setattr(self, "rope_3d", True)
            setattr(self, "freq_allocation", self.rope_scaling["mrope_section"][0])

        self.ori_vocab_size = args.get("ori_vocab_size", self.vocab_size)
        self.think_start_id = args.get("think_start_id", -1)
        self.think_end_id = args.get("think_end_id", -1)
        self.im_patch_id = args.get("image_patch_id", -1)
        self.line_break_id = args.get("line_break_id", -1)
        self.think_truncate_prompt_ids = args.get("think_truncate_prompt_ids", [-1])

        num_max_logprobs = args.get("max_logprobs", None)
        if num_max_logprobs is not None and num_max_logprobs < -1:
            raise ValueError(" The possible values for max_logprobs can't be less than -1 ")
        if self.ori_vocab_size is not None and num_max_logprobs is not None:
            if num_max_logprobs > self.ori_vocab_size:
                raise ValueError(
                    f" The possible values for max_logprobs can't be greater than the vocabulary size {self.ori_vocab_size}"
                )

        self._post_init()

    def _post_init(self):
        self.is_unified_ckpt = check_unified_ckpt(self.model)
        self.runner_type = self._get_runner_type(self.architectures, self.runner)
        self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert)
        registry = self.registry
        is_generative_model = registry.is_text_generation_model(self.architectures, self)
        is_pooling_model = registry.is_pooling_model(self.architectures, self)
        is_multimodal_model = registry.is_multimodal_model(self.architectures, self)
        self.is_reasoning_model = registry.is_reasoning_model(self.architectures, self)

        self.enable_mm = is_multimodal_model

        self.kv_cache_quant_scale_path = os.path.join(self.model, "kv_cache_scale.json")
        if self.runner_type == "pooling":
            os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1"

        if self.runner_type == "generate" and not is_generative_model:
            if is_multimodal_model:
                pass
            elif self.model_impl in ("auto", "paddleformers"):
                # Skip check for auto/paddleformers - may fallback to paddleformers which supports any model
                pass
            else:
                generate_converts = _RUNNER_CONVERTS["generate"]
                if self.convert_type not in generate_converts:
                    raise ValueError("This model does not support '--runner generate.")
        if self.runner_type == "pooling" and not is_pooling_model:
            pooling_converts = _RUNNER_CONVERTS["pooling"]
            if self.convert_type not in pooling_converts:
                convert_option = "<" + "|".join(pooling_converts) + ">"
                raise ValueError(
                    "This model does not support `--runner pooling`. "
                    f"You can pass `--convert {convert_option} to adapt "
                    "it into a pooling model."
                )

        self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type)
        model_info, arch = registry.inspect_model_cls(self.architectures, self)
        self._model_info = model_info
        self._architecture = arch
        self.architectures = [arch]

        self.pooler_config = self._init_pooler_config()
        self.override_name_from_config()
        self.read_from_env()
        self.read_model_config()

    @property
    def registry(self):
        from fastdeploy.model_executor.models.model_base import ModelRegistry

        return ModelRegistry()

    def override_name_from_config(self):
        """
        Override attribute names from the exported model's configuration.
        """

        if not self.is_unified_ckpt and hasattr(self, "infer_model_mp_num"):
            self.tensor_parallel_size = self.infer_model_mp_num
            del self.infer_model_mp_num

        if hasattr(self, "num_hidden_layers") and self.runner != "pooling":
            if hasattr(self, "remove_tail_layer"):
                if self.remove_tail_layer is True:
                    self.num_hidden_layers -= 1
                elif isinstance(self.remove_tail_layer, int):
                    self.num_hidden_layers -= self.remove_tail_layer

        if not hasattr(self, "mla_use_absorb"):
            self.mla_use_absorb = False

        if hasattr(self, "num_experts") and getattr(self, "moe_num_experts") is None:
            self.moe_num_experts = self.num_experts
        if hasattr(self, "n_routed_experts") and getattr(self, "moe_num_experts") is None:
            self.moe_num_experts = self.n_routed_experts
        if hasattr(self, "n_shared_experts") and getattr(self, "moe_num_shared_experts") is None:
            # Because the ERNIE 4.5 config.json contains two sets of keys, adaptation is required.
            self.moe_num_shared_experts = self.n_shared_experts

    def read_from_env(self):
        """
        Read configuration information from environment variables and update the object's attributes.
        If an attribute is not present or is an empty string in the environment variables, use the default value.
        """
        self.max_stop_seqs_num = envs.FD_MAX_STOP_SEQS_NUM
        self.stop_seqs_max_len = envs.FD_STOP_SEQS_MAX_LEN

        def reset_config_value(key, value):
            if not hasattr(self, key.lower()):
                if os.getenv(key, None):
                    value = eval(os.getenv(key))
                    logger.info(f"Get parameter `{key}` = {value} from environment.")
                else:
                    logger.info(f"Parameter `{key}` will use default value {value}.")
                setattr(self, key.lower(), value)

        reset_config_value("COMPRESSION_RATIO", 1.0)
        reset_config_value("ROPE_THETA", 10000)

    def read_model_config(self):
        config_path = os.path.join(self.model, "config.json")
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                raw_cfg = json.load(f)
                if "text_config" in raw_cfg and isinstance(raw_cfg["text_config"], dict):
                    text_cfg = raw_cfg.pop("text_config")
                    for k, v in text_cfg.items():
                        if k not in raw_cfg:
                            raw_cfg[k] = v
            self.model_config = raw_cfg
            if "torch_dtype" in self.model_config and "dtype" in self.model_config:
                raise ValueError(
                    "Only one of 'torch_dtype' or 'dtype' should be present in config.json. "
                    "Found both, which indicates an ambiguous model format. "
                    "Please ensure your config.json contains only one dtype field."
                )
            elif "torch_dtype" in self.model_config:
                self.model_format = "torch"
                logger.info("The model format is Hugging Face Torch")
            elif "dtype" in self.model_config:
                # https://github.com/huggingface/transformers/releases/tag/v4.56.0  Transformers 4.56.0 version deprecated torch_dtype
                if "source" in self.model_config and self.model_config["source"] == "paddle":
                    self.model_format = "paddle"
                else:
                    if "transformers_version" in self.model_config and parse_version(
                        self.model_config["transformers_version"]
                    ) > parse_version("4.56.0"):
                        self.model_format = "torch"
                        logger.info("The model format is Hugging Face Torch")
                    else:
                        self.model_format = "paddle"
                        logger.info("The model format is Paddle")
            elif (
                "quantization_config" in self.model_config
                and "quant_method" in self.model_config["quantization_config"]
                and "mxfp4" == self.model_config["quantization_config"]["quant_method"]
            ):
                self.model_format = "torch"
                logger.info("The model format is Hugging Face")
            else:
                if "source" in self.model_config and self.model_config["source"] == "paddle":
                    self.model_format = "paddle"
                else:
                    self.model_format = "torch"
                logger.info("The model format is Hugging Face")

    def _get_default_runner_type(
        self,
        architectures: list[str],
    ) -> RunnerType:
        registry = self.registry
        if get_pooling_config(self.model, self.revision):
            return "pooling"
        for arch in architectures:
            if arch in registry.get_supported_archs():
                if registry.is_pooling_model(architectures, self):
                    return "pooling"
                if registry.is_text_generation_model(architectures, self):
                    return "generate"
            match = try_match_architecture_defaults(arch)
            if match:
                _, (runner_type, _) = match
                return runner_type
        return "generate"

    def _get_default_convert_type(
        self,
        architectures: list[str],
        runner_type: RunnerType,
    ) -> ConvertType:
        registry = self.registry

        for arch in architectures:
            if arch in registry.get_supported_archs():
                if runner_type == "generate" and registry.is_text_generation_model(architectures, self):
                    return "none"
                if runner_type == "pooling" and registry.is_pooling_model(architectures, self):
                    return "none"
            match = try_match_architecture_defaults(arch, runner_type=runner_type)
            if match:
                _, (_, convert_type) = match
                return convert_type

        # This is to handle Sentence Transformers models that use *ForCausalLM
        # and also multi-modal pooling models which are not defined as
        # Sentence Transformers models
        if runner_type == "pooling":
            return "embed"

        return "none"

    def _get_runner_type(
        self,
        architectures: list[str],
        runner: RunnerOption,
    ) -> RunnerType:
        if runner != "auto":
            return runner

        runner_type = self._get_default_runner_type(architectures)
        if runner_type != "generate":
            logger.info(
                "Resolved `--runner auto` to `--runner %s`. " "Pass the value explicitly to silence this message.",
                runner_type,
            )

        return runner_type

    def _get_convert_type(
        self,
        architectures: list[str],
        runner_type: RunnerType,
        convert: ConvertOption,
    ) -> ConvertType:
        if convert != "auto":
            return convert

        convert_type = self._get_default_convert_type(architectures, runner_type)

        if convert_type != "none":
            logger.info(
                "Resolved `--convert auto` to `--convert %s`. " "Pass the value explicitly to silence this message.",
                convert_type,
            )

        return convert_type

    def _get_supported_generation_tasks(
        self,
        architectures: list[str],
        convert_type: ConvertType,
    ) -> list[_ResolvedTask]:
        registry = self.registry

        supported_tasks = list[_ResolvedTask]()
        if registry.is_text_generation_model(architectures, self) or convert_type in _RUNNER_CONVERTS["generate"]:
            supported_tasks.append("generate")

        # TODO:Temporarily does not support transcription.
        return supported_tasks

    def _get_default_pooling_task(
        self,
        architectures: list[str],
    ) -> Literal["embed"]:
        # Temporarily does not support classification and reward.
        for arch in architectures:
            match = try_match_architecture_defaults(arch, runner_type="pooling")
            if match:
                _, (_, convert_type) = match
                assert convert_type != "none"
                return convert_type

        return "embed"

    def _get_supported_pooling_tasks(
        self,
        architectures: list[str],
        convert_type: ConvertType,
    ) -> list[_ResolvedTask]:
        registry = self.registry

        supported_tasks = list[_ResolvedTask]()
        if registry.is_pooling_model(architectures, self) or convert_type in _RUNNER_CONVERTS["pooling"]:
            supported_tasks.append("encode")

            extra_task = self._get_default_pooling_task(architectures) if convert_type == "none" else convert_type
            supported_tasks.append(extra_task)

        return supported_tasks

    def _get_supported_tasks(
        self,
        architectures: list[str],
        runner_type: RunnerType,
        convert_type: ConvertType,
    ) -> list[_ResolvedTask]:
        if runner_type == "generate":
            return self._get_supported_generation_tasks(architectures, convert_type)
        if runner_type == "pooling":
            return self._get_supported_pooling_tasks(architectures, convert_type)

        assert_never(runner_type)

    def _init_pooler_config(self) -> Optional["PoolerConfig"]:
        if self.runner_type == "pooling":
            if isinstance(self.override_pooler_config, dict):
                self.override_pooler_config = PoolerConfig(**self.override_pooler_config)

            pooler_config = self.override_pooler_config or PoolerConfig()

            base_config = get_pooling_config(self.model, self.revision)
            if base_config is not None:
                for k, v in base_config.items():
                    if getattr(pooler_config, k) is None:
                        setattr(pooler_config, k, v)

            default_pooling_type = self._model_info.default_pooling_type
            if pooler_config.pooling_type is None:
                pooler_config.pooling_type = default_pooling_type

            return pooler_config

        return None

    def _get_download_model(self, model_name, model_type="default"):
        # TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
        pass

    def print(self):
        """
        Print all configuration information.
        """
        logger.info("Model Configuration Information :")
        for k, v in self.__dict__.items():
            logger.info("{:<20}:{:<6}{}".format(k, "", v))
        logger.info("=============================================================")


class ParallelConfig:
    """Configuration for the distributed execution."""

    def __init__(
        self,
        args,
    ):
        self.sequence_parallel = False  # Whether to enable sequence parallelism.
        self.use_ep = False  # Whether to enable Expert Parallelism
        self.msg_queue_id = 1  # message queue id

        self.tensor_parallel_rank = 0  # TP rank ID
        self.tensor_parallel_size = 1  # TP degree
        self.expert_parallel_rank = 0  # EP rank ID
        self.expert_parallel_size = 1  # EP degree
        self.data_parallel_rank = 0  # DP rank ID
        self.data_parallel_size = 1  # DP degree
        self.enable_expert_parallel = False
        self.enable_chunked_moe = False
        self.chunked_moe_size = 256

        self.local_data_parallel_id = 0
        # Engine worker queue port
        self.engine_worker_queue_port: Union[int, str, list] = None
        self.local_engine_worker_queue_port: Optional[int] = None
        # cuda visible devices
        self.device_ids: str = "0"
        # First token id
        self.first_token_id: int = 1
        # Process ID of engine
        self.engine_pid: Optional[int] = None
        # Do profile or not
        self.do_profile: bool = False
        # Use internode_ll_two_stage or not
        self.use_internode_ll_two_stage: bool = False
        # disable sequence parallel moe
        self.disable_sequence_parallel_moe: bool = False
        # shutdown comm group if worker idle
        self.shutdown_comm_group_if_worker_idle: bool = None

        self.pod_ip: str = None
        # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
        self.disable_custom_all_reduce: bool = False
        for key, value in args.items():
            if hasattr(self, key):
                setattr(self, key, value)

        self.engine_worker_queue_port = parse_ports(self.engine_worker_queue_port)

        # currently, the expert parallel size is equal data parallel size
        if self.enable_expert_parallel:
            self.expert_parallel_size = self.data_parallel_size * self.tensor_parallel_size
        else:
            self.expert_parallel_size = 1
        self.use_ep = self.expert_parallel_size > 1

        if self.shutdown_comm_group_if_worker_idle is None:
            self.shutdown_comm_group_if_worker_idle = not self.use_ep

        # pd_disaggregation
        use_pd_disaggregation: int = int(os.getenv("FLAGS_use_pd_disaggregation", 0))
        use_pd_disaggregation_per_chunk: int = int(os.getenv("FLAGS_use_pd_disaggregation_per_chunk", 0))
        if use_pd_disaggregation_per_chunk:
            self.pd_disaggregation_mode = "per_chunk"
        elif use_pd_disaggregation:
            self.pd_disaggregation_mode = "per_query"
        else:
            self.pd_disaggregation_mode = "None"

        # disable_sequence_parallel_moe: qkv_linear + attn + out_linear + allreduce
        # use_sequence_parallel_moe: allgather + qkv_linear + attn + all2all + out_linear
        self.use_sequence_parallel_moe = (
            (not self.disable_sequence_parallel_moe)
            and self.expert_parallel_size > 1
            and self.tensor_parallel_size > 1
        )
        logger.info(f"use_sequence_parallel_moe: {self.use_sequence_parallel_moe}")

    def set_communicate_group(self):
        # different tp group id
        # prevent different tp_groups using the same group_id
        tp_gid_offset = envs.FD_TP_GROUP_GID_OFFSET
        dist.collective._set_custom_gid(self.data_parallel_rank + tp_gid_offset)

        self.tp_group = dist.new_group(
            range(
                self.data_parallel_rank * self.tensor_parallel_size,
                (self.data_parallel_rank + 1) * self.tensor_parallel_size,
            )
        )
        dist.collective._set_custom_gid(None)
        # same ep group id
        if self.enable_expert_parallel:
            dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset)
            self.ep_group = dist.new_group(range(self.expert_parallel_size))
            dist.collective._set_custom_gid(None)
        logger.info(
            f"data_parallel_size: {self.data_parallel_size}, tensor_parallel_size: {self.tensor_parallel_size}, expert_parallel_size: {self.expert_parallel_size}, data_parallel_rank: {self.data_parallel_rank}, tensor_parallel_rank: {self.tensor_parallel_rank}, expert_parallel_rank: {self.expert_parallel_rank}, tp_group: {self.tp_group}."
        )

    def print(self):
        """
        print all config

        """
        logger.info("Parallel Configuration Information :")
        for k, v in self.__dict__.items():
            logger.info("{:<20}:{:<6}{}".format(k, "", v))
        logger.info("=============================================================")


class SpeculativeConfig:
    """
    Configuration for speculative decoding.
    """

    def __init__(
        self,
        args,
    ):
        self.method_list = ["ngram_match", "mtp", "suffix"]
        self.mtp_strategy_list = ["default", "with_ngram"]

        # speculative method, choose in [None, "ngram_match", "mtp", "hybrid_mtp_ngram"]
        self.method: Optional[str] = None
        # mtp strategy in mtp-method
        self.mtp_strategy = "default"
        # the max length of speculative tokens
        self.num_speculative_tokens: int = 1
        # the model runner step of draft model/mtp...
        self.num_model_steps: int = 1
        # the max length of candidate tokens for speculative method
        self.max_candidate_len: int = 5
        # the max length of verify window for speculative method
        self.verify_window: int = 2
        # ngram match
        self.max_ngram_size: int = 5
        self.min_ngram_size: int = 2
        # Suffix Decoding
        # The maximum length of token sequences cached in suffix trees.
        self.suffix_decoding_max_tree_depth: int = 64
        # The limits of requests that can be stored in the cache.
        self.suffix_decoding_max_cached_requests: int = -1
        # The factor of matched length, calculated as num_draft_tokens = suffix_max_spec_factor * matched_length
        self.suffix_decoding_max_spec_factor: float = 1.0
        # The probability threshold for speculated tokens.
        self.suffix_decoding_min_token_prob: float = 0.1
        # model for mtp/eagle/draft_model
        self.model: Optional[str] = None
        # quantization of model
        self.quantization: Optional[Dict[str, Any]] = None
        # allocate more blocks to prevent mtp from finishing the block earlier than the main model
        # Fixed now
        self.num_gpu_block_expand_ratio: Optional[float] = 1
        # To distinguish the main model and draft model(mtp/eagle/draftmodel)
        # ["main", "mtp"]
        self.model_type: Optional[str] = "main"
        # TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers.
        # A trick method is currently used to enable this sharing.
        # This will be replaced with a more standardized solution in the future.
        self.sharing_model = None
        # During benchmarking, we need to enforce that the number of accepted tokens is 1.
        # This means no tokens from MTP are accepted.
        # This ensures that the specified simulation acceptance rate is not affected.
        self.benchmark_mode: bool = False
        # Enable token constraint enforcement in generation phase
        # When enabled, enforces specific tokens after the reasoning phase boundary pattern
        self.enf_gen_phase_tag: bool = False

        self.num_extra_cache_layer = 0

        self.enable_draft_logprob: bool = False

        for key, value in args.items():
            if hasattr(self, key):
                setattr(self, key, value)

        self.read_model_config()
        self.reset()

    def read_model_config(self):
        """
        Read configuration from file.
        """
        self.model_config = {}
        if not self.enabled_speculative_decoding():
            return

        self.is_unified_ckpt = check_unified_ckpt(self.model)
        if self.model is None:
            return

        self.config_path = os.path.join(self.model, "config.json")
        if os.path.exists(self.config_path):
            self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))

    def reset(self):
        """
        Reset configuration.
        """

        def reset_value(cls, value_name, key=None, default=None):
            if key is not None and key in cls.model_config:
                setattr(cls, value_name, cls.model_config[key])
            elif getattr(cls, value_name, None) is None:
                setattr(cls, value_name, default)

        if not self.enabled_speculative_decoding():
            return

        # NOTE(liuzichang): We will support multi-layer in future
        if self.method in ["mtp"]:
            self.num_extra_cache_layer = 1

    def enabled_speculative_decoding(self):
        """
        Check if speculative decoding is enabled.
        """
        if self.method is None:
            return False
        return True

    def to_json_string(self):
        """
        Convert speculative_config to json string.
        """
        return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})

    def print(self):
        """
        print all config

        """
        logger.info("Speculative Decoding Configuration Information :")
        for k, v in self.__dict__.items():
            logger.info("{:<20}:{:<6}{}".format(k, "", v))
        logger.info("=============================================================")

    def check_legality_parameters(
        self,
    ) -> None:
        """Check the legality of parameters passed in from the command line"""
        if self.method is not None:
            assert (
                self.method in self.method_list
            ), f"speculative method only support {self.method_list} now, but get {self.method}."

            assert (
                self.num_speculative_tokens >= 1 and self.num_speculative_tokens <= 5
            ), f"num_speculative_tokens only support in range[1, 5], but get {self.num_speculative_tokens}."
            assert (
                self.num_model_steps >= 1 and self.num_model_steps <= 5
            ), f"num_model_steps only support in range[1, 5], but get {self.num_model_steps}."

            if self.method in ["mtp", "hybrid_mtp_ngram"]:
                if self.num_speculative_tokens < self.num_model_steps:
                    logger.warning(
                        f"Get num_model_steps > num_speculative_tokens. Reset num_speculative_tokens to {self.num_model_steps}"
                    )
                    self.num_speculative_tokens = self.num_model_steps

            assert (
                self.mtp_strategy in self.mtp_strategy_list
            ), f"mtp_strategy_list only support {self.mtp_strategy_list}, but get {self.mtp_strategy}"

    def __str__(self) -> str:
        return self.to_json_string()


class DeviceConfig:
    """
    Configuration for device settings.
    """

    def __init__(
        self,
        args,
    ):
        self.device_type = "cuda"
        for key, value in args.items():
            if hasattr(self, key):
                setattr(self, key, value)


class GraphOptimizationConfig:
    """
    Configuration for compute graph level optimization.
    """

    def __init__(
        self,
        args,
    ):
        """The Top-level graph optimization contral corresponds to different backends.
        - 0: dyncmic graph
        - 1: static graph
        - 2: static graph + cinn compilation backend
        """
        self.graph_opt_level: int = 0

        # CUDA Graph Config
        """ Whether to use cudagraph.
        - False: cudagraph is not used.
        - True: cudagraph is used.
            It requires that all input buffers have fixed addresses, and all
            splitting ops write their outputs to input buffers.
            - With dyncmic graph backend: ...
            - With static graph backend: WIP
        """
        self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
        """  Number of warmup runs for SOT warmup. """
        self.use_cudagraph: bool = False if paddle.is_compiled_with_xpu() else True
        """Sizes to capture cudagraph.
        - None (default): capture sizes are inferred from llm config.
        - list[int]: capture sizes are specified as given."""
        self.cudagraph_capture_sizes: Optional[list[int]] = None
        self.cudagraph_capture_sizes_prefill: list[int] = [1, 2, 4, 8]
        """ Number of warmup runs for cudagraph. """
        self.cudagraph_num_of_warmups: int = 2
        """Whether to copy input tensors for cudagraph.
        If the caller can guarantee that the same input buffers
        are always used, it can set this to False. Otherwise, it should
        set this to True."""
        self.cudagraph_copy_inputs: bool = False
        """ In static graph, this is an operation list that does not need to be captured by the CUDA graph.
        CudaGraphBackend will split these operations from the static graph.
        Example usage:
            cudagraph_splitting_ops = ["paddle.unified_attention"]

        Note: If want to use subgraph capture functionality in a dynamic graph,
        can manually split the model into multiple layers and apply the @support_graph_optimization decorator
        only to the layer where CUDA graph functionality is required.
        """
        self.cudagraph_splitting_ops: list[str] = []
        """ Whether to use a full cuda graph for the entire forward pass rather than
        splitting certain operations such as attention into subgraphs.
        Thus this flag cannot be used together with splitting_ops."""
        self.cudagraph_only_prefill: bool = False
        """When cudagraph_only_prefill is False, only capture decode-only.
        When cudagraph_only_prefill is True, only capture prefill-only.
        Now don't support capture both decode-only and prefill-only"""
        self.full_cuda_graph: bool = True

        """ Maximum CUDA Graph capture size """
        self.max_capture_size: int = None
        """ Record maps mapped from real shape to captured size to reduce runtime overhead """
        self.real_shape_to_captured_size: dict[int, int] = None
        """ Record maps mapped from real batch size to captured size"""
        self.real_bsz_to_captured_size: dict[int, int] = {}
        """ Whether to use shared memory pool for multi capture_size """
        self.use_unique_memory_pool: bool = True
        """ Whether to use cudagraph for draft model."""
        self.draft_model_use_cudagraph: bool = False
        """ Maximum CUDA Graph capture size for static graph mode.
        Recommend 512 for small models (e.g., ERNIE45T 0.3B) and 128 for massive models (e.g., 300B).
        """
        self.max_capture_shape_prefill: int = 512

        # CINN Config ...
        if args is not None:
            for key, value in args.items():
                if hasattr(self, key):
                    setattr(self, key, value)

        self.check_legality_parameters()

    def init_with_cudagrpah_size(self, max_capture_size: int = 0, max_capture_shape_prefill: int = 0) -> None:
        """
        Initialize cuda graph capture sizes and
        pre-compute the mapping from batch size to padded graph size
        """
        # Regular capture sizes
        self.cudagraph_capture_sizes = [size for size in self.cudagraph_capture_sizes if size <= max_capture_size]
        self.cudagraph_capture_sizes_prefill = [
            size for size in self.cudagraph_capture_sizes_prefill if size <= max_capture_shape_prefill
        ]
        dedup_sizes = list(set(self.cudagraph_capture_sizes))
        if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
            logger.info(
                ("cudagraph sizes specified by model runner" " %s is overridden by config %s"),
                self.cudagraph_capture_sizes,
                dedup_sizes,
            )
        self.cudagraph_capture_sizes = dedup_sizes

        # Sort to make sure cudagraph capture sizes are in descending order
        self.cudagraph_capture_sizes.sort(reverse=True)
        self.cudagraph_capture_sizes_prefill.sort(reverse=True)
        self.max_capture_size = self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0
        self.max_capture_size_prefill = (
            self.cudagraph_capture_sizes_prefill[0] if self.cudagraph_capture_sizes_prefill else 0
        )