refactor(infer): enhance guided decoding parameter handling

xming521 · xming521 · commit ab863cf13486 · 2026-03-29T11:25:18.000+08:00
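- Import `GuidedDecodingParams` inside a `try`/`except ImportError` that falls back to `StructuredOutputsParams`, so offline inference works with vLLM installs that expose either class; the matching `SamplingParams` keyword (`guided_decoding` or `structured_outputs`) is chosen accordingly, and the `guided_decoding_backend` / `guided_decoding_disable_any_whitespace` engine args are only set when `GuidedDecodingParams` is available.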
- Simplified the creation of guided decoding parameters by introducing the `_make_guided_decoding_params` helper, which builds whichever of the two parameter classes was imported.
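- Changed the `retry_openai_api` decorator to retry on every `Exception` instead of only those whose message contained keywords such as "rate limit", "429", "too many requests", "server error", "timeout", or "connection"; once the retries are exhausted, the failure is logged and the exception is re-raised.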
diff --git a/WC-exp b/WC-exp
@@ -1 +1 @@
-Subproject commit e62bd446c9147b92ebb2b03dd02f169cbbbf90dd
+Subproject commit e0acf48c642b56625832e858ce3a7bf95ac14270
diff --git a/weclone/core/inference/offline_infer.py b/weclone/core/inference/offline_infer.py
@@ -11,7 +11,21 @@
 from vllm import LLM, SamplingParams
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
-from vllm.sampling_params import GuidedDecodingParams
+
+try:
+    from vllm.sampling_params import GuidedDecodingParams as _GuidedDecodingParams  # type: ignore[attr-defined]
+
+    _STRUCTURED_OUTPUTS_PARAMS = None
+except ImportError:
+    _GuidedDecodingParams = None  # type: ignore[assignment,misc]
+    from vllm.sampling_params import StructuredOutputsParams as _STRUCTURED_OUTPUTS_PARAMS  # type: ignore[assignment]
+
+
+def _make_guided_decoding_params(json_schema: dict, disable_any_whitespace: bool = True):
+    if _GuidedDecodingParams is not None:
+        return _GuidedDecodingParams(json=json_schema, disable_any_whitespace=disable_any_whitespace)
+    return _STRUCTURED_OUTPUTS_PARAMS(json=json_schema, disable_any_whitespace=disable_any_whitespace)  # type: ignore[misc]
+
 
 from weclone.utils.config import load_config
 from weclone.utils.config_models import VllmArgs
@@ -134,22 +148,28 @@ def vllm_infer(
     template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
     template_obj.mm_plugin.expand_mm_tokens = False  # for vllm generate
 
+    guided_decoding_params = None
     if guided_decoding_class:
         json_schema = guided_decoding_class.model_json_schema()
-        guided_decoding_params = GuidedDecodingParams(json=json_schema, disable_any_whitespace=True)
-
-    sampling_params = SamplingParams(
-        repetition_penalty=generating_args.repetition_penalty or 1.0,
-        temperature=generating_args.temperature,
-        top_p=generating_args.top_p or 1.0,
-        top_k=generating_args.top_k or -1,
-        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
-        max_tokens=generating_args.max_new_tokens,
-        skip_special_tokens=skip_special_tokens,
-        seed=seed,
-        bad_words=bad_words,
-        guided_decoding=guided_decoding_params if guided_decoding_class else None,
-    )
+        guided_decoding_params = _make_guided_decoding_params(json_schema)
+
+    _sampling_kwargs: dict = {
+        "repetition_penalty": generating_args.repetition_penalty or 1.0,
+        "temperature": generating_args.temperature,
+        "top_p": generating_args.top_p or 1.0,
+        "top_k": generating_args.top_k or -1,
+        "stop_token_ids": template_obj.get_stop_token_ids(tokenizer),
+        "max_tokens": generating_args.max_new_tokens,
+        "skip_special_tokens": skip_special_tokens,
+        "seed": seed,
+        "bad_words": bad_words,
+    }
+    if guided_decoding_params is not None:
+        if _GuidedDecodingParams is not None:
+            _sampling_kwargs["guided_decoding"] = guided_decoding_params
+        else:
+            _sampling_kwargs["structured_outputs"] = guided_decoding_params
+    sampling_params = SamplingParams(**_sampling_kwargs)
     if model_args.adapter_name_or_path is not None:
         lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
     else:
@@ -163,9 +183,10 @@ def vllm_infer(
         "disable_log_stats": True,
         "enable_lora": model_args.adapter_name_or_path is not None,
         "enable_prefix_caching": True,
-        "guided_decoding_backend": "guidance",
-        "guided_decoding_disable_any_whitespace": True,
     }
+    if _GuidedDecodingParams is not None:
+        engine_args["guided_decoding_backend"] = "guidance"
+        engine_args["guided_decoding_disable_any_whitespace"] = True
 
     if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
         engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}
diff --git a/weclone/utils/retry.py b/weclone/utils/retry.py
@@ -99,7 +99,7 @@ def retry_openai_api(
 ):
     """
     专门用于OpenAI API调用的重试装饰器
-    处理OpenAI特有的异常类型
+    对所有Exception执行重试
     """
 
     def decorator(func):
@@ -110,18 +110,7 @@ def wrapper(*args, **kwargs):
                     return func(*args, **kwargs)
 
                 except Exception as e:
-                    # 检查是否是速率限制或临时错误
-                    error_message = str(e).lower()
-                    should_retry = (
-                        "rate limit" in error_message
-                        or "429" in error_message
-                        or "too many requests" in error_message
-                        or "server error" in error_message
-                        or "timeout" in error_message
-                        or "connection" in error_message
-                    )
-
-                    if should_retry and attempt < max_retries:
+                    if attempt < max_retries:
                         delay = _calculate_delay(attempt, base_delay, max_delay, backoff_factor, jitter)
                         logger.warning(
                             f"OpenAI API调用失败: {type(e).__name__}: {e}，"
@@ -130,12 +119,11 @@ def wrapper(*args, **kwargs):
                         )
                         time.sleep(delay)
                         continue
-                    else:
-                        if attempt >= max_retries:
-                            logger.error(
-                                f"OpenAI API调用在 {max_retries + 1} 次尝试后最终失败: {type(e).__name__}: {e}"
-                            )
-                        raise
+
+                    logger.error(
+                        f"OpenAI API调用在 {max_retries + 1} 次尝试后最终失败: {type(e).__name__}: {e}"
+                    )
+                    raise
 
             return None
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-Subproject commit e62bd446c9147b92ebb2b03dd02f169cbbbf90dd`
	`1`	`+Subproject commit e0acf48c642b56625832e858ce3a7bf95ac14270`