Commit d67b0f8

wip

1 parent a6a5aea commit d67b0f8

File tree

12 files changed: +981 −462 lines changed


docs/source_en/Instruction/GKD.md

Lines changed: 5 additions & 8 deletions

````diff
@@ -178,8 +178,8 @@ $$
 ```bash
 swift rlhf \
     --rlhf_type gkd \
-    --model Qwen/Qwen2-7B-Instruct \
-    --teacher_model Qwen/Qwen2-72B-Instruct \
+    --model Qwen/Qwen2.5-7B-Instruct \
+    --teacher_model Qwen/Qwen2.5-14B-Instruct \
     --gkd_logits_topk 64 \
     --dataset your_dataset \
     ...
@@ -204,22 +204,19 @@ When `gkd_logits_topk` is set, you can use an external teacher model API service
 
 ```bash
 # Deploy teacher model with swift deploy (recommended)
-CUDA_VISIBLE_DEVICES=0,1 swift deploy \
-    --model Qwen/Qwen2-72B-Instruct \
+swift deploy \
+    --model Qwen/Qwen2.5-14B-Instruct \
     --infer_backend vllm \
     --port 8000 \
     --vllm_engine_kwargs '{"max_logprobs": 64}'
-
-# Or use standalone vLLM server
-vllm serve Qwen/Qwen2-72B-Instruct --max-logprobs 64 --port 8000
 ```
 
 **Step 2: Start GKD Training**
 
 ```bash
 swift rlhf \
     --rlhf_type gkd \
-    --model Qwen/Qwen2-7B-Instruct \
+    --model Qwen/Qwen2.5-7B-Instruct \
     --teacher_model_server http://localhost:8000 \
     --gkd_logits_topk 20 \
     --dataset your_dataset \
````
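With `--gkd_logits_topk`, only the teacher's top-k logprobs per position are kept. A minimal sketch of how a distillation term can be computed from that truncated distribution — a hypothetical helper for illustration only, not ms-swift's actual loss code, assuming the teacher's top-k mass is used as-is without renormalization:

```python
import math

def truncated_forward_kl(teacher_topk, student_logprobs):
    """KL(teacher || student) summed over the teacher's top-k tokens only.

    Both arguments map token_id -> logprob for one sequence position.
    Hypothetical helper, not ms-swift's implementation.
    """
    kl = 0.0
    for tid, t_lp in teacher_topk.items():
        p = math.exp(t_lp)  # teacher probability of this token
        kl += p * (t_lp - student_logprobs[tid])
    return kl
```

When student and teacher agree on the kept tokens the term vanishes, which is why a larger `gkd_logits_topk` gives a tighter approximation of the full-vocabulary KL at the cost of more data over the wire.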
Lines changed: 18 additions & 37 deletions

```diff
@@ -1,58 +1,39 @@
-# Megatron GKD Training with External Teacher Model Server
-#
-# This script demonstrates using an external vLLM server as the teacher model
-# for knowledge distillation with Megatron-SWIFT. This approach is useful when:
-# - The teacher model is too large to load alongside the student model
-# - You want to separate teacher inference from training for better resource utilization
-# - You need to use different model parallelism for student vs teacher
-#
-# Prerequisites:
-# 1. Start the teacher model server first (see below)
-# 2. Ensure the server is accessible at the specified URL
-#
-# Teacher Server Setup (run in a separate terminal):
-# CUDA_VISIBLE_DEVICES=4,5,6,7 swift deploy \
-#     --model Qwen/Qwen2-72B-Instruct \
-#     --infer_backend vllm \
-#     --port 8000 \
-#     --vllm_engine_kwargs '{"max_logprobs": 64}'
-#
-# Or using vLLM directly:
-# vllm serve Qwen/Qwen2-72B-Instruct --max-logprobs 64 --port 8000
-
-TEACHER_SERVER_URL=${TEACHER_SERVER_URL:-"http://localhost:8000"}
-GKD_LOGITS_TOPK=${GKD_LOGITS_TOPK:-20}
-
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 NPROC_PER_NODE=4 \
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 megatron rlhf \
     --rlhf_type gkd \
     --model Qwen/Qwen3-8B-Base \
-    --teacher_model_server $TEACHER_SERVER_URL \
-    --gkd_logits_topk $GKD_LOGITS_TOPK \
+    --teacher_model_server http://localhost:8000 \
+    --gkd_logits_topk 20 \
     --tuner_type lora \
-    --dataset 'AI-ModelScope/alpaca-gpt4-data-en#2000' 'AI-ModelScope/alpaca-gpt4-data-zh#2000' \
-    --tensor_model_parallel_size 2 \
+    --dataset AI-ModelScope/alpaca-gpt4-data-en#2000 AI-ModelScope/alpaca-gpt4-data-zh#2000 \
+    --tensor_model_parallel_size 1 \
     --expert_model_parallel_size 1 \
     --pipeline_model_parallel_size 1 \
-    --context_parallel_size 2 \
+    --context_parallel_size 1 \
     --seq_kd false \
-    --lmbda 0 \
-    --beta 0.5 \
+    --lmbda 1 \
+    --beta 1 \
     --torch_dtype bfloat16 \
    --micro_batch_size 2 \
     --global_batch_size 16 \
     --max_epochs 1 \
-    --lr 5e-6 \
-    --log_interval 5 \
-    --max_length 4096 \
-    --max_completion_length 1024 \
+    --lr 5e-5 \
+    --log_interval 1 \
+    --max_length 8192 \
+    --max_completion_length 8192 \
     --attention_backend flash \
+    --use_vllm true \
+    --vllm_mode colocate \
+    --vllm_gpu_memory_utilization 0.5 \
+    --vllm_tensor_parallel_size 1 \
+    --vllm_max_model_len 16384 \
+    --sleep_level 1 \
     --recompute_granularity selective \
     --finetune \
     --no_save_optim \
     --no_save_rng \
-    --temperature 0.9 \
+    --temperature 1.0 \
     --padding_free true \
     --sequence_parallel true
```
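The batch-size flags above are linked by the standard Megatron relation: the global batch must equal micro batch × data-parallel size × gradient-accumulation steps. A quick sketch of that arithmetic (helper name is illustrative, not a swift API):

```python
def grad_accum_steps(global_bs, micro_bs, world_size, tp=1, pp=1, cp=1):
    # Data-parallel size is the world size divided by the model-parallel
    # degrees; gradient accumulation fills the remaining factor of the
    # global batch. Sketch of the standard Megatron bookkeeping.
    dp = world_size // (tp * pp * cp)
    assert global_bs % (micro_bs * dp) == 0, 'global batch must divide evenly'
    return global_bs // (micro_bs * dp)
```

With the values in this script (4 GPUs, TP=PP=CP=1, micro batch 2, global batch 16), each rank accumulates over 2 micro-batches per optimizer step.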

examples/train/rlhf/gkd/teacher_server.sh

Lines changed: 19 additions & 25 deletions

```diff
@@ -1,27 +1,17 @@
 # GKD Training with External Teacher Model Server
 #
 # This script demonstrates using an external vLLM server as the teacher model
-# for knowledge distillation. This approach is useful when:
-# - The teacher model is too large to load alongside the student model
-# - You want to share a single teacher server across multiple training processes
-# - You need more control over the teacher model deployment
-#
-# Prerequisites:
-# 1. Start the teacher model server first (see below)
-# 2. Ensure the server is accessible at the specified URL
-#
-# Teacher Server Setup (run in a separate terminal):
-# CUDA_VISIBLE_DEVICES=0,1 swift deploy \
-#     --model Qwen/Qwen2-72B-Instruct \
-#     --infer_backend vllm \
-#     --port 8000 \
-#     --vllm_engine_kwargs '{"max_logprobs": 64}'
-#
-# Or using vLLM directly:
-# vllm serve Qwen/Qwen2-72B-Instruct --max-logprobs 64 --port 8000
+# for knowledge distillation.
+
+# Teacher Server Setup (run on a separate GPU):
+# CUDA_VISIBLE_DEVICES=5 swift deploy \
+#     --model Qwen/Qwen2.5-14B-Instruct \
+#     --infer_backend vllm \
+#     --port 8000 \
+#     --vllm_engine_kwargs '{"max_logprobs": 64}'
 
-TEACHER_SERVER_URL=${TEACHER_SERVER_URL:-"http://localhost:8000"}
-GKD_LOGITS_TOPK=${GKD_LOGITS_TOPK:-20}
+TEACHER_SERVER_URL=${TEACHER_SERVER_URL:-"http://localhost:8001"}
+GKD_LOGITS_TOPK=${GKD_LOGITS_TOPK:-64}
 
 NPROC_PER_NODE=4 \
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
@@ -30,12 +20,17 @@ swift rlhf \
     --rlhf_type gkd \
     --model Qwen/Qwen2.5-7B \
     --teacher_model_server $TEACHER_SERVER_URL \
+    --use_vllm true \
+    --vllm_mode colocate \
+    --vllm_gpu_memory_utilization 0.5 \
+    --vllm_tensor_parallel_size 1 \
+    --vllm_max_model_len 10240 \
     --gkd_logits_topk $GKD_LOGITS_TOPK \
-    --tuner_type full \
+    --tuner_type lora \
     --dataset 'AI-ModelScope/alpaca-gpt4-data-en' \
     --seq_kd false \
-    --lmbda 0 \
-    --beta 0.5 \
+    --lmbda 1 \
+    --beta 1 \
     --torch_dtype bfloat16 \
     --max_epochs 1 \
     --per_device_train_batch_size 1 \
@@ -47,8 +42,7 @@ swift rlhf \
     --save_total_limit 2 \
     --logging_steps 5 \
     --max_length 2048 \
-    --max_completion_length 512 \
-    --output_dir output/gkd_teacher_server \
+    --max_completion_length 2048 \
     --warmup_ratio 0.05 \
     --save_only_model true \
     --dataloader_num_workers 4 \
```

swift/infer_engine/protocol.py

Lines changed: 7 additions & 2 deletions

```diff
@@ -173,6 +173,7 @@ class RequestConfig:
     stream: bool = False
     logprobs: bool = False
     top_logprobs: Optional[int] = None
+    prompt_logprobs: Optional[int] = None  # Set to an integer to get top-k logprobs for each prompt token
 
     n: int = 1
     best_of: Optional[int] = None
@@ -192,7 +193,6 @@ def __post_init__(self):
 @dataclass
 class CompletionRequestMixin:
     model: str
-    prompt: str
 
 
 @dataclass
@@ -393,11 +393,14 @@ class ChatCompletionResponseChoice:
     finish_reason: Literal['stop', 'length', None]
     logprobs: Optional[Dict[str, List[Dict[str, Any]]]] = None
     token_ids: Optional[List[int]] = None
+    # Logprobs for prompt tokens (when prompt_logprobs is requested)
+    prompt_logprobs: Optional[List[Dict[str, Any]]] = None
 
     def to_cmpl_choice(self) -> 'CompletionResponseChoice':
         self = deepcopy(self)
         assert not self.message.tool_calls, f'message: {self.message}'
-        return CompletionResponseChoice(self.index, self.message.content, self.finish_reason, self.logprobs)
+        return CompletionResponseChoice(self.index, self.message.content, self.finish_reason, self.logprobs,
+                                        self.prompt_logprobs)
 
 
 @dataclass
@@ -423,6 +426,8 @@ class CompletionResponseChoice:
     text: str
     finish_reason: Literal['stop', 'length', None]
     logprobs: Optional[Dict[str, List[Dict[str, Any]]]] = None
+    # Logprobs for prompt tokens (when prompt_logprobs is requested)
+    prompt_logprobs: Optional[List[Dict[str, Any]]] = None
 
 
 @dataclass
```
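Both new `prompt_logprobs` fields are appended after existing defaulted fields with a `None` default, so call sites that construct these dataclasses positionally keep working. A simplified mock (not the real classes) showing why:

```python
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

@dataclass
class ChoiceSketch:
    # Simplified stand-in for CompletionResponseChoice: the new field is
    # appended last with a None default, so older call sites that pass only
    # the first three arguments still construct fine.
    text: str
    finish_reason: Optional[str]
    logprobs: Optional[Dict[str, Any]] = None
    prompt_logprobs: Optional[List[Dict[str, Any]]] = None

old_style = ChoiceSketch('hello', 'stop', None)
new_style = ChoiceSketch('hello', 'stop', None, [{'token_id': 1, 'logprob': -0.1}])
```

This is also why `to_cmpl_choice` can simply pass `self.prompt_logprobs` as an extra positional argument without touching other callers.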

swift/infer_engine/vllm_engine.py

Lines changed: 55 additions & 1 deletion

```diff
@@ -399,6 +399,48 @@ def _get_logprobs(self,
                 logprobs[token_id] = logprob.logprob
         return super()._get_logprobs(logprobs_list, token_ids, top_logprobs)
 
+    def _get_prompt_logprobs(
+        self,
+        prompt_logprobs: Optional[List[Optional[Dict]]],
+        prompt_token_ids: List[int],
+    ) -> Optional[List[Dict[str, Any]]]:
+        if prompt_logprobs is None or not prompt_token_ids:
+            return None
+
+        result = []
+        for token_id, pos_logprobs in zip(prompt_token_ids, prompt_logprobs):
+            token = self.tokenizer.decode(token_id)
+            entry = {
+                'token_id': token_id,
+                'token': token,
+                'logprob': None,  # Will be filled if available
+                'top_logprobs': [],
+            }
+
+            if pos_logprobs is not None:
+                # Get logprob for the actual token at this position
+                if token_id in pos_logprobs:
+                    logprob_obj = pos_logprobs[token_id]
+                    entry['logprob'] = logprob_obj.logprob if hasattr(logprob_obj, 'logprob') else logprob_obj
+
+                # Get top logprobs sorted by probability (descending)
+                sorted_items = sorted(
+                    pos_logprobs.items(), key=lambda x: -(x[1].logprob if hasattr(x[1], 'logprob') else x[1]))
+                for tid, logprob_obj in sorted_items:
+                    logprob_val = logprob_obj.logprob if hasattr(logprob_obj, 'logprob') else logprob_obj
+                    if logprob_val == float('-inf'):
+                        continue
+                    t = self.tokenizer.decode(tid)
+                    entry['top_logprobs'].append({
+                        'token_id': tid,
+                        'token': t,
+                        'logprob': logprob_val,
+                    })
+
+            result.append(entry)
+
+        return result
+
     def _prepare_generation_config(self, request_config: RequestConfig) -> SamplingParams:
         kwargs = {'max_tokens': request_config.max_tokens}
         for key in ['temperature', 'top_k', 'top_p', 'repetition_penalty']:
@@ -424,6 +466,10 @@ def _prepare_generation_config(self, request_config: RequestConfig) -> SamplingP
         # Return only the sampled token's logprob
         kwargs['logprobs'] = 0
 
+        # Handle prompt_logprobs: return logprobs for prompt/input tokens
+        if request_config.prompt_logprobs is not None:
+            kwargs['prompt_logprobs'] = request_config.prompt_logprobs
+
         # TODO: beam search
         for key in ['n', 'best_of', 'frequency_penalty', 'presence_penalty', 'seed']:
             if hasattr(SamplingParams, key):
@@ -582,13 +628,21 @@ def _create_chat_completion_response(
             logprobs = self._get_logprobs(output.logprobs, output.token_ids, request_config.top_logprobs)
             toolcall = self._get_toolcall(content)  # Use content instead of response for tool calls
             token_ids = output.token_ids if request_config.return_details else None
+
+            # Get prompt logprobs if requested
+            prompt_logprobs_result = None
+            if request_config.prompt_logprobs is not None:
+                prompt_logprobs_result = self._get_prompt_logprobs(result.prompt_logprobs,
+                                                                   list(result.prompt_token_ids))
+
             choice = ChatCompletionResponseChoice(
                 index=output.index,
                 message=ChatMessage(
                     role='assistant', content=content, reasoning_content=reasoning_content, tool_calls=toolcall),
                 finish_reason=output.finish_reason,
                 logprobs=logprobs,
-                token_ids=token_ids)
+                token_ids=token_ids,
+                prompt_logprobs=prompt_logprobs_result)
             choices.append(choice)
         prompt_token_ids = None
         images_size = None
```
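The per-position logic of `_get_prompt_logprobs` can be exercised in isolation with plain float logprobs (vLLM actually returns `Logprob` objects, which the method unwraps via the `hasattr(x, 'logprob')` checks; `decode` below stands in for the tokenizer):

```python
def extract_position(token_id, pos_logprobs, decode=str):
    # Mirror of the per-position logic: record the actual token's logprob,
    # then every candidate sorted by descending probability, skipping -inf
    # entries (tokens masked out by the sampler).
    entry = {'token_id': token_id, 'token': decode(token_id),
             'logprob': None, 'top_logprobs': []}
    if pos_logprobs is not None:
        if token_id in pos_logprobs:
            entry['logprob'] = pos_logprobs[token_id]
        for tid, lp in sorted(pos_logprobs.items(), key=lambda x: -x[1]):
            if lp == float('-inf'):
                continue
            entry['top_logprobs'].append(
                {'token_id': tid, 'token': decode(tid), 'logprob': lp})
    return entry
```

Note that vLLM reports `None` for the first prompt position (no preceding context to condition on), which is why the real method tolerates `pos_logprobs is None` and emits an entry with an empty `top_logprobs` list.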

swift/megatron/pipelines/train/rlhf.py

Lines changed: 22 additions & 2 deletions

```diff
@@ -77,9 +77,29 @@ def _prepare_vllm_client(self):
         return vllm_client
 
     def _prepare_teacher_api_client(self):
-        """Prepare teacher API client for external teacher model service."""
+        """Prepare teacher API client for external teacher model service.
+
+        In Megatron with pure Data Parallel (TP=PP=CP=1), each rank processes different data
+        and needs its own API client. With model parallelism (TP/PP/CP > 1), one rank per
+        model parallel group calls the API and broadcasts results.
+        """
         from swift.rlhf_trainers.utils import create_teacher_api_client
-        return create_teacher_api_client(self.args, check_health=True, timeout=60, use_last_rank=True)
+
+        # Check if using pure data parallelism (no model parallelism)
+        tp = getattr(self.args, 'tensor_model_parallel_size', 1)
+        pp = getattr(self.args, 'pipeline_model_parallel_size', 1)
+        cp = getattr(self.args, 'context_parallel_size', 1)
+        is_pure_dp = (tp == 1 and pp == 1 and cp == 1)
+
+        # In pure DP mode, each rank has different data and needs its own client;
+        # in MP mode, only the last rank creates a client and broadcasts results.
+        return create_teacher_api_client(
+            self.args,
+            check_health=True,
+            timeout=60,
+            use_last_rank=True,
+            tokenizer=self.template.tokenizer,
+            all_ranks=is_pure_dp)
 
 
 def megatron_rlhf_main(args: Optional[Union[List[str], MegatronRLHFArguments]] = None):
```
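The parallelism check added above is self-contained and easy to verify on its own. A standalone sketch of the same predicate (using `SimpleNamespace` as a stand-in for the Megatron args object):

```python
from types import SimpleNamespace

def is_pure_data_parallel(args):
    # Same check the trainer performs: with no tensor, pipeline, or context
    # parallelism, every rank sees different data, so each rank needs its
    # own teacher API client (all_ranks=True); otherwise one rank per model
    # parallel group queries the server and broadcasts the result.
    tp = getattr(args, 'tensor_model_parallel_size', 1)
    pp = getattr(args, 'pipeline_model_parallel_size', 1)
    cp = getattr(args, 'context_parallel_size', 1)
    return tp == 1 and pp == 1 and cp == 1
```

Using `getattr` with a default of 1 means an args object that never set a parallelism flag is treated as pure data parallel, matching Megatron's defaults.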
