THUDM
diff --git a/‎docker/Dockerfile‎
Lines changed: 3 additions & 3 deletions b/‎docker/Dockerfile‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docker/patch/latest/megatron.patch‎
Lines changed: 19 additions & 3 deletions b/‎docker/patch/latest/megatron.patch‎
Lines changed: 19 additions & 3 deletions
@@ -9,15 +9,14 @@ RUN apt update
 RUN apt install -y nvtop rsync
 
 # TODO: change to pip install sglang-router after it has a new release
-RUN pip install sglang-router --force-reinstall
+RUN pip install sglang-router==0.2.1 --force-reinstall
 RUN pip install git+https://github.com/fzyzcjy/torch_memory_saver.git --no-cache-dir --force-reinstall
 RUN pip install ray[default]
 RUN pip install httpx[http2] wandb pylatexenc blobfile accelerate "mcp[cli]"
 
 # mbridge
 RUN pip install git+https://github.com/ISEEKYAN/mbridge.git --no-deps
 
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.9;9.0;9.0a" pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.4 --no-build-isolation
 # apex
 RUN NVCC_APPEND_FLAGS="--threads 4" \
   pip -v install --disable-pip-version-check --no-cache-dir \
@@ -31,13 +30,14 @@ RUN MAX_JOBS=64 pip -v install flash-attn==2.7.4.post1 --no-build-isolation
 RUN pip install flash-linear-attention
 RUN pip -v install --no-build-isolation transformer_engine[pytorch]
 
+WORKDIR /root/
 RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
     cd flash-attention/ && git submodule update --init && cd hopper/ && python setup.py install && \
     export python_path=`python -c "import site; print(site.getsitepackages()[0])"` && \
     mkdir -p $python_path/flash_attn_3 && \
-    cp flash_attn_interface.py $python_path/flash_attn_3/flash_attn_interface.py
+    cp flash_attn_interface.py $python_path/flash_attn_3/flash_attn_interface.py && \
+    cd /root/ && rm -rf flash-attention/
 
-WORKDIR /root/
 RUN git clone https://github.com/NVIDIA/Megatron-LM.git --recursive && \
     cd Megatron-LM && git checkout ${MEGATRON_COMMIT} && \
     pip install -e .
 
@@ -94,7 +94,7 @@ index 860ee64a9..80944b702 100755
                      "mlp.0.weight": "mlp.linear_fc1.layer_norm_weight",
                      "mlp.0.bias": "mlp.linear_fc1.layer_norm_bias",
 diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
-index 6aec66e6d..7aa4b2f7d 100644
+index 6aec66e6d..b660a2002 100644
 --- a/megatron/core/models/gpt/gpt_model.py
 +++ b/megatron/core/models/gpt/gpt_model.py
@@ -355,6 +355,7 @@ class GPTModel(LanguageModule):
@@ -143,8 +143,24 @@ index 6aec66e6d..7aa4b2f7d 100644
              hidden_states_list = torch.chunk(hidden_states, 1 + self.config.mtp_num_layers, dim=0)
              hidden_states = hidden_states_list[0]
              if loss_mask is None:
-@@ -480,9 +485,9 @@ class GPTModel(LanguageModule):
-                     runtime_gather_output=runtime_gather_output,
+@@ -474,15 +479,21 @@ class GPTModel(LanguageModule):
+                 loss_mask = torch.ones_like(mtp_labels)
+             for mtp_layer_number in range(self.config.mtp_num_layers):
+                 # output
+-                mtp_logits, _ = self.output_layer(
+-                    hidden_states_list[mtp_layer_number + 1],
+-                    weight=output_weight,
+-                    runtime_gather_output=runtime_gather_output,
++                output_layer_params = {k: v.detach() for k, v in self.output_layer.named_parameters()}
++                output_layer_buffers = dict(self.output_layer.named_buffers())
++                mtp_logits, _ = torch.func.functional_call(
++                    self.output_layer,
++                    {**output_layer_params, **output_layer_buffers},
++                    (hidden_states_list[mtp_layer_number + 1],),
++                    {
++                        "weight": output_weight.detach() if output_weight is not None else None,
++                        "runtime_gather_output": runtime_gather_output,
++                    },
                  )
                  # Calc loss for the current Multi-Token Prediction (MTP) layers.
 -                mtp_labels, _ = roll_tensor(mtp_labels, shifts=-1, dims=-1, cp_group=self.cp_group)