From 42473b3812bbafd2d92e8a1257eab551225c99cc Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 31 Mar 2026 19:55:04 +0800 Subject: [PATCH 1/8] update readme --- README_zh.md | 4 ++-- src/mcore_bridge/__init__.py | 4 ++-- src/mcore_bridge/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README_zh.md b/README_zh.md index df4730e..90c9b17 100644 --- a/README_zh.md +++ b/README_zh.md @@ -7,7 +7,7 @@

-->

- 为最先进的大语言模型提供 Megatron-Core 模型定义 + 为最先进的大模型提供 Megatron-Core 模型定义

@@ -53,7 +53,7 @@ ## 📝 简介 ## 🎉 新闻 -- 🎉 2026.03.30: MCore-Bridge 正式发布!为最先进的大语言模型提供 Megatron-Core 模型定义,让 Megatron 训练像 Transformers 一样简单。 +- 🎉 2026.03.30: MCore-Bridge 正式发布!为最先进的大模型提供 Megatron-Core 模型定义,让 Megatron 训练像 Transformers 一样简单。 ## 🛠️ 安装 使用pip进行安装: diff --git a/src/mcore_bridge/__init__.py b/src/mcore_bridge/__init__.py index 5fb7d8c..f2e5a73 100644 --- a/src/mcore_bridge/__init__.py +++ b/src/mcore_bridge/__init__.py @@ -12,7 +12,7 @@ from .config import ModelConfig, hf_to_mcore_config from .model import get_mcore_model from .tuners import LoraParallelLinear - from .utils import get_logger, set_random_seed + from .utils import get_logger, set_random_seed, unwrap_model, split_cp_inputs from .version import __release_datetime__, __version__ else: _import_structure = { @@ -20,7 +20,7 @@ 'config': ['ModelConfig', 'hf_to_mcore_config'], 'model': ['get_mcore_model'], 'tuners': ['LoraParallelLinear'], - 'utils': ['get_logger', 'set_random_seed'], + 'utils': ['get_logger', 'set_random_seed', 'unwrap_model', 'split_cp_inputs'], 'version': ['__release_datetime__', '__version__'], } diff --git a/src/mcore_bridge/version.py b/src/mcore_bridge/version.py index 67830af..3594f30 100644 --- a/src/mcore_bridge/version.py +++ b/src/mcore_bridge/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. -__version__ = '1.0.0.dev0' +__version__ = '1.0.1.dev0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-12-31 23:59:59' From 135adadc2a49aa237f4125f91c98870d4c66a2bd Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 1 Apr 2026 14:59:41 +0800 Subject: [PATCH 2/8] update --- src/mcore_bridge/bridge/gpt_bridge.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py index 136e647..5f39aa4 100644 --- a/src/mcore_bridge/bridge/gpt_bridge.py +++ b/src/mcore_bridge/bridge/gpt_bridge.py @@ -11,7 +11,7 @@ from peft import PeftModel from peft.utils import ModulesToSaveWrapper from tqdm import tqdm -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union from mcore_bridge.tuners import LoraParallelLinear from mcore_bridge.utils import (MxFp4Dequantizer, SafetensorLazyLoader, StreamingSafetensorSaver, deep_getattr, @@ -1615,7 +1615,14 @@ def _convert_mtp_layer(self, lm_model, hf_state_dict, hf_prefix: str, layer_idx: hf_state_dict.update(origin_hf_state_dict) return hf_state_dict - def load_weights(self, mg_models, hf_model_dir: str, peft_format: bool = False, adapter_name: str = 'default'): + def load_weights( + self, + mg_models, + hf_model_dir: str, + peft_format: bool = False, + adapter_name: str = 'default', + converter=None, + ): """Load weights from safetensors (HuggingFace) format into Megatron model. Args: @@ -1631,6 +1638,8 @@ def load_weights(self, mg_models, hf_model_dir: str, peft_format: bool = False, self._disable_tqdm = False with torch.no_grad(), SafetensorLazyLoader(hf_model_dir, peft_format=peft_format) as loader: state_dict = loader.get_state_dict() + if converter: + state_dict = dict(converter(k, v) for k, v in state_dict.items()) hf_prefix = 'base_model.model.' if peft_format else '' for mg_model in mg_models: list(self._convert([mg_model], state_dict, hf_prefix, True, 'Loading: ')) @@ -1640,6 +1649,8 @@ def export_weights(self, target_device=None, only_master_rank: bool = False, peft_format: bool = False, + adapter_name: str = 'default', + converter=None, tqdm_desc: str = 'Exporting: ', disable_tqdm: bool = True): """Export Megatron model weights to safetensors (HuggingFace) format as a generator. @@ -1663,8 +1674,8 @@ def export_weights(self, self._target_device = target_device self._only_master_rank = only_master_rank self._peft_format = peft_format + self._adapter_name = adapter_name self._disable_tqdm = disable_tqdm - self._adapter_name = 'default' self._peft_target_modules = set() self._peft_modules_to_save = set() hf_prefix = 'base_model.model.' if peft_format else '' @@ -1681,6 +1692,8 @@ def save_weights( mg_models, output_dir: str, peft_format: bool = False, + adapter_name: str = 'default', + converter: Callable = None, max_shard_size: str = '5GB', ) -> None: """Save Megatron model checkpoint in safetensors (HuggingFace) format. @@ -1705,6 +1718,7 @@ def save_weights( target_device='cpu', only_master_rank=True, peft_format=peft_format, + adapter_name=adapter_name, tqdm_desc='Saving: ', disable_tqdm=False): saver.add_tensor(k, v) From 2f4e98af0f1c7293101cf8caddadcd47d5d4e24a Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 1 Apr 2026 15:08:03 +0800 Subject: [PATCH 3/8] fix --- src/mcore_bridge/bridge/gpt_bridge.py | 33 ++++++++++++++++++--------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py index 5f39aa4..0c20445 100644 --- a/src/mcore_bridge/bridge/gpt_bridge.py +++ b/src/mcore_bridge/bridge/gpt_bridge.py @@ -1621,7 +1621,7 @@ def load_weights( hf_model_dir: str, peft_format: bool = False, adapter_name: str = 'default', - converter=None, + converter: Callable = None, ): """Load weights from safetensors (HuggingFace) format into Megatron model. @@ -1631,6 +1631,7 @@ def load_weights( peft_format: Whether the weights are in PEFT (LoRA, etc.) format. Defaults to False. If True, loads LoRA delta weights. If False, loads the full model weights. adapter_name: Name of the adapter for PEFT models. Defaults to 'default'. + converter: Used to perform key-value conversion on the newly loaded state_dict. """ self._peft_format = peft_format self._adapter_name = adapter_name @@ -1644,15 +1645,17 @@ def load_weights( for mg_model in mg_models: list(self._convert([mg_model], state_dict, hf_prefix, True, 'Loading: ')) - def export_weights(self, - mg_models, - target_device=None, - only_master_rank: bool = False, - peft_format: bool = False, - adapter_name: str = 'default', - converter=None, - tqdm_desc: str = 'Exporting: ', - disable_tqdm: bool = True): + def export_weights( + self, + mg_models, + target_device=None, + only_master_rank: bool = False, + peft_format: bool = False, + adapter_name: str = 'default', + converter: Callable = None, + tqdm_desc: str = 'Exporting: ', + disable_tqdm: bool = True, + ): """Export Megatron model weights to safetensors (HuggingFace) format as a generator. This method yields weight tensors one by one for streaming save operations or RL weight synchronization, @@ -1665,6 +1668,8 @@ def export_weights(self, peft_format: Whether to export in PEFT (LoRA, etc.) format. Defaults to False. - If True, exports only LoRA delta weights. If False, exports the complete model weights (e.g., after merge-lora or full-parameter fine-tuning). + adapter_name: Name of the adapter for PEFT models. Defaults to 'default'. + converter: Used to perform key-value conversion on the newly exported state_dict. tqdm_desc: Description text for the progress bar. Defaults to 'Exporting: '. disable_tqdm: Whether to disable the tqdm progress bar. Defaults to True. @@ -1685,7 +1690,10 @@ def export_weights(self, mg_models[i] = mg_model.model self.config = mg_models[0].config with torch.no_grad(): - yield from self._convert(mg_models, {}, hf_prefix, False, tqdm_desc=tqdm_desc) + for k, v in self._convert(mg_models, {}, hf_prefix, False, tqdm_desc=tqdm_desc): + if converter: + k, v = converter(k, v) + yield k, v def save_weights( self, @@ -1708,6 +1716,8 @@ def save_weights( peft_format: Whether to save in PEFT (LoRA, etc.) format. Defaults to False. If True, saves LoRA delta weights. If False, saves the complete model weights (e.g., after merge-lora or full-parameter fine-tuning). + adapter_name: Name of the adapter for PEFT models. Defaults to 'default'. + converter: Used to perform key-value conversion on the newly exported state_dict. max_shard_size: Maximum size of a single storage file, default is '5GB'. """ gc_collect() @@ -1719,6 +1729,7 @@ def save_weights( only_master_rank=True, peft_format=peft_format, adapter_name=adapter_name, + converter=converter, tqdm_desc='Saving: ', disable_tqdm=False): saver.add_tensor(k, v) From adf859dd9a927f878f28628d97e4f32683f35044 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 1 Apr 2026 15:11:07 +0800 Subject: [PATCH 4/8] update --- src/mcore_bridge/bridge/gpt_bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py index 0c20445..2d4c28f 100644 --- a/src/mcore_bridge/bridge/gpt_bridge.py +++ b/src/mcore_bridge/bridge/gpt_bridge.py @@ -1640,7 +1640,7 @@ def load_weights( with torch.no_grad(), SafetensorLazyLoader(hf_model_dir, peft_format=peft_format) as loader: state_dict = loader.get_state_dict() if converter: - state_dict = dict(converter(k, v) for k, v in state_dict.items()) + state_dict = dict(converter(k, v, adapter_name=adapter_name) for k, v in state_dict.items()) hf_prefix = 'base_model.model.' if peft_format else '' for mg_model in mg_models: list(self._convert([mg_model], state_dict, hf_prefix, True, 'Loading: ')) @@ -1692,7 +1692,7 @@ def export_weights( with torch.no_grad(): for k, v in self._convert(mg_models, {}, hf_prefix, False, tqdm_desc=tqdm_desc): if converter: - k, v = converter(k, v) + k, v = converter(k, v, adapter_name=adapter_name) yield k, v def save_weights( From 4af369fbe0eae429d9505ba6729ac9952c29a50e Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 1 Apr 2026 16:37:01 +0800 Subject: [PATCH 5/8] fix --- src/mcore_bridge/bridge/gpt_bridge.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py index 0c20445..80c6db9 100644 --- a/src/mcore_bridge/bridge/gpt_bridge.py +++ b/src/mcore_bridge/bridge/gpt_bridge.py @@ -1621,7 +1621,7 @@ def load_weights( hf_model_dir: str, peft_format: bool = False, adapter_name: str = 'default', - converter: Callable = None, + converter: Optional[Callable] = None, ): """Load weights from safetensors (HuggingFace) format into Megatron model. @@ -1640,7 +1640,14 @@ def load_weights( with torch.no_grad(), SafetensorLazyLoader(hf_model_dir, peft_format=peft_format) as loader: state_dict = loader.get_state_dict() if converter: - state_dict = dict(converter(k, v) for k, v in state_dict.items()) + new_state_dict = {} + for k, v in state_dict.items(): + res = converter(k, v) + if res is None: + continue + k, v = res + new_state_dict[k] = v + state_dict = new_state_dict hf_prefix = 'base_model.model.' if peft_format else '' for mg_model in mg_models: list(self._convert([mg_model], state_dict, hf_prefix, True, 'Loading: ')) @@ -1652,7 +1659,7 @@ def export_weights( only_master_rank: bool = False, peft_format: bool = False, adapter_name: str = 'default', - converter: Callable = None, + converter: Optional[Callable] = None, tqdm_desc: str = 'Exporting: ', disable_tqdm: bool = True, ): @@ -1692,7 +1699,10 @@ def export_weights( with torch.no_grad(): for k, v in self._convert(mg_models, {}, hf_prefix, False, tqdm_desc=tqdm_desc): if converter: - k, v = converter(k, v) + res = converter(k, v) + if res is None: + continue + k, v = res yield k, v def save_weights( @@ -1701,7 +1711,7 @@ def save_weights( output_dir: str, peft_format: bool = False, adapter_name: str = 'default', - converter: Callable = None, + converter: Optional[Callable] = None, max_shard_size: str = '5GB', ) -> None: """Save Megatron model checkpoint in safetensors (HuggingFace) format. From 9763b5daa24dca37cd22bdf5e2662c801350e4f0 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 1 Apr 2026 16:40:43 +0800 Subject: [PATCH 6/8] fix --- src/mcore_bridge/bridge/gpt_bridge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py index 589f0a1..80c6db9 100644 --- a/src/mcore_bridge/bridge/gpt_bridge.py +++ b/src/mcore_bridge/bridge/gpt_bridge.py @@ -1751,4 +1751,3 @@ class MultimodalGPTBridge(GPTBridge): hf_layers_prefix = 'model.language_model.layers' hf_embed_key = 'model.language_model.embed_tokens.weight' hf_final_layernorm_key = 'model.language_model.norm.weight' - From b4f9f4b838cb8385ac0a1f02aa6b4ef45a990131 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 1 Apr 2026 16:41:42 +0800 Subject: [PATCH 7/8] fix --- src/mcore_bridge/bridge/gpt_bridge.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py index 80c6db9..332de4a 100644 --- a/src/mcore_bridge/bridge/gpt_bridge.py +++ b/src/mcore_bridge/bridge/gpt_bridge.py @@ -1642,10 +1642,10 @@ def load_weights( if converter: new_state_dict = {} for k, v in state_dict.items(): - res = converter(k, v) - if res is None: + kv = converter(k, v) + if kv is None: continue - k, v = res + k, v = kv new_state_dict[k] = v state_dict = new_state_dict hf_prefix = 'base_model.model.' if peft_format else '' @@ -1699,10 +1699,10 @@ def export_weights( with torch.no_grad(): for k, v in self._convert(mg_models, {}, hf_prefix, False, tqdm_desc=tqdm_desc): if converter: - res = converter(k, v) - if res is None: + kv = converter(k, v) + if kv is None: continue - k, v = res + k, v = kv yield k, v def save_weights( From 45bc01b8000f25f84435bf43bebfc10f772f219e Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 1 Apr 2026 17:20:57 +0800 Subject: [PATCH 8/8] fix --- src/mcore_bridge/bridge/gpt_bridge.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mcore_bridge/bridge/gpt_bridge.py b/src/mcore_bridge/bridge/gpt_bridge.py index 332de4a..c33a572 100644 --- a/src/mcore_bridge/bridge/gpt_bridge.py +++ b/src/mcore_bridge/bridge/gpt_bridge.py @@ -5,7 +5,6 @@ import torch import torch.distributed as dist import torch.nn.functional as F -import transformers from megatron.core import mpu from packaging import version from peft import PeftModel @@ -66,7 +65,6 @@ def __init__(self, config): self.pp_group = mpu.get_pipeline_model_parallel_group() self.etp_group = mpu.get_expert_tensor_parallel_group() self.ep_group = mpu.get_expert_model_parallel_group() - self.is_transformers_5 = version.parse(transformers.__version__) >= version.parse('5.0.0.dev') self.tp_rank = mpu.get_tensor_model_parallel_rank() self.pp_rank = mpu.get_pipeline_model_parallel_rank() self.etp_rank = mpu.get_expert_tensor_parallel_rank()