2 changes: 1 addition & 1 deletion README.md
@@ -16,7 +16,7 @@ See documentation at the [ONNX Runtime website](https://onnxruntime.ai/docs/gena

| Support matrix | Supported now | Under development | On the roadmap|
| -------------- | ------------- | ----------------- | -------------- |
| Model architectures | AMD OLMo <br/> ChatGLM <br/> DeepSeek <br/> ERNIE 4.5 <br/> Fara <br/> Gemma <br/> gpt-oss <br/> Granite <br/> Llama <br/> Mistral <br/> Nemotron <br/> Phi (language + vision) <br/> Qwen (language + vision) <br/> SmolLM3 <br/> Whisper | Stable diffusion | Multi-modal models |
| Model architectures | AMD OLMo <br/> ChatGLM <br/> DeepSeek <br/> ERNIE 4.5 <br/> Fara <br/> Gemma <br/> gpt-oss <br/> Granite <br/> InternLM2 <br/> Llama <br/> Mistral <br/> Nemotron <br/> Phi (language + vision) <br/> Qwen (language + vision) <br/> SmolLM3 <br/> Whisper | Stable diffusion | Multi-modal models |
| API| Python <br/>C# <br/>C/C++ <br/> Java ^ | Objective-C ||
| O/S | Linux <br/> Windows <br/>Mac <br/>Android || iOS |||
| Architecture | x86 <br/> x64 <br/> arm64 ||||
16 changes: 16 additions & 0 deletions cmake/external/onnxruntime_external_deps.cmake
@@ -88,6 +88,22 @@ FetchContent_Declare(
set(OCOS_BUILD_PRESET ort_genai)
onnxruntime_fetchcontent_makeavailable(onnxruntime_extensions)

# Add InternLM2Tokenizer support: same BPE/LLaMA format as LlamaTokenizer (see Hugging Face model card).
# This allows exported models to keep the official tokenizer_config.json (InternLM2Tokenizer) per
# https://huggingface.co/internlm/internlm2-1_8b/blob/main/tokenizer_config.json
FetchContent_GetProperties(onnxruntime_extensions)
if(onnxruntime_extensions_POPULATED)
  set(ORTX_TOKENIZER_CONFIG "${onnxruntime_extensions_SOURCE_DIR}/operators/tokenizer/tokenizer_jsconfig.hpp")
  if(EXISTS "${ORTX_TOKENIZER_CONFIG}")
    file(READ "${ORTX_TOKENIZER_CONFIG}" ORTX_TOKENIZER_CONFIG_CONTENT)
    string(REPLACE "{\"LlamaTokenizer\", TokenType::kBPE},"
                   "{\"LlamaTokenizer\", TokenType::kBPE},\n {\"InternLM2Tokenizer\", TokenType::kBPE},"
                   ORTX_TOKENIZER_CONFIG_CONTENT "${ORTX_TOKENIZER_CONFIG_CONTENT}")
    file(WRITE "${ORTX_TOKENIZER_CONFIG}" "${ORTX_TOKENIZER_CONFIG_CONTENT}")
    message(STATUS "Patched onnxruntime_extensions: added InternLM2Tokenizer support")
  endif()
endif()

list(APPEND EXTERNAL_LIBRARIES
onnxruntime_extensions
ocos_operators
5 changes: 4 additions & 1 deletion src/models/model_type.h
@@ -1,5 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// Copyright (C) [2026] Advanced Micro Devices, Inc. All rights reserved.
// --------------------------------------------------------------------------

#pragma once

#include <algorithm>
@@ -12,7 +15,7 @@ namespace Generators {
struct ModelType {
  inline static bool IsLLM(const std::string& model_type) {
    // Large-language model (LLM)
    static constexpr std::array<std::string_view, 20> LLM = {"chatglm", "decoder", "ernie4_5", "gemma", "gemma2", "gemma3_text", "gpt2", "gptoss", "granite", "llama", "mistral", "nemotron", "olmo", "phi", "phimoe", "phi3", "phi3small", "qwen2", "qwen3", "smollm3"};
    static constexpr std::array<std::string_view, 21> LLM = {"chatglm", "decoder", "ernie4_5", "gemma", "gemma2", "gemma3_text", "gpt2", "gptoss", "granite", "internlm2", "llama", "mistral", "nemotron", "olmo", "phi", "phimoe", "phi3", "phi3small", "qwen2", "qwen3", "smollm3"};
    return std::find(LLM.begin(), LLM.end(), model_type) != LLM.end();
  }

2 changes: 2 additions & 0 deletions src/python/py/models/README.md
@@ -41,13 +41,15 @@ The tool currently supports the following model architectures.
- Gemma
- gpt-oss
- Granite
- InternLM2
- Llama
- Mistral
- Nemotron
- Phi
- Qwen
- SmolLM3


It is intended for supporting the latest, popular state-of-the-art models.

## Usage
5 changes: 5 additions & 0 deletions src/python/py/models/builder.py
@@ -3,6 +3,8 @@
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# Copyright (C) [2026] Advanced Micro Devices, Inc. All rights reserved. Portions of this file consist of AI generated content.
# --------------------------------------------------------------------------
"""
Run the model builder to create the desired ONNX model.
"""
@@ -22,6 +24,7 @@
    GemmaModel,
    GPTOSSModel,
    GraniteModel,
    InternLM2Model,
    LlamaModel,
    MistralModel,
    Model,
@@ -244,6 +247,8 @@ def create_model(
        onnx_model = GPTOSSModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
    elif config.architectures[0] == "GraniteForCausalLM":
        onnx_model = GraniteModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
    elif config.architectures[0] == "InternLM2ForCausalLM":
        onnx_model = InternLM2Model(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
    elif config.architectures[0] == "LlamaForCausalLM":
        onnx_model = LlamaModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
    elif config.architectures[0] == "MistralForCausalLM":
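With the new `InternLM2ForCausalLM` branch above, an InternLM2 checkpoint should export through the usual builder entry point. A hedged example invocation (flags follow the builder README's -m/-o/-p/-e/-c pattern; the model id, precision, and output paths below are illustrative placeholders, not part of this change):

python3 builder.py -m internlm/internlm2-1_8b -o ./internlm2-1_8b-onnx -p int4 -e cpu -c ./hf_cache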
4 changes: 4 additions & 0 deletions src/python/py/models/builders/__init__.py
@@ -3,12 +3,15 @@
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# Copyright (C) [2026] Advanced Micro Devices, Inc. All rights reserved. Portions of this file consist of AI generated content.
# --------------------------------------------------------------------------
from .base import Model
from .chatglm import ChatGLMModel
from .ernie import ErnieModel
from .gemma import Gemma2Model, Gemma3Model, GemmaModel
from .gptoss import GPTOSSModel
from .granite import GraniteModel
from .internlm import InternLM2Model
from .llama import LlamaModel
from .mistral import MistralModel
from .nemotron import NemotronModel
@@ -34,6 +37,7 @@
"Gemma3Model",
"GemmaModel",
"GraniteModel",
"InternLM2Model",
"LlamaModel",
"MistralModel",
"Model",
11 changes: 10 additions & 1 deletion src/python/py/models/builders/base.py
@@ -3,7 +3,7 @@
# Licensed under the MIT License. See License.txt in the project root for
# license information.
#
# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) [2026] Advanced Micro Devices, Inc. All rights reserved. Portions of this file consist of AI generated content.
# --------------------------------------------------------------------------
from __future__ import annotations

@@ -673,6 +673,15 @@ def save_processing(self, model_name_or_path, extra_kwargs, out_dir):
        )
        print(f"Saving processing files in {out_dir} for GenAI")
        tokenizer.save_pretrained(out_dir)
        # Overwrite model_max_length with the model's context_length so it is a normal integer
        # (HF often uses 1e30 for "no limit", which can serialize to a huge decimal in JSON)
        tokenizer_config_path = os.path.join(out_dir, "tokenizer_config.json")
        if os.path.isfile(tokenizer_config_path):
            with open(tokenizer_config_path, "r", encoding="utf-8") as f:
                config = json.load(f)
            config["model_max_length"] = self.context_length
            with open(tokenizer_config_path, "w", encoding="utf-8") as f:
                json.dump(config, f, indent=2, ensure_ascii=False)

    def make_int4_algo_config(self, quant_method: str):
        customized_weight_config = {}
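For reference, a minimal sketch of why the model_max_length overwrite above helps. It assumes the Hugging Face behavior of falling back to a sentinel of int(1e30) when no limit is set; the context length shown is only an illustrative value, not taken from this change:

import json

# The float-derived sentinel serializes as a 31-digit integer rather than a usable bound.
print(json.dumps({"model_max_length": int(1e30)}))
# {"model_max_length": 1000000000000000019884624838656}

# After the overwrite, the exported tokenizer_config.json carries the model's real context length
# (e.g. 32768, shown here purely as an example value).
print(json.dumps({"model_max_length": 32768}))
# {"model_max_length": 32768}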
120 changes: 120 additions & 0 deletions src/python/py/models/builders/internlm.py
@@ -0,0 +1,120 @@
# -------------------------------------------------------------------------
# Copyright (C) [2026] Advanced Micro Devices, Inc. All rights reserved. Portions of this file consist of AI generated content
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from .base import Model
import torch.nn as nn


class InternLM2Model(Model):
    def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
        super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options)
        # Export genai_config with type "internlm2" (C++ model_type.h already lists "internlm2" as LLM)
        self.model_type = "InternLM2ForCausalLM"

    def load_weights(self, input_path):
        """
        Load the InternLM2 model and adapt attribute names to match base class expectations.
        InternLM2 uses:
        - attention_norm instead of input_layernorm
        - ffn_norm instead of post_attention_layernorm
        - feed_forward instead of mlp
        - wqkv (combined QKV) instead of separate q_proj, k_proj, v_proj
        - wo instead of o_proj
        """
        # Load the model using the parent class method
        model = super().load_weights(input_path)

        # Get config from the loaded model
        config = model.config

        # Adapt each decoder layer to match the expected attribute names
        for layer in model.model.layers:
            # Map attention_norm to input_layernorm
            if hasattr(layer, 'attention_norm') and not hasattr(layer, 'input_layernorm'):
                layer.input_layernorm = layer.attention_norm

            # Map ffn_norm to post_attention_layernorm
            if hasattr(layer, 'ffn_norm') and not hasattr(layer, 'post_attention_layernorm'):
                layer.post_attention_layernorm = layer.ffn_norm

            # Map feed_forward to mlp
            if hasattr(layer, 'feed_forward') and not hasattr(layer, 'mlp'):
                layer.mlp = layer.feed_forward

            # Map attention to self_attn
            if hasattr(layer, 'attention') and not hasattr(layer, 'self_attn'):
                layer.self_attn = layer.attention

            # Map MLP projections (w1/w2/w3 to gate_proj/down_proj/up_proj)
            if hasattr(layer.mlp, 'w1') and not hasattr(layer.mlp, 'gate_proj'):
                layer.mlp.gate_proj = layer.mlp.w1
            if hasattr(layer.mlp, 'w2') and not hasattr(layer.mlp, 'down_proj'):
                layer.mlp.down_proj = layer.mlp.w2
            if hasattr(layer.mlp, 'w3') and not hasattr(layer.mlp, 'up_proj'):
                layer.mlp.up_proj = layer.mlp.w3

            # Handle the combined wqkv projection in attention
            # InternLM2 uses a grouped/interleaved layout: [Q1, Q2, ..., Qn, K, V] per KV group
            # Layout: [batch, seq, num_kv_heads, (num_q_heads_per_kv_group + 2), head_dim]
            if hasattr(layer, 'self_attn') and hasattr(layer.self_attn, 'wqkv'):
                attn = layer.self_attn
                wqkv_weight = attn.wqkv.weight  # Shape: [(num_heads + 2*num_kv_heads) * head_dim, hidden_size]
                wqkv_bias = attn.wqkv.bias if hasattr(attn.wqkv, 'bias') and attn.wqkv.bias is not None else None

                # Calculate dimensions
                num_q_heads = config.num_attention_heads
                num_kv_heads = config.num_key_value_heads
                num_kv_groups = num_q_heads // num_kv_heads  # How many Q heads per KV head
                head_dim = config.hidden_size // num_q_heads

                q_size = num_q_heads * head_dim
                kv_size = num_kv_heads * head_dim

                # InternLM2's wqkv is organized as interleaved groups:
                # For each KV head group: [Q_heads for this group (num_kv_groups heads), K for this group, V for this group]
                # We need to reshape and reorder to standard [all Q | all K | all V] layout

                # Reshape to grouped format: [num_kv_heads, (num_kv_groups + 2), head_dim, hidden_size]
                group_size = num_kv_groups + 2
                wqkv_grouped = wqkv_weight.reshape(num_kv_heads, group_size, head_dim, config.hidden_size)

                # Extract Q, K, V from grouped layout
                # Q heads: first num_kv_groups entries in each group
                q_weight = wqkv_grouped[:, :num_kv_groups, :, :].reshape(num_q_heads, head_dim, config.hidden_size)
                q_weight = q_weight.reshape(q_size, config.hidden_size)

                # K heads: second to last entry in each group
                k_weight = wqkv_grouped[:, -2, :, :].reshape(kv_size, config.hidden_size)

                # V heads: last entry in each group
                v_weight = wqkv_grouped[:, -1, :, :].reshape(kv_size, config.hidden_size)

                # Create separate projection layers
                attn.q_proj = nn.Linear(config.hidden_size, q_size, bias=config.bias)
                attn.k_proj = nn.Linear(config.hidden_size, kv_size, bias=config.bias)
                attn.v_proj = nn.Linear(config.hidden_size, kv_size, bias=config.bias)

                # Copy weights (ensure proper copy and contiguous memory)
                attn.q_proj.weight.data.copy_(q_weight.contiguous())
                attn.k_proj.weight.data.copy_(k_weight.contiguous())
                attn.v_proj.weight.data.copy_(v_weight.contiguous())

                # Handle biases if they exist (same grouped layout)
                if wqkv_bias is not None:
                    bias_grouped = wqkv_bias.reshape(num_kv_heads, group_size, head_dim)

                    q_bias = bias_grouped[:, :num_kv_groups, :].reshape(q_size)
                    k_bias = bias_grouped[:, -2, :].reshape(kv_size)
                    v_bias = bias_grouped[:, -1, :].reshape(kv_size)

                    attn.q_proj.bias.data.copy_(q_bias.contiguous())
                    attn.k_proj.bias.data.copy_(k_bias.contiguous())
                    attn.v_proj.bias.data.copy_(v_bias.contiguous())

                # Map wo to o_proj
                if hasattr(attn, 'wo') and not hasattr(attn, 'o_proj'):
                    attn.o_proj = attn.wo

        return model
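The wqkv de-interleaving in load_weights above can be checked in isolation. The sketch below is not part of the change; it assumes the grouped row layout described in the comments (num_kv_groups Q heads, then K, then V, per KV head) and uses small dummy dimensions, so it needs only torch:

import torch

num_q_heads, num_kv_heads, head_dim, hidden = 8, 2, 4, 32
num_kv_groups = num_q_heads // num_kv_heads  # Q heads per KV head (4 here)
group_size = num_kv_groups + 2

# Build a wqkv weight whose rows are tagged by (kv_head, slot) so we can see where they land.
wqkv = torch.zeros((num_q_heads + 2 * num_kv_heads) * head_dim, hidden)
for kv in range(num_kv_heads):
    for slot in range(group_size):
        row = (kv * group_size + slot) * head_dim
        wqkv[row:row + head_dim] = kv * 100 + slot

# Same reshape/split as InternLM2Model.load_weights
grouped = wqkv.reshape(num_kv_heads, group_size, head_dim, hidden)
q = grouped[:, :num_kv_groups].reshape(num_q_heads * head_dim, hidden)
k = grouped[:, -2].reshape(num_kv_heads * head_dim, hidden)
v = grouped[:, -1].reshape(num_kv_heads * head_dim, hidden)

# Q heads come out as [kv0.q0..q3, kv1.q0..q3]; K and V keep the KV-head order.
assert q[0, 0] == 0 and q[num_kv_groups * head_dim, 0] == 100
assert k[0, 0] == num_kv_groups and k[head_dim, 0] == 100 + num_kv_groups
assert v[0, 0] == num_kv_groups + 1 and v[-1, 0] == 100 + num_kv_groups + 1
print("wqkv split matches the assumed grouped layout")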