diff --git a/.agents/skills b/.agents/skills new file mode 120000 index 0000000000..454b8427cd --- /dev/null +++ b/.agents/skills @@ -0,0 +1 @@ +../.claude/skills \ No newline at end of file diff --git a/.claude/skills/xtuner-sync-supported-models/SKILL.md b/.claude/skills/xtuner-sync-supported-models/SKILL.md new file mode 100644 index 0000000000..e56d6a7655 --- /dev/null +++ b/.claude/skills/xtuner-sync-supported-models/SKILL.md @@ -0,0 +1,123 @@ +--- +name: xtuner-sync-supported-models +description: Synchronize xtuner's supported model documentation (docs/en/pretrain_sft/advanced_tutorial/model.md and docs/zh_cn/pretrain_sft/advanced_tutorial/model.md) with the actual Config classes defined under xtuner/v1/model/. Use when (1) new TransformerConfig, MoEConfig, or BaseComposeConfig subclasses are added, removed, or renamed in xtuner/v1/model/, (2) existing model configs change their inheritance hierarchy, scale, or HuggingFace counterpart, or (3) a code review or user request points out that model.md is out of sync with the codebase. +--- + +# Update XTuner Supported Model Docs + +Keep the English and Chinese `model.md` files synchronized with the actual Config classes in `xtuner/v1/model/`. + +## Scan the Codebase + +Run the bundled scan script from the xtuner project root to discover all Config classes and their inheritance: + +```bash +python3 .agents/skills/xtuner-sync-supported-models/scripts/scan_model_configs.py +``` + +The script outputs JSON with two keys: +- `configs`: list of every `*Config` class under `xtuner/v1/model/` with its parent classes and file path +- `children`: parent-to-children mapping for the hierarchy tree + +## What to Update + +Compare the script output against the two files: +- `docs/en/pretrain_sft/advanced_tutorial/model.md` +- `docs/zh_cn/pretrain_sft/advanced_tutorial/model.md` + +Both files share the same structure and must stay in sync: + +1. 
**Base Config Classes** — configs that directly inherit from `TransformerConfig` (or `MoEConfig`) and provide a `from_hf` classmethod for loading HuggingFace weights +2. **Concrete Model Configs** — fixed-scale subclasses of the base configs above +3. **Compose Models** — multimodal configs that inherit from `BaseComposeConfig` +4. **Inheritance Hierarchy** — a text tree showing the full `XTunerBaseModelConfig` hierarchy + +### Rules for the Base Config table + +Include these direct descendants of `TransformerConfig`/`MoEConfig`: +- `Qwen2DenseConfig` +- `Qwen3DenseConfig` +- `DeepSeekV3Config` +- `GptOssConfig` +- `Qwen3MoEConfig` + +Exclude from the base table: +- `MoEConfig` — it is an intermediate base class, not a usable model family +- `Qwen3_5_VLTextMoEConfig` — it is an intermediate base with only one concrete child; its child `Qwen3_5_VLTextMoE35BA3BConfig` belongs under the MoE concrete table + +### Rules for the Concrete Model table + +Include every concrete subclass that has fixed parameter defaults. For each row note: +- `Config Class` +- `Base Class / Family` +- `Architecture Type`: `Dense`, `MoE`, `Dense (VL backbone)`, `MoE (VL backbone)` +- `Scale / Notes`: parameter count or total/activated size; for VL backbones note "for multimodal" + +`DeepSeekV3Config` appears here even though it has no separate base entry (it is both base and concrete). + +### Rules for the Compose Models section + +Include three sub-tables: +1. **Compose Base Config Classes** — `Qwen3VLBaseConfig`, `InternVLBaseConfig`, `InternS1BaseConfig` + - `Qwen3VLBaseConfig`: VL model based on Qwen3 text backbone + - `InternVLBaseConfig`: VL model based on InternViT + Qwen3 + - `InternS1BaseConfig`: Science multimodal model based on InternViT + Qwen3 +2. 
**Concrete Compose Model Configs** — every concrete subclass of the above bases, including those reached through intermediate bases (e.g. `Qwen3_5_VLMoE35BA3Config` via `Qwen3_5_BaseConfig`, itself a subclass of `Qwen3VLBaseConfig`); for each row note the wrapped `Text Config` and scale
diff --git a/.claude/skills/xtuner-sync-supported-models/scripts/scan_model_configs.py b/.claude/skills/xtuner-sync-supported-models/scripts/scan_model_configs.py new file mode 100644 index 0000000000..f07bccc774 --- /dev/null +++ b/.claude/skills/xtuner-sync-supported-models/scripts/scan_model_configs.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +"""Scan xtuner/v1/model for all Config classes and output model info as JSON.""" + +import json +import re +import sys +from pathlib import Path + +# We care about configs that are part of the supported model hierarchy +RELEVANT_BASES = { + "TransformerConfig", + "MoEConfig", + "BaseComposeConfig", + "XTunerBaseModelConfig", + # Known intermediate/family bases + "Qwen2DenseConfig", + "Qwen3DenseConfig", + "Qwen3MoEConfig", + "Qwen3_5_VLTextMoEConfig", + "GptOssConfig", + "DeepSeekV3Config", + "Qwen3VLBaseConfig", + "Qwen3_5_BaseConfig", + "InternVLBaseConfig", + "InternS1BaseConfig", +} + + +def scan_file(path: Path): + text = path.read_text() + # Match class definitions like: class FooConfig(BarConfig): + pattern = r"^class\s+(\w+Config)\s*\(([^)]+)\):" + results = [] + for m in re.finditer(pattern, text, re.MULTILINE): + class_name = m.group(1) + parents = [p.strip() for p in m.group(2).split(",")] + results.append({"class": class_name, "parents": parents, "file": str(path)}) + return results + + +def main(): + root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".") + model_dir = root / "xtuner" / "v1" / "model" + if not model_dir.exists(): + print(f"Model directory not found: {model_dir}", file=sys.stderr) + sys.exit(1) + + all_configs = [] + for py_file in sorted(model_dir.rglob("*.py")): + all_configs.extend(scan_file(py_file)) + + # Build parent -> children map + children: dict[str, list[str]] = {} + for cfg in all_configs: + for p in cfg["parents"]: + if p in RELEVANT_BASES or p.endswith("Config"): + children.setdefault(p, []).append(cfg["class"]) + + # Deduplicate + for k in children: + children[k] = 
sorted(set(children[k])) + + output = { + "configs": all_configs, + "children": children, + } + print(json.dumps(output, indent=2, ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/docs/en/pretrain_sft/advanced_tutorial/model.md b/docs/en/pretrain_sft/advanced_tutorial/model.md index a68a55fe92..791177d023 100644 --- a/docs/en/pretrain_sft/advanced_tutorial/model.md +++ b/docs/en/pretrain_sft/advanced_tutorial/model.md @@ -1,3 +1,110 @@ # Model -Coming soon... \ No newline at end of file +XTuner v1's `TrainEngine` supports a variety of Transformer architectures through different `TransformerConfig` subclasses. The documentation below summarizes the currently supported models (RL-related configs are excluded). + +## Base Config Classes + +The following table lists the **base config classes** that define each model family. They provide the `from_hf` interface for loading pretrained weights from HuggingFace. + +| Base Config Class | Model Family | Architecture Type | HuggingFace Counterpart | +|---|---|---|---| +| `Qwen2DenseConfig` | Qwen2 Dense | Dense | `Qwen2ForCausalLM` | +| `Qwen3DenseConfig` | Qwen3 Dense | Dense | `Qwen3ForCausalLM` | +| `DeepSeekV3Config` | DeepSeek-V3 | MoE | `DeepseekV3ForCausalLM` | +| `GptOssConfig` | GPT-OSS | MoE | `GptOssForCausalLM` | +| `Qwen3MoEConfig` | Qwen3 MoE | MoE | `Qwen3MoeForCausalLM` | + +## Concrete Model Configs + +The following table lists the **concrete model configs** that inherit from the base classes above. Each config corresponds to a specific model scale or variant. 
+ +| Config Class | Base Class / Family | Architecture Type | Scale / Notes | +|---|---|---|---| +| `Qwen2Dense7BConfig` | `Qwen2DenseConfig` | Dense | ~7B parameters | +| `Qwen3Dense8BConfig` | `Qwen3DenseConfig` | Dense | ~8B parameters | +| `Qwen3Dense4BConfig` | `Qwen3DenseConfig` | Dense | ~4B parameters | +| `Qwen3Dense0P6BConfig` | `Qwen3DenseConfig` | Dense | ~0.6B parameters | +| `Qwen3VLTextDense4BConfig` | `Qwen3DenseConfig` | Dense (VL backbone) | ~4B parameters, for multimodal | +| `Qwen3VLTextDense8BConfig` | `Qwen3DenseConfig` | Dense (VL backbone) | ~8B parameters, for multimodal | +| `DeepSeekV3Config` | — | MoE | ~671B total / ~37B activated | +| `GptOss21BA3P6Config` | `GptOssConfig` | MoE | ~21B total / ~3.6B activated | +| `GptOss117BA5P8Config` | `GptOssConfig` | MoE | ~117B total / ~5.8B activated | +| `Qwen3MoE30BA3Config` | `Qwen3MoEConfig` | MoE | ~30B total / ~3B activated | +| `Qwen3MoE235BA22Config` | `Qwen3MoEConfig` | MoE | ~235B total / ~22B activated | +| `Qwen3MoEFoPEConfig` | `Qwen3MoEConfig` | MoE | FoPE (Frequency-based Position Embedding) variant | +| `Qwen3VLTextMoE30BA3Config` | `Qwen3MoEConfig` | MoE (VL backbone) | ~30B total, for multimodal | +| `Qwen3VLTextMoE235BA22Config` | `Qwen3MoEConfig` | MoE (VL backbone) | ~235B total, for multimodal | +| `Qwen3_5_VLTextMoE35BA3BConfig` | `Qwen3_5_VLTextMoEConfig` | MoE (VL backbone) | ~35B total / ~3B activated, for multimodal | + +## Compose Models + +In addition to pure text models, XTuner also supports **multimodal compose models** that combine a vision encoder, a projector, and a language model. These configs inherit from `BaseComposeConfig` rather than `TransformerConfig` directly, but they wrap the text configs listed above. 
+ +### Compose Base Config Classes + +| Base Config Class | Model Family | Modality | Description | +|---|---|---|---| +| `Qwen3VLBaseConfig` | Qwen3-VL | Image / Video + Text | VL model based on Qwen3 text backbone | +| `InternVLBaseConfig` | InternVL | Image + Text | VL model based on InternViT + Qwen3 | +| `InternS1BaseConfig` | InternS1 | Image + Text | Science multimodal model based on InternViT + Qwen3 | + +### Concrete Compose Model Configs + +| Config Class | Compose Base / Family | Text Config | Scale / Notes | +|---|---|---|---| +| `Qwen3VLMoE30BA3Config` | `Qwen3VLBaseConfig` | `Qwen3VLTextMoE30BA3Config` | ~30B total, MoE VL | +| `Qwen3VLMoE235BA22Config` | `Qwen3VLBaseConfig` | `Qwen3VLTextMoE235BA22Config` | ~235B total, MoE VL | +| `Qwen3VLDense4BConfig` | `Qwen3VLBaseConfig` | `Qwen3VLTextDense4BConfig` | ~4B parameters, Dense VL | +| `Qwen3VLDense8BConfig` | `Qwen3VLBaseConfig` | `Qwen3VLTextDense8BConfig` | ~8B parameters, Dense VL | +| `Qwen3_5_VLMoE35BA3Config` | `Qwen3_5_BaseConfig` | `Qwen3_5_VLTextMoE35BA3BConfig` | ~35B total / ~3B activated, MoE VL | +| `InternVL3P5Dense8BConfig` | `InternVLBaseConfig` | `Qwen3Dense8BConfig` | ~8B parameters, Dense VL | +| `InternVL3P5MoE30BA3Config` | `InternVLBaseConfig` | `Qwen3MoE30BA3Config` | ~30B total, MoE VL | +| `InternVL3P5Dense1BConfig` | `InternVLBaseConfig` | `Qwen3Dense0P6BConfig` | ~1B parameters, Dense VL | +| `InternS1Config` | `InternS1BaseConfig` | `Qwen3MoE235BA22Config` | ~235B total, MoE multimodal | +| `InternS1MiniConfig` | `InternS1BaseConfig` | `Qwen3Dense8BConfig` | ~8B parameters, Dense multimodal | + +## Inheritance Hierarchy + +The following diagram shows the complete inheritance hierarchy of all config classes supported by `TrainEngine`, including both `TransformerConfig` and `BaseComposeConfig` branches. 
+ +```text +XTunerBaseModelConfig +├── TransformerConfig +│ ├── Dense Models +│ │ ├── Qwen2DenseConfig +│ │ │ └── Qwen2Dense7BConfig +│ │ └── Qwen3DenseConfig +│ │ ├── Qwen3Dense8BConfig +│ │ ├── Qwen3Dense4BConfig +│ │ ├── Qwen3Dense0P6BConfig +│ │ ├── Qwen3VLTextDense4BConfig +│ │ └── Qwen3VLTextDense8BConfig +│ └── MoE Models (via MoEConfig) +│ ├── DeepSeekV3Config +│ ├── GptOssConfig +│ │ ├── GptOss21BA3P6Config +│ │ └── GptOss117BA5P8Config +│ ├── Qwen3MoEConfig +│ │ ├── Qwen3MoE30BA3Config +│ │ ├── Qwen3MoE235BA22Config +│ │ ├── Qwen3MoEFoPEConfig +│ │ ├── Qwen3VLTextMoE30BA3Config +│ │ └── Qwen3VLTextMoE235BA22Config +│ └── Qwen3_5_VLTextMoEConfig +│ └── Qwen3_5_VLTextMoE35BA3BConfig +└── BaseComposeConfig + ├── Qwen3VLBaseConfig + │ ├── Qwen3VLMoE30BA3Config + │ ├── Qwen3VLMoE235BA22Config + │ ├── Qwen3VLDense4BConfig + │ ├── Qwen3VLDense8BConfig + │ └── Qwen3_5_BaseConfig + │ └── Qwen3_5_VLMoE35BA3Config + ├── InternVLBaseConfig + │ ├── InternVL3P5Dense8BConfig + │ ├── InternVL3P5MoE30BA3Config + │ └── InternVL3P5Dense1BConfig + └── InternS1BaseConfig + ├── InternS1Config + └── InternS1MiniConfig +``` diff --git a/docs/zh_cn/pretrain_sft/advanced_tutorial/model.md b/docs/zh_cn/pretrain_sft/advanced_tutorial/model.md index e1f94714ce..a6ff72e0f9 100644 --- a/docs/zh_cn/pretrain_sft/advanced_tutorial/model.md +++ b/docs/zh_cn/pretrain_sft/advanced_tutorial/model.md @@ -1,3 +1,110 @@ # 模型 -Coming soon... 
+XTuner v1 的 `TrainEngine` 通过不同的 `TransformerConfig` 子类支持多种 Transformer 架构。下文总结了当前支持的模型(不包含 RL 相关配置)。 + +## 基类配置 + +下表列出**基类配置**,它们定义了各个模型系列,并提供了从 HuggingFace 加载预训练权重的 `from_hf` 接口。 + +| 基类配置 | 模型系列 | 架构类型 | 对应的 HuggingFace 模型 | +|---|---|---|---| +| `Qwen2DenseConfig` | Qwen2 Dense | Dense | `Qwen2ForCausalLM` | +| `Qwen3DenseConfig` | Qwen3 Dense | Dense | `Qwen3ForCausalLM` | +| `DeepSeekV3Config` | DeepSeek-V3 | MoE | `DeepseekV3ForCausalLM` | +| `GptOssConfig` | GPT-OSS | MoE | `GptOssForCausalLM` | +| `Qwen3MoEConfig` | Qwen3 MoE | MoE | `Qwen3MoeForCausalLM` | + +## 具体模型配置 + +下表列出**具体模型配置**,它们继承自上述基类,每个配置对应特定的模型规模或变体。 + +| 配置类名 | 基类 / 所属系列 | 架构类型 | 规模 / 说明 | +|---|---|---|---| +| `Qwen2Dense7BConfig` | `Qwen2DenseConfig` | Dense | 约 7B 参数 | +| `Qwen3Dense8BConfig` | `Qwen3DenseConfig` | Dense | 约 8B 参数 | +| `Qwen3Dense4BConfig` | `Qwen3DenseConfig` | Dense | 约 4B 参数 | +| `Qwen3Dense0P6BConfig` | `Qwen3DenseConfig` | Dense | 约 0.6B 参数 | +| `Qwen3VLTextDense4BConfig` | `Qwen3DenseConfig` | Dense(VL 文本主干) | 约 4B 参数,用于多模态 | +| `Qwen3VLTextDense8BConfig` | `Qwen3DenseConfig` | Dense(VL 文本主干) | 约 8B 参数,用于多模态 | +| `DeepSeekV3Config` | — | MoE | 约 671B 总参 / 约 37B 激活 | +| `GptOss21BA3P6Config` | `GptOssConfig` | MoE | 约 21B 总参 / 约 3.6B 激活 | +| `GptOss117BA5P8Config` | `GptOssConfig` | MoE | 约 117B 总参 / 约 5.8B 激活 | +| `Qwen3MoE30BA3Config` | `Qwen3MoEConfig` | MoE | 约 30B 总参 / 约 3B 激活 | +| `Qwen3MoE235BA22Config` | `Qwen3MoEConfig` | MoE | 约 235B 总参 / 约 22B 激活 | +| `Qwen3MoEFoPEConfig` | `Qwen3MoEConfig` | MoE | FoPE(基于频率的位置编码)变体 | +| `Qwen3VLTextMoE30BA3Config` | `Qwen3MoEConfig` | MoE(VL 文本主干) | 约 30B 总参,用于多模态 | +| `Qwen3VLTextMoE235BA22Config` | `Qwen3MoEConfig` | MoE(VL 文本主干) | 约 235B 总参,用于多模态 | +| `Qwen3_5_VLTextMoE35BA3BConfig` | `Qwen3_5_VLTextMoEConfig` | MoE(VL 文本主干) | 约 35B 总参 / 约 3B 激活,用于多模态 | + +## Compose 多模态模型 + +除了纯文本模型外,XTuner 还支持**多模态 Compose 模型**,它们将视觉编码器(vision encoder)、投影层(projector)和语言模型组合在一起。这些配置直接继承自 `BaseComposeConfig` 而非 
`TransformerConfig`,但其内部封装了上文列出的文本模型配置。 + +### Compose 基类配置 + +| 基类配置 | 模型系列 | 模态 | 说明 | +|---|---|---|---| +| `Qwen3VLBaseConfig` | Qwen3-VL | 图像/视频 + 文本 | 基于 Qwen3 文本主干的 VL 模型 | +| `InternVLBaseConfig` | InternVL | 图像 + 文本 | 基于 InternViT + Qwen3 的 VL 模型 | +| `InternS1BaseConfig` | InternS1 | 图像 + 文本 | 基于 InternViT + Qwen3 的科学多模态模型 | + +### 具体 Compose 模型配置 + +| 配置类名 | Compose 基类 / 系列 | 文本模型配置 | 规模 / 说明 | +|---|---|---|---| +| `Qwen3VLMoE30BA3Config` | `Qwen3VLBaseConfig` | `Qwen3VLTextMoE30BA3Config` | 约 30B 总参,MoE VL | +| `Qwen3VLMoE235BA22Config` | `Qwen3VLBaseConfig` | `Qwen3VLTextMoE235BA22Config` | 约 235B 总参,MoE VL | +| `Qwen3VLDense4BConfig` | `Qwen3VLBaseConfig` | `Qwen3VLTextDense4BConfig` | 约 4B 参数,Dense VL | +| `Qwen3VLDense8BConfig` | `Qwen3VLBaseConfig` | `Qwen3VLTextDense8BConfig` | 约 8B 参数,Dense VL | +| `Qwen3_5_VLMoE35BA3Config` | `Qwen3_5_BaseConfig` | `Qwen3_5_VLTextMoE35BA3BConfig` | 约 35B 总参 / 约 3B 激活,MoE VL | +| `InternVL3P5Dense8BConfig` | `InternVLBaseConfig` | `Qwen3Dense8BConfig` | 约 8B 参数,Dense VL | +| `InternVL3P5MoE30BA3Config` | `InternVLBaseConfig` | `Qwen3MoE30BA3Config` | 约 30B 总参,MoE VL | +| `InternVL3P5Dense1BConfig` | `InternVLBaseConfig` | `Qwen3Dense0P6BConfig` | 约 1B 参数,Dense VL | +| `InternS1Config` | `InternS1BaseConfig` | `Qwen3MoE235BA22Config` | 约 235B 总参,MoE 多模态 | +| `InternS1MiniConfig` | `InternS1BaseConfig` | `Qwen3Dense8BConfig` | 约 8B 参数,Dense 多模态 | + +## 继承关系 + +下图展示了 `TrainEngine` 支持的所有配置类的完整继承层级,包括 `TransformerConfig` 和 `BaseComposeConfig` 两大分支。 + +```text +XTunerBaseModelConfig +├── TransformerConfig +│ ├── Dense 模型 +│ │ ├── Qwen2DenseConfig +│ │ │ └── Qwen2Dense7BConfig +│ │ └── Qwen3DenseConfig +│ │ ├── Qwen3Dense8BConfig +│ │ ├── Qwen3Dense4BConfig +│ │ ├── Qwen3Dense0P6BConfig +│ │ ├── Qwen3VLTextDense4BConfig +│ │ └── Qwen3VLTextDense8BConfig +│ └── MoE 模型(经由 MoEConfig) +│ ├── DeepSeekV3Config +│ ├── GptOssConfig +│ │ ├── GptOss21BA3P6Config +│ │ └── GptOss117BA5P8Config +│ ├── Qwen3MoEConfig +│ │ ├── 
Qwen3MoE30BA3Config +│ │ ├── Qwen3MoE235BA22Config +│ │ ├── Qwen3MoEFoPEConfig +│ │ ├── Qwen3VLTextMoE30BA3Config +│ │ └── Qwen3VLTextMoE235BA22Config +│ └── Qwen3_5_VLTextMoEConfig +│ └── Qwen3_5_VLTextMoE35BA3BConfig +└── BaseComposeConfig + ├── Qwen3VLBaseConfig + │ ├── Qwen3VLMoE30BA3Config + │ ├── Qwen3VLMoE235BA22Config + │ ├── Qwen3VLDense4BConfig + │ ├── Qwen3VLDense8BConfig + │ └── Qwen3_5_BaseConfig + │ └── Qwen3_5_VLMoE35BA3Config + ├── InternVLBaseConfig + │ ├── InternVL3P5Dense8BConfig + │ ├── InternVL3P5MoE30BA3Config + │ └── InternVL3P5Dense1BConfig + └── InternS1BaseConfig + ├── InternS1Config + └── InternS1MiniConfig +```