
Commit 1ce3db6

add Nanonets-OCR2

1 parent: 84f755f

9 files changed: +257 −45 lines changed

README.md

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
 
 **What's New:**
 
+* 2025-10-13: Nanonets-OCR2
 * 2025-10-13: dots.ocr
 * 2025-10-10: [I can draw](./docs/multimodal.md): Janus-Pro
 * 2025-09-23: Qwen2.5-VL

convert.py

Lines changed: 60 additions & 3 deletions
@@ -241,6 +241,7 @@ class ModelType(Enum):
     Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001
 
     Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
+    Qwen2_VL = ModelTypeTagChatImageVideoIn + 0x0000002
     KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
     SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200
 
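Aside: the enum arithmetic above gives each model family its own ID range — a capability tag plus a small per-model offset. A minimal sketch of the pattern (the tag's real value is defined elsewhere in convert.py; 0x10000000 below is a placeholder for illustration only):

ModelTypeTagChatImageVideoIn = 0x10000000  # placeholder value, not the real constant

Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001  # existing entry
Qwen2_VL  = ModelTypeTagChatImageVideoIn + 0x0000002  # this commit's new entry
KimiVL    = ModelTypeTagChatImageVideoIn + 0x0000100  # next sub-range in the same family

# The tag identifies the capability class (chat + image/video input);
# the offset distinguishes concrete models within it.
assert Qwen2_VL != Qwen2_5VL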
@@ -4479,6 +4480,58 @@ def get_weight_names(config):
 
         return weight_names
 
+class QWen2_VLConverter(BaseConverter):
+    MODEL_TYPE = ModelType.Qwen2_VL
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = QWen2_5VLConverter.state_dict_pp(config, state_dict)
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.vision_config['hidden_act'] == 'quick_gelu'
+        config.vision_config['hidden_act'] = 'silu'
+        config.vision_config['hidden_size'] = config.vision_config['embed_dim']
+        QWen2_5VLConverter.dump_config(f, config, ggml_type)
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = QWen2Converter.get_weight_names(config if config.text_config is None else AttributeDict(config.text_config))
+
+        for i in range(config.vision_config['depth']):
+            weight_names += [
+                f"visual.blocks.{i}.attn.proj.bias",
+                f"visual.blocks.{i}.attn.proj.weight",
+                f"visual.blocks.{i}.attn.q_proj.bias",
+                f"visual.blocks.{i}.attn.q_proj.weight",
+                f"visual.blocks.{i}.attn.k_proj.bias",
+                f"visual.blocks.{i}.attn.k_proj.weight",
+                f"visual.blocks.{i}.attn.v_proj.bias",
+                f"visual.blocks.{i}.attn.v_proj.weight",
+                f"visual.blocks.{i}.mlp.fc1.bias",
+                f"visual.blocks.{i}.mlp.fc1.weight",
+                f"visual.blocks.{i}.mlp.fc2.bias",
+                f"visual.blocks.{i}.mlp.fc2.weight",
+                f"visual.blocks.{i}.norm1.bias",
+                f"visual.blocks.{i}.norm1.weight",
+                f"visual.blocks.{i}.norm2.bias",
+                f"visual.blocks.{i}.norm2.weight",
+            ]
+
+        weight_names += [
+            "visual.merger.ln_q.bias",
+            "visual.merger.ln_q.weight",
+            "visual.merger.mlp.0.bias",
+            "visual.merger.mlp.0.weight",
+            "visual.merger.mlp.2.bias",
+            "visual.merger.mlp.2.weight",
+            "visual.patch_embed.proj.0.weight",
+            "visual.patch_embed.proj.1.weight",
+        ]
+
+        return weight_names
+
 class QWen2_5VLConverter(BaseConverter):
     MODEL_TYPE = ModelType.Qwen2_5VL

@@ -4507,21 +4560,23 @@ def state_dict_pp(cls, config, state_dict):
 
     @staticmethod
     def dump_config(f, config, ggml_type):
-        assert config.rope_scaling['type'] == 'mrope', 'rope_scaling must be mrope'
+        #assert config.rope_scaling['type'] == 'mrope', 'rope_scaling must be mrope'
         assert config.vision_config['hidden_act'] == 'silu'
 
        QWen2Converter.dump_config(f, config, ggml_type)
 
        MROPE_SECTION_MAX = 4
 
+        text_config = config if config.text_config is None else AttributeDict(config.text_config)
+
        config_values = [
-            config.tie_word_embeddings if config.tie_word_embeddings is not None else 0
+            text_config.tie_word_embeddings if text_config.tie_word_embeddings is not None else 0
        ] + pad_to_len(config.rope_scaling['mrope_section'], MROPE_SECTION_MAX)
        f.write(struct.pack("<" + "i" * len(config_values), *config_values))
 
     @staticmethod
     def get_weight_names(config):
-        weight_names = QWen2Converter.get_weight_names(config)
+        weight_names = QWen2Converter.get_weight_names(config if config.text_config is None else AttributeDict(config.text_config))
 
        for i in range(config.vision_config['depth']):
            weight_names += [
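Aside: the config_values written here are a tie_word_embeddings flag followed by the mrope_section list padded to MROPE_SECTION_MAX entries, packed as little-endian int32. A sketch of the packing, assuming pad_to_len (defined elsewhere in convert.py) right-pads with zeros, and using a mrope_section value typical of Qwen2-VL checkpoints for illustration:

import struct

def pad_to_len(values, length, pad=0):
    # Assumed behaviour of convert.py's pad_to_len helper: right-pad with zeros.
    return values + [pad] * (length - len(values))

MROPE_SECTION_MAX = 4
mrope_section = [16, 24, 24]  # illustrative value

config_values = [1] + pad_to_len(mrope_section, MROPE_SECTION_MAX)  # flag + padded sections
blob = struct.pack("<" + "i" * len(config_values), *config_values)  # five little-endian int32s
assert blob == b'\x01\x00\x00\x00\x10\x00\x00\x00\x18\x00\x00\x00\x18\x00\x00\x00\x00\x00\x00\x00'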
@@ -8501,6 +8556,8 @@ def main():
        QWen2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
    elif arch == 'Qwen2AudioForConditionalGeneration':
        QWen2AudioConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'Qwen2VLForConditionalGeneration':
+        QWen2_VLConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
    elif arch == 'Qwen2_5_VLForConditionalGeneration':
        QWen2_5VLConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
    elif arch == 'KimiVLForConditionalGeneration':
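Aside: with this branch, a checkpoint whose config.json lists Qwen2VLForConditionalGeneration as its architecture is routed to the new converter. A table-driven restatement of that dispatch (illustrative only — convert.py itself uses the explicit elif chain above; only the two Qwen architecture names are confirmed by this diff):

import json

ARCH_TO_CONVERTER = {
    'Qwen2VLForConditionalGeneration':    'QWen2_VLConverter',
    'Qwen2_5_VLForConditionalGeneration': 'QWen2_5VLConverter',
}

def pick_converter(config_path: str) -> str:
    # Read the HF config and look up the first declared architecture.
    with open(config_path) as f:
        arch = json.load(f)['architectures'][0]
    return ARCH_TO_CONVERTER.get(arch, 'unsupported')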

docs/models.md

Lines changed: 3 additions & 0 deletions
@@ -384,6 +384,9 @@ Please use `--format completion` for these models.
 Note: Prompt for OCR: _{{image:...}}Extract the text content from this image_. [Here](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py)
 are other prompts for OCR. Use `+single-turn` to discard history automatically.
 
+* Nanonets-OCR2 (`Qwen2VLForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`)
+    * [x] OCR2: [3B](https://huggingface.co/nanonets/Nanonets-OCR2-3B/tree/d0368059ad151ce9e38f526890cfd4f27b28be65), [1.5B](https://huggingface.co/nanonets/Nanonets-OCR2-1.5B-exp/tree/306a9b2a65672a3dbebd9bce9a9373a9a18674a2)
+
 ## RAG Models
 
 * Text Embedding (`XLMRobertaModel`)
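Aside: Nanonets-OCR2 follows the same {{image:...}} prompt convention documented above. A small hypothetical helper that builds such a prompt string (only the tag syntax comes from these docs; the helper is illustrative):

def ocr_prompt(image_path: str,
               instruction: str = "Extract the text content from this image") -> str:
    # Doubled braces emit literal '{{' and '}}' in the f-string.
    return f"{{{{image:{image_path}}}}}{instruction}"

print(ocr_prompt("invoice.png"))
# -> {{image:invoice.png}}Extract the text content from this image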

models/dots.cpp

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@ namespace chatllm::dots::vit
            norm(ctx, config.hidden_size)
        {}
 
-        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w)
+        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w) override
        {
            ggml::tensor *x = nullptr;
            x = proj.forward(ctx, input);
@@ -234,7 +234,7 @@ namespace chatllm::dots::vit
            loaded = true;
        }
 
-        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w)
+        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w) override
        {
            pos_helper->prepare(grid_h, grid_w);

models/moonshot.cpp

Lines changed: 3 additions & 3 deletions
@@ -162,7 +162,7 @@ namespace chatllm::kimi::vit
            return ggml::nelements(pos_emb);
        }
 
-        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w)
+        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w) override
        {
            CHATLLM_CHECK(ggml::get_dim(input, 3) == 1);
 
@@ -204,7 +204,7 @@ namespace chatllm::kimi::vit
        {
        }
 
-        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w)
+        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w) override
        {
            ggml::tensor *x = proj.forward(ctx, input);
            x = ggml::reshape_3d(ctx, x, ggml::get_dim(x, 2), grid_h, grid_w);
@@ -422,7 +422,7 @@ namespace chatllm::kimi::vit
            loaded = true;
        }
 
-        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w)
+        ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int grid_h, int grid_w) override
        {
            multi_modal_projector.merge_param.grid_h = grid_h;
            multi_modal_projector.merge_param.grid_w = grid_w;
