Skip to content

Commit 3820919

Browse files
committed
add maya1
1 parent aa4ac00 commit 3820919

File tree

15 files changed

+119
-35
lines changed

15 files changed

+119
-35
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ set(core_files src/backend.cpp
9696
models/jina.cpp
9797
models/jiutian.cpp
9898
models/llama.cpp
99+
models/maya.cpp
99100
models/m_a_p.cpp
100101
models/megrez.cpp
101102
models/minicpm.cpp

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
3333

3434
**What's New:**
3535

36+
* 2025-11-06: Maya1
3637
* 2025-11-03: Ouro
3738
* 2025-10-31: Megrez2-3x7B-A3B
3839
* 2025-10-25: LLaDA2.0-mini

convert.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ class ModelType(Enum):
236236
OuteTTSQwen3 = 0x10000108
237237
QWen3_Embedding = 0x10000109
238238
QWen3_ReRanker = 0x1000010A
239+
Maya1 = 0x1000010B
239240

240241
LlaMAMulti = 0x20000001
241242

@@ -8443,9 +8444,9 @@ def main():
84438444
global g_model_meta
84448445
load_some_info(g_model_meta, Path(args.model_name_or_path))
84458446

8446-
if arch == 'orpheus-tts':
8447+
if arch in ['orpheus-tts', 'maya1']:
84478448
if args.snac_model == '':
8448-
raise Exception('snac_model (`--snac_model`) is required for Orpheus-TTS')
8449+
raise Exception(f'snac_model (`--snac_model`) is required for `{arch}`')
84498450
load_some_info(g_model_meta, Path(args.snac_model), 'snac_')
84508451
model_files += load_some_model(Path(args.snac_model))
84518452
config.snac_model = g_model_meta['snac_config.json']
@@ -8854,6 +8855,9 @@ def main():
88548855
Mistral2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
88558856
elif arch == 'orpheus-tts':
88568857
OrpheusTTSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
8858+
elif arch == 'maya1':
8859+
OrpheusTTSConverter.MODEL_TYPE = ModelType.Maya1
8860+
OrpheusTTSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
88578861
elif arch == 'outetts':
88588862
if config.architectures[0] == 'Qwen3ForCausalLM':
88598863
OuteTTSConverter.MODEL_TYPE = ModelType.OuteTTSQwen3

docs/models.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,17 @@ Please use `--format completion` for these models.
345345

346346
## TTS Models
347347

348+
* Maya1
349+
350+
* [maya1](https://huggingface.co/maya-research/maya1/tree/fbd30e2b3ec92d2e227df20005a73e172bc5d2de)
351+
352+
[SNAC-24kHz](https://huggingface.co/mlx-community/snac_24khz/tree/556af1cd3b1c5f2d294f6aa9bb886245d7b716ac) is used as the codec.
353+
Use these additional command line options when converting: `--name Maya1 -a maya1 --snac_model /path/to/snac_24kHz`
354+
355+
Use `--set voice XX` to describe the voice. [More info](https://huggingface.co/maya-research/maya1/blob/fbd30e2b3ec92d2e227df20005a73e172bc5d2de/prompt.txt).
356+
357+
IMPORTANT: don't forget to use `--max-new-tokens N` to control the length of the result.
358+
348359
* Orpheus TTS
349360
* [x] 3B: [EN](https://huggingface.co/canopylabs/orpheus-3b-0.1-ft/tree/4206a56e5a68cf6cf96900a8a78acd3370c02eb6), [ZH](https://huggingface.co/canopylabs/3b-zh-ft-research_release/commit/29d016d6d0e5a2688267d3b3e432b7e23f043876), etc
350361

models/bailing.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,6 @@ namespace chatllm::bailing::llada
396396
const bool continuous,
397397
bool &completed,
398398
ModelPerfInfo *performance,
399-
int gen_max_tokens,
400399
BaseStreamer *streamer = nullptr) override;
401400
protected:
402401
bool generate_next_block(const int *input_ids, const int ids_count, const GenerationConfig &gen_config,
@@ -558,7 +557,6 @@ namespace chatllm::bailing::llada
558557
const bool continuous,
559558
bool &completed,
560559
ModelPerfInfo *performance,
561-
int gen_max_tokens,
562560
BaseStreamer *streamer)
563561
{
564562
CHATLLM_CHECK(gen_config.max_length <= config_.max_length)
@@ -598,6 +596,7 @@ namespace chatllm::bailing::llada
598596
transformer->set_ctx((int)curr_input_ids.size());
599597
int next_output_idx = 0;
600598

599+
int gen_max_tokens = gen_config.max_new_tokens;
601600
if (gen_max_tokens > 0)
602601
gen_max_tokens = n_past + (int)curr_input_ids.size() + gen_max_tokens;
603602

models/maya.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#include <sstream>   // std::ostringstream (was <ostream>, which does not declare it)

#include "orpheus.h"

// Maya1 TTS: reuses the Orpheus-TTS model graph and SNAC vocoder, differing
// only in the prompt format (a <description="..."> voice prefix) and the
// end-of-turn token appended after the text.
namespace chatllm::maya::v1
{
    typedef orpheus::tts::Config Config;

    class Tokenizer : public orpheus::tts::Tokenizer
    {
    public:
        using orpheus::tts::Tokenizer::Tokenizer;

        // Builds the Maya1 prompt around `text` and tokenizes it:
        //   <bos><description="VOICE"> TEXT <eot>
        // `voice` (set via `--set voice XX`) overrides the default description.
        void encode(const std::string &text, std::vector<int> &ids) const override
        {
            // Fallback voice description used when the user supplies none.
            static const char *DEFAULT_VOICE =
                "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.";

            std::ostringstream oss;
            oss << "<description=\"";
            oss << (!voice.empty() ? voice : DEFAULT_VOICE);
            oss << "\"> ";
            oss << text;

            ids.push_back(bos_token_id);
            BaseTokenizer::encode(oss.str(), ids);
            // 128009 is the Llama-3 `<|eot_id|>` token id — presumably terminates
            // the prompt turn; TODO confirm against the maya1 tokenizer config.
            ids.push_back(128009);
        }
    };

    class ConditionalGeneration : public orpheus::tts::ConditionalGeneration
    {
    public:
        // Same graph as Orpheus-TTS; only the model type tag and the range of
        // SNAC custom (audio) token ids differ.
        // NOTE(review): an alternative range 128266..156937 was noted in the
        // original source — confirm which bound set matches the checkpoint.
        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config):
            orpheus::tts::ConditionalGeneration(config, runtime_config, MODEL_TYPE_MAYA1, 128256, 156938)
        {
        }
    };

    REGISTER_MODEL_LOADER(MAYA1, maya::v1, 1);
}

models/orpheus.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -403,11 +403,17 @@ namespace chatllm::orpheus::tts
403403
}
404404

405405
ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
406-
: llama::v3_2::ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS),
407-
snac_ctx(w_ctx_.get_backend_context())
406+
: ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS, 128256, 156938)
408407
{
409408
}
410409

410+
ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type, int custom_token_start, int custom_token_end) :
411+
llama::v3_2::ConditionalGeneration(config, runtime_config, type),
412+
snac_ctx(w_ctx_.get_backend_context()),
413+
custom_token_start(custom_token_start), custom_token_end(custom_token_end)
414+
{}
415+
416+
411417
void ConditionalGeneration::load(ModelLoader &loader)
412418
{
413419
llama::v3_2::ConditionalGeneration::load(loader);
@@ -426,7 +432,7 @@ namespace chatllm::orpheus::tts
426432
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
427433
#if (1)
428434
bool completed = false;
429-
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, 0, nullptr);
435+
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, nullptr);
430436
ggml::log(GGML_LOG_LEVEL_INFO, "%zd vocoder tokens generated.", tokens.size());
431437
#else
432438
const int tokens[] = {
@@ -442,9 +448,9 @@ namespace chatllm::orpheus::tts
442448

443449
for (auto id : tokens)
444450
{
445-
if (id < tok->custom_token_start || id > tok->custom_token_end) continue;
451+
if (id < custom_token_start || id > custom_token_end) continue;
446452

447-
decoder_push_llm_tok_id(gen_config, id - tok->custom_token_start, pcm_samples);
453+
decoder_push_llm_tok_id(gen_config, id - custom_token_start, pcm_samples);
448454
if (pcm_samples.size() == 8192)
449455
{
450456
for (int i = 2048; i < 4096; i++)

models/orpheus.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,6 @@ namespace chatllm::orpheus::tts
150150
void encode(const std::string &text, std::vector<int> &ids) const override;
151151
public:
152152
std::string voice;
153-
const int custom_token_start = 128256;
154-
const int custom_token_end = 156938;
155153
};
156154

157155
class ConditionalGeneration : public llama::v3_2::ConditionalGeneration
@@ -168,12 +166,16 @@ namespace chatllm::orpheus::tts
168166

169167
bool load_more(const json::JSON &config) override;
170168
protected:
169+
ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type, int custom_token_start, int custom_token_end);
170+
171171
void reset_decoder(void);
172172
void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples);
173173
protected:
174174
InitContext snac_ctx;
175175
snac::Config codec_config;
176176
std::unique_ptr<snac::Codec> codec;
177177
std::vector<int> vocoder_ids;
178+
const int custom_token_start;
179+
const int custom_token_end;
178180
};
179181
}

models/oute.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -643,7 +643,7 @@ namespace chatllm::oute::tts_llama
643643
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
644644

645645
bool completed = false;
646-
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, 0, nullptr);
646+
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, nullptr);
647647
ggml::log(GGML_LOG_LEVEL_INFO, "%zd vocoder tokens generated.", tokens.size());
648648

649649
generate_audio(gen_config, codec,
@@ -729,7 +729,7 @@ namespace chatllm::oute::tts_qwen3
729729
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
730730

731731
bool completed = false;
732-
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, 0, nullptr);
732+
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, nullptr);
733733
ggml::log(GGML_LOG_LEVEL_INFO, "%zd vocoder tokens generated.", tokens.size());
734734

735735
tts_llama::ConditionalGeneration::generate_audio(gen_config, codec,

scripts/models.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3515,5 +3515,21 @@
35153515
}
35163516
}
35173517
}
3518+
},
3519+
"maya1": {
3520+
"brief": "Maya1 is a speech model built for expressive voice generation with rich human emotion and precise voice design.",
3521+
"default": "3.3b",
3522+
"license": "Apache License 2.0",
3523+
"variants": {
3524+
"3.3b": {
3525+
"default": "q8",
3526+
"quantized": {
3527+
"q8": {
3528+
"size": 3544675872,
3529+
"url": "chatllm_quantized_maya/maya1-3.3b.bin"
3530+
}
3531+
}
3532+
}
3533+
}
35183534
}
35193535
}

0 commit comments

Comments
 (0)