Skip to content

Commit 3820919

Browse files
committed
add maya1
1 parent aa4ac00 commit 3820919

File tree

15 files changed

+119
-35
lines changed

15 files changed

+119
-35
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ set(core_files src/backend.cpp
9696
models/jina.cpp
9797
models/jiutian.cpp
9898
models/llama.cpp
99+
models/maya.cpp
99100
models/m_a_p.cpp
100101
models/megrez.cpp
101102
models/minicpm.cpp

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
3333

3434
**What's New:**
3535

36+
* 2025-11-06: Maya1
3637
* 2025-11-03: Ouro
3738
* 2025-10-31: Megrez2-3x7B-A3B
3839
* 2025-10-25: LLaDA2.0-mini

convert.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ class ModelType(Enum):
236236
OuteTTSQwen3 = 0x10000108
237237
QWen3_Embedding = 0x10000109
238238
QWen3_ReRanker = 0x1000010A
239+
Maya1 = 0x1000010B
239240

240241
LlaMAMulti = 0x20000001
241242

@@ -8443,9 +8444,9 @@ def main():
84438444
global g_model_meta
84448445
load_some_info(g_model_meta, Path(args.model_name_or_path))
84458446

8446-
if arch == 'orpheus-tts':
8447+
if arch in ['orpheus-tts', 'maya1']:
84478448
if args.snac_model == '':
8448-
raise Exception('snac_model (`--snac_model`) is required for Orpheus-TTS')
8449+
raise Exception(f'snac_model (`--snac_model`) is required for `{arch}`')
84498450
load_some_info(g_model_meta, Path(args.snac_model), 'snac_')
84508451
model_files += load_some_model(Path(args.snac_model))
84518452
config.snac_model = g_model_meta['snac_config.json']
@@ -8854,6 +8855,9 @@ def main():
88548855
Mistral2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
88558856
elif arch == 'orpheus-tts':
88568857
OrpheusTTSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
8858+
elif arch == 'maya1':
8859+
OrpheusTTSConverter.MODEL_TYPE = ModelType.Maya1
8860+
OrpheusTTSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
88578861
elif arch == 'outetts':
88588862
if config.architectures[0] == 'Qwen3ForCausalLM':
88598863
OuteTTSConverter.MODEL_TYPE = ModelType.OuteTTSQwen3

docs/models.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,17 @@ Please use `--format completion` for these models.
345345

346346
## TTS Models
347347

348+
* Maya1
349+
350+
* [maya1](https://huggingface.co/maya-research/maya1/tree/fbd30e2b3ec92d2e227df20005a73e172bc5d2de)
351+
352+
[SNAC-24kHz](https://huggingface.co/mlx-community/snac_24khz/tree/556af1cd3b1c5f2d294f6aa9bb886245d7b716ac) is used as the codec.
353+
Use these additional command line options when converting: `--name Maya1 -a maya1 --snac_model /path/to/snac_24kHz`
354+
355+
Use `--set voice XX` to describe the voice. [More info](https://huggingface.co/maya-research/maya1/blob/fbd30e2b3ec92d2e227df20005a73e172bc5d2de/prompt.txt).
356+
357+
IMPORTANT: don't forget to use `--max-new-tokens N` to control the length of the result.
358+
348359
* Orpheus TTS
349360
* [x] 3B: [EN](https://huggingface.co/canopylabs/orpheus-3b-0.1-ft/tree/4206a56e5a68cf6cf96900a8a78acd3370c02eb6), [ZH](https://huggingface.co/canopylabs/3b-zh-ft-research_release/commit/29d016d6d0e5a2688267d3b3e432b7e23f043876), etc
350361

models/bailing.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,6 @@ namespace chatllm::bailing::llada
396396
const bool continuous,
397397
bool &completed,
398398
ModelPerfInfo *performance,
399-
int gen_max_tokens,
400399
BaseStreamer *streamer = nullptr) override;
401400
protected:
402401
bool generate_next_block(const int *input_ids, const int ids_count, const GenerationConfig &gen_config,
@@ -558,7 +557,6 @@ namespace chatllm::bailing::llada
558557
const bool continuous,
559558
bool &completed,
560559
ModelPerfInfo *performance,
561-
int gen_max_tokens,
562560
BaseStreamer *streamer)
563561
{
564562
CHATLLM_CHECK(gen_config.max_length <= config_.max_length)
@@ -598,6 +596,7 @@ namespace chatllm::bailing::llada
598596
transformer->set_ctx((int)curr_input_ids.size());
599597
int next_output_idx = 0;
600598

599+
int gen_max_tokens = gen_config.max_new_tokens;
601600
if (gen_max_tokens > 0)
602601
gen_max_tokens = n_past + (int)curr_input_ids.size() + gen_max_tokens;
603602

models/maya.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#include <sstream>   // std::ostringstream (was <ostream>, which does not declare it)

#include "orpheus.h"

// Maya1 TTS: reuses the Orpheus-TTS model graph and SNAC vocoder, differing
// only in the prompt format (a <description="..."> voice prefix) and the
// end-of-turn token appended after the text.
namespace chatllm::maya::v1
{
    typedef orpheus::tts::Config Config;

    class Tokenizer : public orpheus::tts::Tokenizer
    {
    public:
        using orpheus::tts::Tokenizer::Tokenizer;

        // Builds the Maya1 prompt around `text` and tokenizes it:
        //   <bos><description="VOICE"> TEXT <eot>
        // `voice` (set via `--set voice XX`) overrides the default description.
        void encode(const std::string &text, std::vector<int> &ids) const override
        {
            // Fallback voice description used when the user supplies none.
            static const char *DEFAULT_VOICE =
                "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.";

            std::ostringstream oss;
            oss << "<description=\"";
            oss << (!voice.empty() ? voice : DEFAULT_VOICE);
            oss << "\"> ";
            oss << text;

            ids.push_back(bos_token_id);
            BaseTokenizer::encode(oss.str(), ids);
            // 128009 is the Llama-3 `<|eot_id|>` token id — presumably terminates
            // the prompt turn; TODO confirm against the maya1 tokenizer config.
            ids.push_back(128009);
        }
    };

    class ConditionalGeneration : public orpheus::tts::ConditionalGeneration
    {
    public:
        // Same graph as Orpheus-TTS; only the model type tag and the range of
        // SNAC custom (audio) token ids differ.
        // NOTE(review): an alternative range 128266..156937 was noted in the
        // original source — confirm which bound set matches the checkpoint.
        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config):
            orpheus::tts::ConditionalGeneration(config, runtime_config, MODEL_TYPE_MAYA1, 128256, 156938)
        {
        }
    };

    REGISTER_MODEL_LOADER(MAYA1, maya::v1, 1);
}

models/orpheus.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -403,11 +403,17 @@ namespace chatllm::orpheus::tts
403403
}
404404

405405
ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
406-
: llama::v3_2::ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS),
407-
snac_ctx(w_ctx_.get_backend_context())
406+
: ConditionalGeneration(config, runtime_config, MODEL_TYPE_ORPHEUS_TTS, 128256, 156938)
408407
{
409408
}
410409

410+
ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type, int custom_token_start, int custom_token_end) :
411+
llama::v3_2::ConditionalGeneration(config, runtime_config, type),
412+
snac_ctx(w_ctx_.get_backend_context()),
413+
custom_token_start(custom_token_start), custom_token_end(custom_token_end)
414+
{}
415+
416+
411417
void ConditionalGeneration::load(ModelLoader &loader)
412418
{
413419
llama::v3_2::ConditionalGeneration::load(loader);
@@ -426,7 +432,7 @@ namespace chatllm::orpheus::tts
426432
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
427433
#if (1)
428434
bool completed = false;
429-
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, 0, nullptr);
435+
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, nullptr);
430436
ggml::log(GGML_LOG_LEVEL_INFO, "%zd vocoder tokens generated.", tokens.size());
431437
#else
432438
const int tokens[] = {
@@ -442,9 +448,9 @@ namespace chatllm::orpheus::tts
442448

443449
for (auto id : tokens)
444450
{
445-
if (id < tok->custom_token_start || id > tok->custom_token_end) continue;
451+
if (id < custom_token_start || id > custom_token_end) continue;
446452

447-
decoder_push_llm_tok_id(gen_config, id - tok->custom_token_start, pcm_samples);
453+
decoder_push_llm_tok_id(gen_config, id - custom_token_start, pcm_samples);
448454
if (pcm_samples.size() == 8192)
449455
{
450456
for (int i = 2048; i < 4096; i++)

models/orpheus.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,6 @@ namespace chatllm::orpheus::tts
150150
void encode(const std::string &text, std::vector<int> &ids) const override;
151151
public:
152152
std::string voice;
153-
const int custom_token_start = 128256;
154-
const int custom_token_end = 156938;
155153
};
156154

157155
class ConditionalGeneration : public llama::v3_2::ConditionalGeneration
@@ -168,12 +166,16 @@ namespace chatllm::orpheus::tts
168166

169167
bool load_more(const json::JSON &config) override;
170168
protected:
169+
ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type, int custom_token_start, int custom_token_end);
170+
171171
void reset_decoder(void);
172172
void decoder_push_llm_tok_id(const GenerationConfig &gen_config, int id, std::vector<float> &pcm_samples);
173173
protected:
174174
InitContext snac_ctx;
175175
snac::Config codec_config;
176176
std::unique_ptr<snac::Codec> codec;
177177
std::vector<int> vocoder_ids;
178+
const int custom_token_start;
179+
const int custom_token_end;
178180
};
179181
}

models/oute.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -643,7 +643,7 @@ namespace chatllm::oute::tts_llama
643643
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
644644

645645
bool completed = false;
646-
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, 0, nullptr);
646+
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, nullptr);
647647
ggml::log(GGML_LOG_LEVEL_INFO, "%zd vocoder tokens generated.", tokens.size());
648648

649649
generate_audio(gen_config, codec,
@@ -729,7 +729,7 @@ namespace chatllm::oute::tts_qwen3
729729
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
730730

731731
bool completed = false;
732-
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, 0, nullptr);
732+
auto tokens = generate(input_ids, gen_config, false, completed, nullptr, nullptr);
733733
ggml::log(GGML_LOG_LEVEL_INFO, "%zd vocoder tokens generated.", tokens.size());
734734

735735
tts_llama::ConditionalGeneration::generate_audio(gen_config, codec,

scripts/models.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3515,5 +3515,21 @@
35153515
}
35163516
}
35173517
}
3518+
},
3519+
"maya1": {
3520+
"brief": "Maya1 is a speech model built for expressive voice generation with rich human emotion and precise voice design.",
3521+
"default": "3.3b",
3522+
"license": "Apache License 2.0",
3523+
"variants": {
3524+
"3.3b": {
3525+
"default": "q8",
3526+
"quantized": {
3527+
"q8": {
3528+
"size": 3544675872,
3529+
"url": "chatllm_quantized_maya/maya1-3.3b.bin"
3530+
}
3531+
}
3532+
}
3533+
}
35183534
}
35193535
}

0 commit comments

Comments
 (0)