fix speaker; optimize decoder for special tokens.

foldl · foldl · commit 79a56856abc8 · 2025-06-05T16:00:20.000+08:00
diff --git a/docs/models.md b/docs/models.md
@@ -280,12 +280,16 @@ Please use `--format completion` for these models.
         [SNAC-24kHz](https://huggingface.co/mlx-community/snac_24khz/tree/556af1cd3b1c5f2d294f6aa9bb886245d7b716ac) is used as codec.
         Use these additional command line options when converting: `--name Orpheus-TTS -a Orpheus-TTS --snac_model /path/to/snac_24kHz`
 
+        Use `--set voice XX` to select voice `XX`, such as `tara`. [More info](https://github.com/canopyai/Orpheus-TTS?tab=readme-ov-file#prompting).
+
 * OuteTTS:
     * [x] 1.0: [1B](https://huggingface.co/OuteAI/Llama-OuteTTS-1.0-1B/commit/911e296ce01148a01f3af9329163b0d298ac33a1), [0.6B](https://huggingface.co/OuteAI/OuteTTS-1.0-0.6B/tree/e7bcd87b0ca47fd8c46317c8f745a5e4e19c7b5c)
 
         [DAC.speech.v1.0 1.5kbps](https://huggingface.co/ibm-research/DAC.speech.v1.0/commits/main) is used as codec.
         Use these additional command line options when converting: `--name OuteTTS -a OuteTTS --dac_model /path/to/dac`
 
+        Use `--set speaker /path/to/speaker.json` to select a speaker profile. [More info](https://github.com/edwko/OuteTTS/blob/main/docs/interface_usage.md#creating-custom-speaker-profiles).
+
 ## Multimodal Models
 
 * Fuyu (`FuyuForCausalLM`)
diff --git a/models/oute.cpp b/models/oute.cpp
@@ -500,7 +500,8 @@ namespace tts_llama
         std::string normalized_text = text_normalization(text);
 
         std::string prompt;
-        if (!speaker.IsNull()) {
+        if (!speaker.IsNull())
+        {
             // Merge speaker text
             auto [merged_text, separator] = merge_speaker_text(normalized_text, speaker["text"].ToString());
             normalized_text = merged_text;
@@ -580,7 +581,7 @@ namespace tts_llama
 
     void Tokenizer::encode(const std::string &text, std::vector<int> &ids) const
     {
-        auto prompt = get_completion_prompt(text, json::JSON::_null);
+        auto prompt = get_completion_prompt(text);
         BaseTokenizer::encode(prompt, ids);
     }
 
@@ -689,7 +690,7 @@ namespace tts_qwen3
 
     void Tokenizer::encode(const std::string &text, std::vector<int> &ids) const
     {
-        auto prompt = tts_llama::get_completion_prompt(text, json::JSON::_null);
+        auto prompt = tts_llama::get_completion_prompt(text);
         BaseTokenizer::encode(prompt, ids);
     }
 
diff --git a/src/basics.h b/src/basics.h
@@ -72,4 +72,6 @@ namespace utils
     void parallel_for(int64_t start, int64_t end, std::function<void(int64_t)> func, int num_threads = 0);
 
     std::string load_file(const char *fn);
+
+    //#define TIME_STAMP (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count())
 }
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
@@ -5,6 +5,7 @@
 #include <cstring>
 #include <limits>
 #include <regex>
+#include <iostream>
 
 #include "unicode.h"
 #include "chat.h"
@@ -474,6 +475,7 @@ size_t BPEProcessor2::Load(DataReader *data_reader, int n_vocab)
     vocab_.id_to_token.resize(piece_size);
     load_vocab_merges(vocab_, reader);
     build_special_token_cache(vocab_);
+    searcher.rebuild(vocab_.special_tokens_cache);
 
     return reader.get_total_size();
 }
@@ -694,37 +696,115 @@ const std::string BPEProcessor3::IdToPiece(int id) const
     }
 }
 
-static std::string search_first_special_token(std::string &input, const _vocab &vocab, int &sp_tok_id)
+// Builds the subtree rooted at one character of the keyword tree.
+//
+// items: remaining suffixes (with their ids) of the keywords that pass
+//        through this node. The strings are emptied while they are consumed,
+//        so the vector is not usable by the caller afterwards.
+// ch:    character stored in the new node (rebuild() passes 0 for the root).
+// value: id stored in the node when `items` is empty, i.e. when the node is
+//        a leaf; a node that has children always stores -1.
+//
+// Returns a heap-allocated node; the caller takes ownership.
+//
+// Note: because only leaves keep an id, a keyword that is a strict prefix of
+// another keyword does not get its id stored in the tree.
+NearestKeywordSearcher::Node *NearestKeywordSearcher::make_tree(std::vector<Item> &items, char ch, int value)
 {
-    sp_tok_id = -1;
-    auto nearest_match = std::string::npos;
-    for (auto & st: vocab.special_tokens_cache)
-    {
-        const auto & special_id     = st.first;
-        const auto & special_token  = st.second;
+    // Owned locally until returned, so an exception below cannot leak the node.
+    auto r = std::make_unique<Node>();
+    r->ch = ch;
+    r->value = items.empty() ? value : -1;
 
-        auto match = input.find(special_token, 0);
+    // Each pass collects one group of items that share the same first character.
+    while (true)
+    {
+        bool flag = false;
+        char tag = 0;
+        int  v   = -1;
 
-        if (match < nearest_match)
+        std::vector<Item> sub;
+        for (int i = (int)items.size() - 1; i >= 0; i--)
         {
-            nearest_match = match;
-            sp_tok_id = special_id;
+            // empty string: already visited (or an empty keyword)
+            if (items[i].s.empty()) continue;
+
+            if (!flag)
+            {
+                // the first unvisited item decides which group this pass collects
+                flag = true;
+                tag = items[i].s[0];
+                v   = items[i].value;
+            }
+            else if (items[i].s[0] != tag)
+            {
+                continue;
+            }
+
+            // brace-init: `Item` is an aggregate, emplace_back(a, b) needs C++20
+            if (items[i].s.size() > 1)
+                sub.push_back(Item{items[i].s.substr(1), items[i].value});
+
+            // mark as visited
+            items[i].s.clear();
         }
+
+        if (!flag) break;
+
+        std::unique_ptr<Node> child(make_tree(sub, tag, v));
+        r->child.push_back(std::move(child));
     }
 
-    if (sp_tok_id >= 0)
+    // match() binary-searches the children by `ch`. The comparator has to be a
+    // strict weak ordering (`<`, not `<=`), otherwise std::sort is undefined.
+    std::sort(r->child.begin(), r->child.end(),
+              [](const auto &p1, const auto &p2) { return p1->ch < p2->ch; });
+
+    return r.release();
+}
+
+// Throws away the current tree and builds a new one from `keywords`
+// (keyword id -> keyword text).
+// NOTE(review): the map is taken by value, so each call copies it; switching
+// to a const reference also requires changing the declaration in tokenizer.h.
+void NearestKeywordSearcher::rebuild(const std::unordered_map<int, std::string> keywords)
+{
+    // release the previous tree before building its replacement
+    root.reset();
+
+    std::vector<Item> pending;
+    pending.reserve(keywords.size());
+
+    for (const auto &[id, text] : keywords)
     {
-        const auto & special_token  = vocab.special_tokens_cache.at(sp_tok_id);
-        std::string r = input.substr(0, nearest_match);
-        input = input.substr(nearest_match + special_token.size());
-        return r;
+        pending.push_back(Item{text, id});
     }
-    else
+
+    // the root node carries character 0 and no keyword id of its own
+    root.reset(make_tree(pending, 0, -1));
+}
+
+// Tries to match a keyword that starts exactly at input[index].
+//
+// node:  node to descend from (search() passes the root of the tree).
+// level: incremented once for every character consumed. When the caller
+//        starts it at 0 and the match succeeds, it is the keyword length.
+//        It may also have been incremented when the match fails.
+//
+// Returns the value of the first node without children that is reached, or
+// -1 when the input ends first or no child matches the next character.
+int NearestKeywordSearcher::match(const std::string &input, int index, Node *node, int &level) const
+{
+    const int size = (int)input.size();
+    const Node *cur = node;
+
+    for (int pos = index; ; pos++)
+    {
+        // only a node without children yields a result
+        if (cur->child.empty()) return cur->value;
+        if (pos >= size) return -1;
+
+        const char ch = input[pos];
+        const Node *next = nullptr;
+
+        // children are sorted by `ch` (see make_tree): binary search
+        int low  = 0;
+        int high = (int)cur->child.size() - 1;
+        while (low <= high)
+        {
+            // assuming no overflow
+            const int middle = (high + low) / 2;
+            const Node *candidate = cur->child[middle].get();
+            if (candidate->ch < ch)
+            {
+                low = middle + 1;
+            }
+            else if (ch < candidate->ch)
+            {
+                high = middle - 1;
+            }
+            else
+            {
+                next = candidate;
+                break;
+            }
+        }
+
+        if (next == nullptr) return -1;
+
+        // one more character consumed; continue from the matching child
+        level++;
+        cur = next;
+    }
+}
+
+// Splits `input` at the first keyword occurrence.
+//
+// Start positions are tried from left to right; the scan stops at the first
+// position where match() reports a keyword.
+//
+// input: on success reduced to the text after the keyword; emptied when no
+//        keyword is found.
+// kw_id: id of the keyword found, or a negative value when there is none
+//        (also for an empty input or when rebuild() has not been called).
+//
+// Returns the text in front of the keyword, or the whole input when no
+// keyword is found.
+std::string NearestKeywordSearcher::search(std::string &input, int &kw_id) const
+{
+    // Give the out-parameter a defined "not found" value even if the loop
+    // below never runs.
+    kw_id = -1;
+
+    // Without a tree (rebuild() not called yet) nothing can match, and
+    // match() must not be handed a null node.
+    Node *const tree = root.get();
+    const int size = tree != nullptr ? (int)input.size() : 0;
+
+    for (int index = 0; index < size; index++)
     {
-        std::string r(input);
-        input = "";
-        return r;
+        int len = 0;
+        kw_id = match(input, index, tree, len);
+        if (kw_id >= 0)
+        {
+            std::string r = input.substr(0, index);
+            // drop the leading text and the keyword itself
+            input.erase(0, index + len);
+            return r;
+        }
     }
+
+    // no keyword: hand over the whole text and leave `input` empty
+    std::string r;
+    r.swap(input);
+    return r;
 }
 
 int BPEProcessor2::DoEncode(const std::string &input,
@@ -734,7 +814,7 @@ int BPEProcessor2::DoEncode(const std::string &input,
     int sp_tok_id = -1;
     while (text.size() > 0)
     {
-        auto leading = search_first_special_token(text, vocab_, sp_tok_id);
+        auto leading = searcher.search(text, sp_tok_id);
         DoEncode2(leading, ids);
         if (sp_tok_id < 0) break;
         ids->push_back(sp_tok_id);
diff --git a/src/tokenizer.h b/src/tokenizer.h
@@ -193,6 +193,34 @@ class BPEProcessor1: public Processor
             std::vector<int> *ids) const override;
 };
 
+// Finds the first occurrence of any keyword from a fixed set inside a string.
+// The keywords are kept in a character tree: each node holds one character,
+// and a path from the root spells a keyword.
+class NearestKeywordSearcher
+{
+public:
+    // Replaces the keyword set; `keywords` maps keyword id -> keyword text.
+    // The previous tree is discarded.
+    void rebuild(const std::unordered_map<int, std::string> keywords);
+
+    // Scans `input` from the left for the first position where a keyword
+    // starts. On a hit: returns the text before the keyword, stores the
+    // keyword id in `kw_id` and leaves the text after the keyword in `input`.
+    // Otherwise returns the whole input and empties `input`; a negative
+    // `kw_id` means "no keyword". Callers should initialize `kw_id` to a
+    // negative value, as BPEProcessor2::DoEncode does.
+    std::string search(std::string &input, int &kw_id) const;
+
+protected:
+
+    // A keyword (or the not yet consumed rest of one) used while building the tree.
+    struct Item
+    {
+        std::string s;      // remaining characters
+        int value;          // keyword id
+    };
+    // One node of the keyword tree.
+    struct Node
+    {
+        char ch;            // character of this node (0 for the root)
+        int value;          // keyword id if the node has no children, otherwise -1
+        std::vector<std::unique_ptr<Node>> child;   // sorted by `ch` in make_tree()
+    };
+
+    // Recursively builds the node for `ch` from the suffixes in `items`
+    // (which are consumed); returns a node owned by the caller.
+    Node *make_tree(std::vector<Item> &items, char ch, int value);
+
+    // Matches a keyword starting at input[index] below `node`; `level` is
+    // incremented per consumed character. Returns the keyword id or -1.
+    int match(const std::string &input, int index, Node *node, int &level) const;
+
+    // Root of the keyword tree; null until rebuild() has been called.
+    std::unique_ptr<Node> root;
+};
+
 class BPEProcessor2: public Processor
 {
 public:
@@ -212,6 +240,7 @@ class BPEProcessor2: public Processor
             std::vector<int> *ids) const;
 
     std::vector<std::string> regex_exprs;
+    NearestKeywordSearcher searcher;
 };
 
 class BPEProcessor3: public BPEProcessor2

Original file line number	Diff line number	Diff line change
`@@ -500,7 +500,8 @@ namespace tts_llama`
`500`	`500`	`std::string normalized_text = text_normalization(text);`
`501`	`501`
`502`	`502`	`std::string prompt;`
`503`		`- if (!speaker.IsNull()) {`
	`503`	`+ if (!speaker.IsNull())`
	`504`	`+ {`
`504`	`505`	`// Merge speaker text`
`505`	`506`	`auto [merged_text, separator] = merge_speaker_text(normalized_text, speaker["text"].ToString());`
`506`	`507`	`normalized_text = merged_text;`
`@@ -580,7 +581,7 @@ namespace tts_llama`
`580`	`581`
`581`	`582`	`void Tokenizer::encode(const std::string &text, std::vector<int> &ids) const`
`582`	`583`	`{`
`583`		`- auto prompt = get_completion_prompt(text, json::JSON::_null);`
	`584`	`+ auto prompt = get_completion_prompt(text);`
`584`	`585`	`BaseTokenizer::encode(prompt, ids);`
`585`	`586`	`}`
`586`	`587`
`@@ -689,7 +690,7 @@ namespace tts_qwen3`
`689`	`690`
`690`	`691`	`void Tokenizer::encode(const std::string &text, std::vector<int> &ids) const`
`691`	`692`	`{`
`692`		`- auto prompt = tts_llama::get_completion_prompt(text, json::JSON::_null);`
	`693`	`+ auto prompt = tts_llama::get_completion_prompt(text);`
`693`	`694`	`BaseTokenizer::encode(prompt, ids);`
`694`	`695`	`}`
`695`	`696`
Original file line number	Diff line number	Diff line change
`@@ -72,4 +72,6 @@ namespace utils`
`72`	`72`	`void parallel_for(int64_t start, int64_t end, std::function<void(int64_t)> func, int num_threads = 0);`
`73`	`73`
`74`	`74`	`std::string load_file(const char *fn);`
	`75`	`+`
	`76`	`+ //#define TIME_STAMP (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count())`
`75`	`77`	`}`