Skip to content

Commit 389571c

Browse files
committed
kimi-vl: support video input, plus some refactoring.
1 parent 108c585 commit 389571c

File tree

9 files changed

+137
-75
lines changed

9 files changed

+137
-75
lines changed

models/baichuan.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -404,14 +404,9 @@ namespace m1
404404

405405
void set_additional_args(const std::map<std::string, std::string> &args) override
406406
{
407-
auto it = args.find("chat_template");
408-
if (it != args.end())
409-
{
410-
if (it->second == "im")
411-
{
407+
if (utils::get_opt(args, "chat_template", "") == "im")
412408
tokenizer->set_chat_encoder(&_im_chat_encoder);
413-
}
414-
}
409+
415410
}
416411

417412
private:

models/gemma.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -874,12 +874,7 @@ class ConditionalGeneration : public BaseModelForConditionalGeneration
874874
void set_additional_args(const std::map<std::string, std::string> &args) override
875875
{
876876
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
877-
auto it = args.find("do_pan_and_scan");
878-
if (it == args.end()) it = args.find("do-pan-and-scan");
879-
if (it != args.end())
880-
{
881-
tok->do_pan_and_scan = it->second != "0";
882-
}
877+
tok->do_pan_and_scan = utils::get_opt(args, "do-pan-and-scan", false);
883878
}
884879

885880
void before_generate(const GenerationConfig &gen_config) override

models/kimi.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,8 @@ namespace vl
586586
int media_content_token_id;
587587
int media_end_token_id;
588588
int media_pad_token_id;
589+
590+
int video_max_frames = 20;
589591
};
590592

591593
void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
@@ -678,6 +680,12 @@ namespace vl
678680
_chat_encoder.vit_loaded = visual.load(loader);
679681
}
680682

683+
void set_additional_args(const std::map<std::string, std::string> &args) override
684+
{
685+
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
686+
tok->video_max_frames = utils::get_opt(args, "video_max_frames", tok->video_max_frames);
687+
}
688+
681689
void before_generate(const GenerationConfig &gen_config) override
682690
{
683691
std::vector<uint8_t> buf;
@@ -697,7 +705,33 @@ namespace vl
697705
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
698706
append_user_opening(round_idx, ids);
699707

708+
std::vector<std::unique_ptr<vision::VideoLoader>> videos;
709+
710+
// expand video into images
711+
std::vector<ContentPiece> pieces;
700712
for (auto &piece : user.pieces)
713+
{
714+
if (piece.type != ContentPiece::Type::Video)
715+
{
716+
pieces.push_back(piece);
717+
continue;
718+
}
719+
720+
// A video is treated as just a collection of images.
721+
// However, it is still unclear which fps should be used; see:
722+
// https://github.com/MoonshotAI/Kimi-VL/issues/24#issuecomment-2804163270
723+
auto video = new vision::VideoLoader(piece.content.c_str(), 1.0f, tok->video_max_frames);
724+
videos.emplace_back(video);
725+
if (video->frames.size() < 1)
726+
continue;
727+
728+
for (size_t i = 0; i < video->frames.size(); i++)
729+
{
730+
pieces.emplace_back(video->frames[i], ContentPiece::Type::Image);
731+
}
732+
}
733+
734+
for (auto &piece : pieces)
701735
{
702736
if (piece.type == ContentPiece::Type::Text)
703737
{

models/orpheus.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -544,11 +544,7 @@ namespace tts
544544
void set_additional_args(const std::map<std::string, std::string> &args) override
545545
{
546546
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
547-
auto it = args.find("voice");
548-
if (it != args.end())
549-
{
550-
tok->voice = it->second;
551-
}
547+
tok->voice = utils::get_opt(args, "voice", tok->voice);
552548
}
553549

554550
bool load_more(const json::JSON &config) override

models/oute.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -539,10 +539,10 @@ namespace tts_llama
539539

540540
static void load_speaker_from_args(const std::map<std::string, std::string> &args)
541541
{
542-
auto x = args.find("speaker");
543-
if (x != args.end())
542+
auto x = utils::get_opt(args, "speaker", "");
543+
if (x != "")
544544
{
545-
speaker = json::JSON::Load(utils::load_file(x->second.c_str()));
545+
speaker = json::JSON::Load(utils::load_file(x.c_str()));
546546
}
547547
}
548548

models/smollm.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -651,12 +651,7 @@ namespace vlm
651651
void set_additional_args(const std::map<std::string, std::string> &args) override
652652
{
653653
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
654-
auto it = args.find("do_split");
655-
if (it == args.end()) it = args.find("do-split");
656-
if (it != args.end())
657-
{
658-
tok->do_split = it->second != "0";
659-
}
654+
tok->do_split = utils::get_opt(args, "do-split", false);
660655
}
661656

662657
void before_generate(const GenerationConfig &gen_config) override

src/basics.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <functional>
66
#include <algorithm>
77
#include <sstream>
8+
#include <map>
89

910
#if defined(_WIN32)
1011
#define strcasecmp stricmp
@@ -81,5 +82,14 @@ namespace utils
8182
// create a unique temp file name (full path)
8283
std::string tmpname(void);
8384

85+
bool is_same_command_option(const char *a, const char *b);
86+
bool is_same_command_option(const std::string &a, const std::string &b);
87+
88+
int get_opt(const std::map<std::string, std::string> &options, const char *key, int def);
89+
double get_opt(const std::map<std::string, std::string> &options, const char *key, double def);
90+
bool get_opt(const std::map<std::string, std::string> &options, const char *key, const bool def);
91+
std::string get_opt(const std::map<std::string, std::string> &options, const char *key, const char *def);
92+
std::string get_opt(const std::map<std::string, std::string> &options, const char *key, const std::string &def);
93+
8494
//#define TIME_STAMP (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count())
8595
}

src/main.cpp

Lines changed: 19 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -132,28 +132,6 @@ static std::string show_default_thought_tags(void)
132132
return tags;
133133
}
134134

135-
static bool is_same_command_option(const char *a, const char *b)
136-
{
137-
while (*a && *b)
138-
{
139-
char ca = *a;
140-
char cb = *b;
141-
if (ca == '-') ca = '_';
142-
if (cb == '-') cb = '_';
143-
if (ca != cb)
144-
return false;
145-
a++;
146-
b++;
147-
}
148-
149-
return *a == *b;
150-
}
151-
152-
bool is_same_command_option(const std::string &a, const std::string &b)
153-
{
154-
return is_same_command_option(a.c_str(), b.c_str());
155-
}
156-
157135
void usage(const std::string &prog)
158136
{
159137
Args args;
@@ -309,31 +287,31 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
309287
const size_t argc = argv.size();
310288

311289
#define handle_para0(fmt1, field, f) \
312-
else if ((is_same_command_option(arg, fmt1))) \
290+
else if ((utils::is_same_command_option(arg, fmt1))) \
313291
{ \
314292
c++; \
315293
if (c < argc) \
316294
args.field = f(argv[c].c_str()); \
317295
}
318296

319297
#define handle_param(fmt1, fmt2, field, f) \
320-
else if ((is_same_command_option(arg, fmt1)) || (is_same_command_option(arg, fmt2))) \
298+
else if ((utils::is_same_command_option(arg, fmt1)) || (utils::is_same_command_option(arg, fmt2))) \
321299
{ \
322300
c++; \
323301
if (c < argc) \
324302
args.field = f(argv[c].c_str()); \
325303
}
326304

327305
#define append_param(fmt1, field, f) \
328-
else if ((is_same_command_option(arg, fmt1))) \
306+
else if ((utils::is_same_command_option(arg, fmt1))) \
329307
{ \
330308
c++; \
331309
if (c < argc) \
332310
args.field.push_back(f(argv[c].c_str())); \
333311
}
334312

335313
#define handle_flag(field) \
336-
else if ((is_same_command_option(arg, "+" #field)) || (is_same_command_option(arg, "--" #field))) \
314+
else if ((utils::is_same_command_option(arg, "+" #field)) || (utils::is_same_command_option(arg, "--" #field))) \
337315
{ \
338316
args.field = true; \
339317
}
@@ -345,19 +323,21 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
345323
while (c < argc)
346324
{
347325
const char *arg = argv[c].c_str();
348-
if ((is_same_command_option(arg, "--help")) || (is_same_command_option(arg, "-h")) || (is_same_command_option(arg, "-?")))
326+
if ((utils::is_same_command_option(arg, "--help"))
327+
|| (utils::is_same_command_option(arg, "-h"))
328+
|| (utils::is_same_command_option(arg, "-?")))
349329
{
350330
args.show_help = true;
351331
}
352-
else if ((strcmp(arg, "--interactive") == 0) || (strcmp(arg, "-i") == 0))
332+
else if ((utils::is_same_command_option(arg, "--interactive")) || (strcmp(arg, "-i") == 0))
353333
{
354334
args.interactive = true;
355335
}
356-
else if (is_same_command_option(arg, "--multi"))
336+
else if (utils::is_same_command_option(arg, "--multi"))
357337
{
358338
args.multi_line = true;
359339
}
360-
else if (is_same_command_option(arg, "--hide_banner"))
340+
else if (utils::is_same_command_option(arg, "--hide_banner"))
361341
{
362342
args.show_banner = false;
363343
}
@@ -370,7 +350,7 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
370350
handle_flag(rerank_rewrite)
371351
handle_flag(moe_on_cpu)
372352
handle_flag(detect_thoughts)
373-
else if (is_same_command_option(arg, "--format"))
353+
else if (utils::is_same_command_option(arg, "--format"))
374354
{
375355
c++;
376356
if (c < argc)
@@ -383,7 +363,7 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
383363
args.format = chatllm::ChatFormat::CHAT;
384364
}
385365
}
386-
else if (is_same_command_option(arg, "--save_session"))
366+
else if (utils::is_same_command_option(arg, "--save_session"))
387367
{
388368
c++;
389369
if (c + 1 < argc)
@@ -393,15 +373,15 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
393373
c++;
394374
}
395375
}
396-
else if (is_same_command_option(arg, "--kv"))
376+
else if (utils::is_same_command_option(arg, "--kv"))
397377
{
398378
while (c + 2 < argc)
399379
{
400380
args.additional.insert_or_assign(argv[c + 1], argv[c + 2]);
401381
c += 2;
402382
}
403383
}
404-
else if (is_same_command_option(arg, "--vector_store"))
384+
else if (utils::is_same_command_option(arg, "--vector_store"))
405385
{
406386
c++;
407387
if (c < argc)
@@ -414,7 +394,7 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
414394
args.vector_stores.at(args.cur_vs_name).push_back(argv[c]);
415395
}
416396
}
417-
else if (is_same_command_option(arg, "--thought_tags"))
397+
else if (utils::is_same_command_option(arg, "--thought_tags"))
418398
{
419399
if (c + 2 < argc)
420400
{
@@ -424,7 +404,7 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
424404
args.detect_thoughts = true;
425405
}
426406
}
427-
else if (is_same_command_option(arg, "--multimedia_file_tags"))
407+
else if (utils::is_same_command_option(arg, "--multimedia_file_tags"))
428408
{
429409
if (c + 2 < argc)
430410
{
@@ -433,23 +413,23 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
433413
c += 2;
434414
}
435415
}
436-
else if (is_same_command_option(arg, "--set"))
416+
else if (utils::is_same_command_option(arg, "--set"))
437417
{
438418
if (c + 2 < argc)
439419
{
440420
args.additional[argv[c + 1]] = argv[c + 2];
441421
c += 2;
442422
}
443423
}
444-
else if (is_same_command_option(arg, "-mgl") || is_same_command_option(arg, "--model_gpu_layers"))
424+
else if (utils::is_same_command_option(arg, "-mgl") || utils::is_same_command_option(arg, "--model_gpu_layers"))
445425
{
446426
if (c + 2 < argc)
447427
{
448428
args.model_n_gpu_layers[argv[c + 1]] = argv[c + 2];
449429
c += 2;
450430
}
451431
}
452-
else if (is_same_command_option(arg, "-ngl") || is_same_command_option(arg, "--n_gpu_layers"))
432+
else if (utils::is_same_command_option(arg, "-ngl") || utils::is_same_command_option(arg, "--n_gpu_layers"))
453433
{
454434
c++;
455435
if (c < argc)

0 commit comments

Comments
 (0)