Skip to content

Commit 5e1e800

Browse files
committed
SmolVLM: support video
1 parent 2011a4d commit 5e1e800

File tree

5 files changed

+214
-5
lines changed

5 files changed

+214
-5
lines changed

models/smollm.cpp

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,7 @@ namespace vlm
484484
: BaseTokenizer::BaseTokenizer(config, encoder)
485485
{
486486
sys_prompt = "";
487+
do_split = false;
487488
}
488489

489490
size_t load(tokenizer::DataReader *buffer, int n_vocab) override;
@@ -496,6 +497,7 @@ namespace vlm
496497
int fake_token_around_image_token_id;
497498
int global_image_token_token_id;
498499
int nl_token_id;
500+
bool do_split;
499501

500502
std::vector<std::vector<int>> row_col_token_ids;
501503
};
@@ -616,7 +618,7 @@ namespace vlm
616618
public:
617619
typedef llama::v2::GenericConditionalGeneration<LlamaBlock> Base;
618620
ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = ModelType::MODEL_TYPE_SMOL_VLM)
619-
: ExtendEmbedding(81 * 64),
621+
: ExtendEmbedding(4096),
620622
Base(config, runtime_config, type, config.num_key_value_heads, config.max_length, 12, false),
621623
visual(runtime_config)
622624
{
@@ -648,6 +650,17 @@ namespace vlm
648650
_chat_encoder.vit_loaded = visual.load(loader);
649651
}
650652

653+
void set_additional_args(const std::map<std::string, std::string> &args) override
654+
{
655+
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
656+
auto it = args.find("do_split");
657+
if (it == args.end()) it = args.find("do-split");
658+
if (it != args.end())
659+
{
660+
tok->do_split = it->second != "0";
661+
}
662+
}
663+
651664
void before_generate(const GenerationConfig &gen_config) override
652665
{
653666
std::vector<uint8_t> buf;
@@ -667,7 +680,42 @@ namespace vlm
667680
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
668681
append_user_opening(round_idx, ids);
669682

683+
std::vector<std::unique_ptr<vision::VideoLoader>> videos;
684+
685+
// expand video into images
686+
std::vector<ContentPiece> pieces;
670687
for (auto &piece : user.pieces)
688+
{
689+
if (piece.type != ContentPiece::Type::Video)
690+
{
691+
pieces.push_back(piece);
692+
continue;
693+
}
694+
695+
auto video = new vision::VideoLoader(piece.content.c_str(), vis_config->video.fps, vis_config->video.max_frames);
696+
videos.emplace_back(video);
697+
if (video->frames.size() < 1)
698+
continue;
699+
700+
std::ostringstream oss;
701+
oss << "You are provided the following series of " << utils::num2words((int)video->frames.size())
702+
<< " frames from a " << utils::sec2hms((float)video->frames.size() / vis_config->video.fps) << " [H:MM:SS] video.\n";
703+
pieces.emplace_back(oss.str());
704+
705+
for (size_t i = 0; i < video->frames.size(); i++)
706+
{
707+
oss.str("");
708+
oss << "\nFrame from " << utils::sec2ms(i / vis_config->video.fps);
709+
pieces.emplace_back(oss.str());
710+
711+
pieces.emplace_back(video->frames[i], ContentPiece::Type::Image);
712+
}
713+
714+
// DEFAULT_MEDIA_OUTTRO = "\n\n"
715+
pieces.emplace_back("\n\n");
716+
}
717+
718+
for (auto &piece : pieces)
671719
{
672720
if (piece.type == ContentPiece::Type::Text)
673721
{
@@ -688,7 +736,7 @@ namespace vlm
688736
int splits_cols_num = 0;
689737
int splits_rows_num = 0;
690738

691-
vision::image_load_split(piece.content.c_str(), splits, image_size_per_split, image_size_per_split, splits_cols_num, splits_rows_num);
739+
vision::image_load_split(piece.content.c_str(), splits, tok->do_split, image_size_per_split, image_size_per_split, splits_cols_num, splits_rows_num);
692740

693741
std::vector<float> scaled;
694742

src/basics.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,5 +73,10 @@ namespace utils
7373

7474
std::string load_file(const char *fn);
7575

76+
// Spells out `value` in English words, e.g. 0 -> "zero"; negative values are prefixed with "minus ".
std::string num2words(int value);
77+
78+
// Formats a duration in seconds as "H:MM:SS"; with `show_ms` a six-digit fractional part (".uuuuuu") is appended.
std::string sec2hms(float seconds, bool show_ms = false);
79+
// Formats a duration in seconds as "MM:SS" (minutes are not wrapped at 60); with `show_ms` a six-digit fractional part is appended.
std::string sec2ms(float seconds, bool show_ms = false);
80+
7681
//#define TIME_STAMP (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count())
7782
}

src/vectorstore.cpp

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,4 +468,107 @@ namespace utils
468468

469469
return buffer.str();
470470
}
471+
472+
// Spells out a strictly positive magnitude in English words.
//
// Works on a 64-bit value so that the magnitude of the most negative `int`
// is representable. Groups (billion / million / thousand) are separated by
// ", " and a hundreds digit is joined to the remainder with " and ".
static std::string num2words_magnitude(long long value)
{
    static const char *const units[] = {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"};
    static const char *const teens[] = {"ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"};
    static const char *const tens[] = {"", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"};

    struct Scale
    {
        long long base;
        const char *suffix;
    };
    static const Scale scales[] = {
        {1000000000LL, " billion"},
        {1000000LL, " million"},
        {1000LL, " thousand"},
    };

    std::string result;

    // Peel off the large groups from most to least significant.
    for (const Scale &scale : scales)
    {
        if (value < scale.base)
            continue;

        result += num2words_magnitude(value / scale.base) + scale.suffix;
        value %= scale.base;
        if (value > 0)
            result += ", ";
    }

    if (value >= 100)
    {
        result += units[value / 100];
        result += " hundred";
        value %= 100;
        if (value > 0)
            result += " and ";
    }

    if (value >= 20)
    {
        result += tens[value / 10];
        value %= 10;
        if (value > 0)
        {
            result += "-";
            result += units[value];
        }
    }
    else if (value >= 10)
    {
        result += teens[value - 10];
    }
    else if (value >= 1)
    {
        result += units[value];
    }

    return result;
}

// Spells out `value` in English words.
//
// Returns "zero" for 0 and prefixes negative values with "minus ".
// The magnitude is computed in 64 bits: negating the most negative `int`
// directly is signed overflow (undefined behaviour).
std::string num2words(int value)
{
    if (value == 0)
        return "zero";

    const long long wide = value;
    if (wide < 0)
        return "minus " + num2words_magnitude(-wide);

    return num2words_magnitude(wide);
}
544+
545+
// Formats a duration given in seconds as "H:MM:SS".
//
// seconds: duration; the integer part is split into hours, minutes, seconds.
// show_ms: when true, the fractional part is appended as ".uuuuuu". Despite
//          the parameter name the fraction is rendered with six digits
//          (microsecond resolution) and is truncated, not rounded.
std::string sec2hms(float seconds, bool show_ms)
{
    int sec = (int)seconds;
    const int micros = (int)((seconds - sec) * 1000000);
    int min = sec / 60;
    const int hh = min / 60;
    sec %= 60;
    min %= 60;

    char s[100];
    // snprintf keeps the write bounded by the buffer size.
    if (show_ms)
        std::snprintf(s, sizeof(s), "%d:%02d:%02d.%06d", hh, min, sec, micros);
    else
        std::snprintf(s, sizeof(s), "%d:%02d:%02d", hh, min, sec);
    return s;
}
560+
561+
// Formats a duration given in seconds as "MM:SS".
//
// seconds: duration; minutes are not wrapped at 60, so long durations print
//          more than two minute digits (e.g. 6000 s -> "100:00").
// show_ms: when true, the fractional part is appended as ".uuuuuu". Despite
//          the parameter name the fraction is rendered with six digits
//          (microsecond resolution) and is truncated, not rounded.
std::string sec2ms(float seconds, bool show_ms)
{
    int sec = (int)seconds;
    const int micros = (int)((seconds - sec) * 1000000);
    const int min = sec / 60;
    sec %= 60;

    char s[100];
    // snprintf keeps the write bounded by the buffer size.
    if (show_ms)
        std::snprintf(s, sizeof(s), "%02d:%02d.%06d", min, sec, micros);
    else
        std::snprintf(s, sizeof(s), "%02d:%02d", min, sec);
    return s;
}
471574
}

src/vision_process.cpp

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#include <regex>
33
#include <sstream>
44
#include <cmath>
5+
#include <filesystem>
6+
#include "basics.h"
57

68
#if defined(_MSC_VER)
79
#define popen _popen
@@ -10,6 +12,8 @@
1012

1113
namespace vision
1214
{
15+
namespace fs = std::filesystem;
16+
1317
struct Params
1418
{
1519
int pre_max_width;
@@ -186,7 +190,7 @@ namespace vision
186190
pclose(pp);
187191
}
188192

189-
void image_load_split(const char *fn, std::vector<image_pixels_t> &splits, const int split_width, const int split_height, int &splits_cols_num, int &splits_rows_num)
193+
void image_load_split(const char *fn, std::vector<image_pixels_t> &splits, bool do_split, const int split_width, const int split_height, int &splits_cols_num, int &splits_rows_num)
190194
{
191195
splits.clear();
192196
splits_cols_num = 0;
@@ -204,7 +208,7 @@ namespace vision
204208

205209
splits_rows_num =(height + split_height - 1) / split_height;
206210
splits_cols_num =(width + split_width - 1) / split_width;
207-
if ((splits_rows_num > 1) || (splits_cols_num > 1))
211+
if (do_split && ((splits_rows_num > 1) || (splits_cols_num > 1)))
208212
{
209213
const int optimal_height = (height + splits_rows_num - 1) / splits_rows_num;
210214
const int optimal_width = (width + splits_cols_num - 1) / splits_cols_num;
@@ -444,6 +448,44 @@ namespace vision
444448
}
445449
}
446450

451+
// Extracts still frames from the video file `fn` by shelling out to `ffmpeg`.
//
// fn:            path of the video; if it does not exist nothing is done and
//                `frames` stays empty.
// fps:           sampling rate passed to ffmpeg's `fps` filter.
// max_frames:    upper bound passed as `-frames:v`.
// resize_width,
// resize_height: when BOTH are positive, frames are scaled (keeping aspect
//                ratio) and padded to exactly this size; otherwise frames keep
//                their original size.
//
// Frames are written as "%04d.jpg" into a fresh temporary directory and their
// paths are stored, sorted, in `frames`.
VideoLoader::VideoLoader(const char *fn, float fps, const int max_frames, const int resize_width, const int resize_height)
{
    if (!fs::exists(fn))
        return;

    // NOTE(review): std::tmpnam is race-prone (the name can be taken between
    // generation and creation); kept to avoid new dependencies.
    tmp_dir = std::tmpnam(nullptr);
    fs::create_directories(tmp_dir);

    const std::string frame_pattern = (fs::path(tmp_dir) / "%04d.jpg").string();

    // Build the command in a growable string: a fixed-size buffer can be
    // overrun by a long input path.
    // NOTE(review): `fn` is interpolated into a shell command line; a path
    // containing `"` or shell metacharacters will break or alter the command.
    std::ostringstream cmd;
    cmd.imbue(std::locale::classic()); // no digit grouping, '.' as decimal point
    cmd << std::fixed;                 // same rendering as printf's "%f"

    cmd << "ffmpeg -loglevel error -i \"" << fn << "\" -vf \"";
    if ((resize_width > 0) && (resize_height > 0))
    {
        cmd << "scale=w=" << resize_width << ":h=" << resize_height << ":force_original_aspect_ratio=1,"
            << "pad=" << resize_width << ":" << resize_height << ":(ow-iw)/2:(oh-ih)/2,";
    }
    cmd << "fps=" << fps << "\" -frames:v " << max_frames << " \"" << frame_pattern << "\"";

    // Best effort: if ffmpeg fails, whatever frames were produced (possibly
    // none) are still collected below.
    const int ret = std::system(cmd.str().c_str());
    (void)ret;

    for (const auto &entry : fs::directory_iterator(tmp_dir))
    {
        if (entry.path().extension() == ".jpg")
            frames.push_back(entry.path().string());
    }

    // Zero-padded file names make lexicographic order equal to frame order.
    std::sort(frames.begin(), frames.end());
}
483+
484+
// Removes the temporary directory holding the extracted frames.
VideoLoader::~VideoLoader()
{
    // tmp_dir is still empty when the constructor returned early because the
    // video file did not exist; there is nothing to clean up then.
    if (tmp_dir.empty())
        return;

    // Use the non-throwing overload: an exception escaping a destructor
    // terminates the program. Cleanup is best effort.
    std::error_code ec;
    fs::remove_all(tmp_dir, ec);
}
488+
447489
static void print_data(const std::vector<float> &pixels, const int group_size = 10, int max_elem = 100)
448490
{
449491
const size_t cnt = pixels.size();

src/vision_process.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,21 @@ namespace vision
9797

9898
void image_dimension(const char *fn, int &width, int &height);
9999
void image_load(const char *fn, std::vector<uint8_t> &rgb_pixels, int &width, int &height, int patch_size, PaddingMode pad = PaddingMode::No);
100-
void image_load_split(const char *fn, std::vector<image_pixels_t> &splits, const int split_width, const int split_height, int &splits_cols_num, int &splits_rows_num); // splits are in natural order
100+
void image_load_split(const char *fn, std::vector<image_pixels_t> &splits, bool do_split, const int split_width, const int split_height, int &splits_cols_num, int &splits_rows_num); // splits are in natural order
101101
void image_rescale(const std::vector<uint8_t> &rgb_pixels, std::vector<float> &scaled_rgb_pixels, float scale_factor = 1/255.0f);
102102
void image_normalize(std::vector<float> &rgb_pixels, const float *mean, const float *std_d);
103103

104+
// Extracts frames of a video into image files inside a temporary directory.
//
// The directory is created by the constructor and deleted by the destructor,
// so the paths in `frames` are only valid while the loader is alive.
class VideoLoader
{
public:
    // fn:            path of the video file.
    // fps:           frame sampling rate.
    // max_frames:    maximum number of frames to extract.
    // resize_width,
    // resize_height: target frame size; non-positive values keep the original size.
    VideoLoader(const char *fn, float fps = 1.0f, const int max_frames = 10, const int resize_width = -1, const int resize_height = -1);
    ~VideoLoader();

    // Each instance owns its temporary directory; a copy would delete the
    // frames out from under the original when destroyed.
    VideoLoader(const VideoLoader &) = delete;
    VideoLoader &operator=(const VideoLoader &) = delete;
public:
    // Paths of the extracted frame images, in frame order.
    std::vector<std::string> frames;
private:
    // Directory that holds the extracted frames; empty if nothing was extracted.
    std::string tmp_dir;
};
114+
104115
// ASSUMPTION: already properly aligned to `patch_size`
105116
void image_arrange(const std::vector<float> &rgb_pixels, const int width, const int patch_size,
106117
std::vector<float> &arranged, const PatchesFormat fmt);

0 commit comments

Comments
 (0)