Skip to content

Commit f5c6ae1

Browse files
authored
mtmd, server: add "placeholder bitmap" for counting tokens, add */input_tokens API (#23913)
* mtmd: add "placeholder bitmap" for counting tokens w/o preprocessing * fast path skip preproc for placeholder * fix build * correct the api * add server endpoint + tests * add object name * update docs * add proxy handling * fix build * fix audio input path * use is_placeholder in process_mtmd_prompt() * nits * nits (2) * docs: clarify chat/completions/input_tokens is not official * fix merge problem
1 parent 5a69c97 commit f5c6ae1

26 files changed

Lines changed: 732 additions & 422 deletions

tools/mtmd/clip-impl.h

Lines changed: 139 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "gguf.h"
55
#include "clip.h"
66

7+
#include <array>
78
#include <climits>
89
#include <cstdarg>
910
#include <cinttypes>
@@ -429,26 +430,156 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
429430

430431
// RGB uint8 image
431432
struct clip_image_u8 {
432-
int nx;
433-
int ny;
433+
clip_image_size get_size() const {
434+
return { nx, ny };
435+
}
436+
437+
void set_size(clip_image_size size, bool is_placeholder) {
438+
nx = size.width;
439+
ny = size.height;
440+
if (is_placeholder) {
441+
buf.clear();
442+
} else {
443+
buf.resize((size_t) nx * (size_t) ny * 3);
444+
}
445+
}
446+
447+
void cpy_buf(const std::vector<uint8_t> & new_buf) {
448+
buf = new_buf;
449+
}
450+
451+
const std::vector<uint8_t> & get_ro_buf() const {
452+
if (is_placeholder()) {
453+
throw std::runtime_error("this clip_image_u8 is a placeholder");
454+
}
455+
return buf;
456+
}
434457

458+
// note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation
459+
460+
bool is_placeholder() const {
461+
return buf.empty();
462+
}
463+
464+
std::array<uint8_t, 3> get_pixel(int x, int y) const {
465+
if (is_placeholder()) {
466+
// return a dummy value, so that legacy code can still process image without errors
467+
return { 0, 0, 0 };
468+
}
469+
int idx = (y * nx + x) * 3;
470+
return { buf[idx], buf[idx + 1], buf[idx + 2] };
471+
}
472+
473+
void set_pixel(int x, int y, const std::array<uint8_t, 3> & rgb) {
474+
if (is_placeholder()) {
475+
return; // no-op
476+
}
477+
int idx = (y * nx + x) * 3;
478+
buf[idx] = rgb[0];
479+
buf[idx + 1] = rgb[1];
480+
buf[idx + 2] = rgb[2];
481+
}
482+
483+
size_t n_pixels() const {
484+
return (size_t) nx * (size_t) ny;
485+
}
486+
487+
size_t n_elements() const {
488+
return n_pixels() * 3;
489+
}
490+
491+
private:
435492
std::vector<uint8_t> buf;
493+
int nx = 0;
494+
int ny = 0;
436495
};
437496

438497
// For images, buf.size() == nx*ny*3
439498
// Memory layout: RGBRGBRGB...
440499
// For audio, only one channel is used, buf.size() == nx*ny
441500
// nx will be n_frames and ny will be n_mel
442501
struct clip_image_f32 {
443-
int nx;
444-
int ny;
445-
446-
std::vector<float> buf;
447-
448502
// marks the global view in e.g., DeepSeek-OCR Models
449503
bool add_viewsep = false;
450-
// whether a learned newline token should be appended after the image (eg Granite4 Vision)
504+
// whether a learned newline (or EOI) token should be appended after the image (eg Granite4 Vision)
451505
bool add_newline = false;
506+
507+
clip_image_size get_size() const {
508+
return { nx_, ny_ };
509+
}
510+
511+
int nx() const { return nx_; }
512+
int ny() const { return ny_; }
513+
514+
void set_size(clip_image_size size, bool is_placeholder, bool is_audio) {
515+
nx_ = size.width;
516+
ny_ = size.height;
517+
if (is_placeholder) {
518+
buf.clear();
519+
} else {
520+
if (is_audio) {
521+
buf.resize((size_t) nx_ * (size_t) ny_);
522+
} else {
523+
buf.resize((size_t) nx_ * (size_t) ny_ * 3);
524+
}
525+
}
526+
}
527+
528+
void cpy_buf(const std::vector<float> & new_buf) {
529+
buf = new_buf;
530+
}
531+
532+
void from_u8(const clip_image_u8 & img) {
533+
auto size = img.get_size();
534+
nx_ = size.width;
535+
ny_ = size.height;
536+
if (img.is_placeholder()) {
537+
buf.clear();
538+
return; // no-op
539+
}
540+
buf.resize(img.n_elements());
541+
const auto & u8_buf = img.get_ro_buf();
542+
for (size_t i = 0; i < img.n_elements(); ++i) {
543+
buf[i] = (float) u8_buf[i] / 255.0f;
544+
}
545+
}
546+
547+
size_t n_pixels() const {
548+
return (size_t) nx_ * (size_t) ny_;
549+
}
550+
551+
size_t n_elements() const {
552+
return n_pixels() * 3;
553+
}
554+
555+
void normalize(const float mean[3], const float std[3]) {
556+
if (is_placeholder()) {
557+
return; // no-op
558+
}
559+
for (size_t i = 0; i < n_pixels(); ++i) {
560+
buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0];
561+
buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1];
562+
buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2];
563+
}
564+
}
565+
566+
const std::vector<float> & get_ro_buf() const {
567+
if (is_placeholder()) {
568+
throw std::runtime_error("this clip_image_f32 is a placeholder");
569+
}
570+
return buf;
571+
}
572+
573+
// note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern
574+
575+
bool is_placeholder() const {
576+
return buf.empty();
577+
}
578+
579+
private:
580+
std::vector<float> buf;
581+
int nx_ = 0;
582+
int ny_ = 0;
452583
};
453584

454585
//

0 commit comments

Comments
 (0)