|
4 | 4 | #include "gguf.h" |
5 | 5 | #include "clip.h" |
6 | 6 |
|
| 7 | +#include <array> |
7 | 8 | #include <climits> |
8 | 9 | #include <cstdarg> |
9 | 10 | #include <cinttypes> |
@@ -429,26 +430,156 @@ static projector_type clip_projector_type_from_string(const std::string & str) { |
429 | 430 |
|
430 | 431 | // RGB uint8 image |
431 | 432 | struct clip_image_u8 { |
432 | | - int nx; |
433 | | - int ny; |
| 433 | + clip_image_size get_size() const { |
| 434 | + return { nx, ny }; |
| 435 | + } |
| 436 | + |
| 437 | + void set_size(clip_image_size size, bool is_placeholder) { |
| 438 | + nx = size.width; |
| 439 | + ny = size.height; |
| 440 | + if (is_placeholder) { |
| 441 | + buf.clear(); |
| 442 | + } else { |
| 443 | + buf.resize((size_t) nx * (size_t) ny * 3); |
| 444 | + } |
| 445 | + } |
| 446 | + |
| 447 | + void cpy_buf(const std::vector<uint8_t> & new_buf) { |
| 448 | + buf = new_buf; |
| 449 | + } |
| 450 | + |
| 451 | + const std::vector<uint8_t> & get_ro_buf() const { |
| 452 | + if (is_placeholder()) { |
| 453 | + throw std::runtime_error("this clip_image_u8 is a placeholder"); |
| 454 | + } |
| 455 | + return buf; |
| 456 | + } |
434 | 457 |
|
| 458 | + // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation |
| 459 | + |
| 460 | + bool is_placeholder() const { |
| 461 | + return buf.empty(); |
| 462 | + } |
| 463 | + |
| 464 | + std::array<uint8_t, 3> get_pixel(int x, int y) const { |
| 465 | + if (is_placeholder()) { |
| 466 | + // return a dummy value, so that legacy code can still process image without errors |
| 467 | + return { 0, 0, 0 }; |
| 468 | + } |
| 469 | + int idx = (y * nx + x) * 3; |
| 470 | + return { buf[idx], buf[idx + 1], buf[idx + 2] }; |
| 471 | + } |
| 472 | + |
| 473 | + void set_pixel(int x, int y, const std::array<uint8_t, 3> & rgb) { |
| 474 | + if (is_placeholder()) { |
| 475 | + return; // no-op |
| 476 | + } |
| 477 | + int idx = (y * nx + x) * 3; |
| 478 | + buf[idx] = rgb[0]; |
| 479 | + buf[idx + 1] = rgb[1]; |
| 480 | + buf[idx + 2] = rgb[2]; |
| 481 | + } |
| 482 | + |
| 483 | + size_t n_pixels() const { |
| 484 | + return (size_t) nx * (size_t) ny; |
| 485 | + } |
| 486 | + |
| 487 | + size_t n_elements() const { |
| 488 | + return n_pixels() * 3; |
| 489 | + } |
| 490 | + |
| 491 | + private: |
435 | 492 | std::vector<uint8_t> buf; |
| 493 | + int nx = 0; |
| 494 | + int ny = 0; |
436 | 495 | }; |
437 | 496 |
|
438 | 497 | // For images, buf.size() == nx*ny*3 |
439 | 498 | // Memory layout: RGBRGBRGB... |
440 | 499 | // For audio, only one channel is used, buf.size() == nx*ny |
441 | 500 | // nx will be n_frames and ny will be n_mel |
442 | 501 | struct clip_image_f32 { |
443 | | - int nx; |
444 | | - int ny; |
445 | | - |
446 | | - std::vector<float> buf; |
447 | | - |
448 | 502 | // marks the global view in e.g., DeepSeek-OCR Models |
449 | 503 | bool add_viewsep = false; |
450 | | - // whether a learned newline token should be appended after the image (eg Granite4 Vision) |
| 504 | + // whether a learned newline (or EOI) token should be appended after the image (eg Granite4 Vision) |
451 | 505 | bool add_newline = false; |
| 506 | + |
| 507 | + clip_image_size get_size() const { |
| 508 | + return { nx_, ny_ }; |
| 509 | + } |
| 510 | + |
| 511 | + int nx() const { return nx_; } |
| 512 | + int ny() const { return ny_; } |
| 513 | + |
| 514 | + void set_size(clip_image_size size, bool is_placeholder, bool is_audio) { |
| 515 | + nx_ = size.width; |
| 516 | + ny_ = size.height; |
| 517 | + if (is_placeholder) { |
| 518 | + buf.clear(); |
| 519 | + } else { |
| 520 | + if (is_audio) { |
| 521 | + buf.resize((size_t) nx_ * (size_t) ny_); |
| 522 | + } else { |
| 523 | + buf.resize((size_t) nx_ * (size_t) ny_ * 3); |
| 524 | + } |
| 525 | + } |
| 526 | + } |
| 527 | + |
| 528 | + void cpy_buf(const std::vector<float> & new_buf) { |
| 529 | + buf = new_buf; |
| 530 | + } |
| 531 | + |
| 532 | + void from_u8(const clip_image_u8 & img) { |
| 533 | + auto size = img.get_size(); |
| 534 | + nx_ = size.width; |
| 535 | + ny_ = size.height; |
| 536 | + if (img.is_placeholder()) { |
| 537 | + buf.clear(); |
| 538 | + return; // no-op |
| 539 | + } |
| 540 | + buf.resize(img.n_elements()); |
| 541 | + const auto & u8_buf = img.get_ro_buf(); |
| 542 | + for (size_t i = 0; i < img.n_elements(); ++i) { |
| 543 | + buf[i] = (float) u8_buf[i] / 255.0f; |
| 544 | + } |
| 545 | + } |
| 546 | + |
| 547 | + size_t n_pixels() const { |
| 548 | + return (size_t) nx_ * (size_t) ny_; |
| 549 | + } |
| 550 | + |
| 551 | + size_t n_elements() const { |
| 552 | + return n_pixels() * 3; |
| 553 | + } |
| 554 | + |
| 555 | + void normalize(const float mean[3], const float std[3]) { |
| 556 | + if (is_placeholder()) { |
| 557 | + return; // no-op |
| 558 | + } |
| 559 | + for (size_t i = 0; i < n_pixels(); ++i) { |
| 560 | + buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0]; |
| 561 | + buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1]; |
| 562 | + buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2]; |
| 563 | + } |
| 564 | + } |
| 565 | + |
| 566 | + const std::vector<float> & get_ro_buf() const { |
| 567 | + if (is_placeholder()) { |
| 568 | + throw std::runtime_error("this clip_image_f32 is a placeholder"); |
| 569 | + } |
| 570 | + return buf; |
| 571 | + } |
| 572 | + |
| 573 | + // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern |
| 574 | + |
| 575 | + bool is_placeholder() const { |
| 576 | + return buf.empty(); |
| 577 | + } |
| 578 | + |
| 579 | + private: |
| 580 | + std::vector<float> buf; |
| 581 | + int nx_ = 0; |
| 582 | + int ny_ = 0; |
452 | 583 | }; |
453 | 584 |
|
454 | 585 | // |
|
0 commit comments