Skip to content

Commit 1e6ffce

Browse files
committed
partial vision processor impl.
1 parent e00b123 commit 1e6ffce

File tree

6 files changed

+495
-38
lines changed

6 files changed

+495
-38
lines changed

shared/api/image_processor.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "image_transforms.hpp"
1414
#include "image_transforms_phi_3.hpp"
1515
#include "image_transforms_mllama.hpp"
16+
#include "vision_processor_phi_4.hpp"
1617

1718
namespace ort_extensions {
1819
std::tuple<std::unique_ptr<ImageRawData[]>, size_t>
@@ -39,6 +40,7 @@ Operation::KernelRegistry ImageProcessor::kernel_registry_ = {
3940
{"Permute3D", []() { return CreateKernelInstance(&Permute3D::Compute); }},
4041
{"Phi3ImageTransform", []() { return CreateKernelInstance(phi3_hd_transform); }},
4142
{"Llama3ImageTransform", []() { return CreateKernelInstance(&Llama3ImageTransform::Compute); }},
43+
{"Llama3ImageTransform", []() { return CreateKernelInstance(&Phi4VisionProcessor::Compute); }},
4244
};
4345

4446
OrtxStatus ImageProcessor::Init(std::string_view processor_def) {

shared/api/image_transforms.hpp

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,44 @@ void DumpTensorToFile(const ortc::Tensor<T>& tensor, const char* name) {
2828
#endif
2929
}
3030

31+
template <typename T>
32+
void SplitIntoTitles(const ortc::Tensor<T>& normalized_image, ortc::Tensor<T>& pixel_values,
33+
int64_t tile_height, int64_t tile_width) {
34+
auto& shape = normalized_image.Shape();
35+
int64_t image_height = shape[0];
36+
int64_t image_width = shape[1];
37+
int64_t num_channels = shape[2];
38+
39+
const int64_t image_1c_size = tile_height * tile_width;
40+
assert(image_height % tile_height == 0);
41+
int64_t num_tiles_height = static_cast<int64_t>(image_height / tile_height);
42+
assert(image_width % tile_width == 0);
43+
int64_t num_tiles_width = static_cast<int64_t>(image_width / tile_width);
44+
45+
auto p_normalized_image = normalized_image.Data();
46+
// shape (num_tiles_width * num_tiles_height, num_channels, tile_height, tile_width)
47+
float* output_pixel =
48+
pixel_values.Allocate({num_tiles_height * num_tiles_width, num_channels, tile_height, tile_width});
49+
50+
// From (image_height, image_width, num_channels)
51+
// Permute to (num_tiles_height, num_tiles_width, num_channels, tile_height, tile_width)
52+
for (int64_t i = 0; i < num_tiles_height; ++i) {
53+
for (int64_t j = 0; j < num_tiles_width; ++j) {
54+
// convert to be channel first
55+
for (int64_t k = 0; k < num_channels; ++k) {
56+
auto sub_index = image_1c_size * (i * num_tiles_width + j) * num_channels + image_1c_size * k;
57+
for (int64_t y = 0; y < tile_height; ++y) {
58+
for (int64_t x = 0; x < tile_width; ++x) {
59+
output_pixel[sub_index + y * tile_width + x] =
60+
p_normalized_image[(i * tile_height + y) * image_width * num_channels +
61+
(j * tile_width + x) * num_channels + k];
62+
}
63+
}
64+
}
65+
}
66+
}
67+
}
68+
3169
inline OrtxStatus convert_to_rgb(const ortc::Tensor<uint8_t>& input, ortc::Tensor<uint8_t>& output) {
3270
auto& dimensions = input.Shape();
3371
if (dimensions.size() != 3ULL || dimensions[2] != 3) {
@@ -106,7 +144,6 @@ struct Resize {
106144
std::memcpy(p_output_image + c0_index, output_image->image[i] + j * 4, c);
107145
}
108146
}
109-
// DumpTensor(output);
110147

111148
ImagingDelete(output_image);
112149
return {};

shared/api/image_transforms_mllama.hpp

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,43 +11,6 @@
1111
#include "image_transforms.hpp"
1212

1313
struct Llama3ImageTransform {
14-
// Splits a normalized float image, read as (image_height, image_width, num_channels),
// into tiles of tile_height x tile_width and writes them channel-first into
// pixel_values, which is allocated here with shape
// (num_tiles_height * num_tiles_width, num_channels, tile_height, tile_width).
// Both image dimensions are asserted to be exact multiples of the tile size.
static void SplitIntoTitles(const ortc::Tensor<float>& normalized_image, ortc::Tensor<float>& pixel_values,
15-
int64_t tile_height, int64_t tile_width) {
16-
auto& shape = normalized_image.Shape();
17-
int64_t image_height = shape[0];
18-
int64_t image_width = shape[1];
19-
int64_t num_channels = shape[2];
20-
21-
const int64_t image_1c_size = tile_height * tile_width;
22-
assert(image_height % tile_height == 0);
23-
int64_t num_tiles_height = static_cast<int64_t>(image_height / tile_height);
24-
assert(image_width % tile_width == 0);
25-
int64_t num_tiles_width = static_cast<int64_t>(image_width / tile_width);
26-
27-
auto p_normalized_image = normalized_image.Data();
28-
// shape (num_tiles_width * num_tiles_height, num_channels, tile_height, tile_width)
29-
float* output_pixel =
30-
pixel_values.Allocate({num_tiles_height * num_tiles_width, num_channels, tile_height, tile_width});
31-
32-
// From (image_height, image_width, num_channels)
33-
// Permute to (num_tiles_height, num_tiles_width, num_channels, tile_height, tile_width)
34-
for (int64_t i = 0; i < num_tiles_height; ++i) {
35-
for (int64_t j = 0; j < num_tiles_width; ++j) {
36-
// convert to be channel first
37-
for (int64_t k = 0; k < num_channels; ++k) {
38-
auto sub_index = image_1c_size * (i * num_tiles_width + j) * num_channels + image_1c_size * k;
39-
for (int64_t y = 0; y < tile_height; ++y) {
40-
for (int64_t x = 0; x < tile_width; ++x) {
41-
output_pixel[sub_index + y * tile_width + x] =
42-
p_normalized_image[(i * tile_height + y) * image_width * num_channels +
43-
(j * tile_width + x) * num_channels + k];
44-
}
45-
}
46-
}
47-
}
48-
}
49-
}
50-
5114
OrtxStatus Compute(const ortc::Tensor<uint8_t>& image, ortc::Tensor<float>& pixel_values,
5215
ortc::Tensor<int64_t>& aspect_ratio_ids, ortc::Tensor<int64_t>& aspect_ratio_mask,
5316
ortc::Tensor<int64_t>& num_tiles) {

0 commit comments

Comments
 (0)