Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 111 additions & 45 deletions src/core/src/runtime/string_aligned_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,64 +4,130 @@

#include "openvino/runtime/string_aligned_buffer.hpp"

#include <limits>
#include <numeric>

#include "openvino/core/type/element_type.hpp"
#include "openvino/runtime/aligned_buffer.hpp"
#include "openvino/util/common_util.hpp"


namespace {
void aux_unpack_string_tensor(const char* data, size_t size, std::shared_ptr<ov::StringAlignedBuffer>& string_buffer) {
// unpack string tensor
// packed format is the following:
// <num_string>, <1st string offset>,..., <nth string offset>, <1st string raw format>,..., <nth string raw format>
// check the format of the input bitstream representing the string tensor
OPENVINO_ASSERT(size >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
const int32_t* pindices = reinterpret_cast<const int32_t*>(data);
int32_t num_strings = pindices[0];
OPENVINO_ASSERT(int32_t(size) >= 4 + 4 + 4 * num_strings,
"Incorrect packed string tensor format: the packed string tensor must contain first "
"string offset and end indices");
const int32_t* begin_ids = pindices + 1;
const int32_t* end_ids = pindices + 2;
const char* symbols = reinterpret_cast<const char*>(pindices + 2 + num_strings);

// allocate StringAlignedBuffer to store unpacked strings in std::string objects
void aux_unpack_string_tensor(const char* const data,
const size_t size,
std::shared_ptr<ov::StringAlignedBuffer>& string_buffer) {
// Packed format is the following:
// <strings_count>, <1st string offset>,..., <nth string offset>, <1st string raw contents>,..., <nth string raw
// contents>

using header_element_t = int32_t; // Type of a single element in the header (strings_count and offsets)

static_assert(sizeof(header_element_t) <= sizeof(size_t),
"Header element type must be able to represent offsets and number of strings as size_t");

OPENVINO_ASSERT(size >= sizeof(header_element_t),
"Incorrect packed string tensor format: no strings count in the packed string tensor");

const auto header = reinterpret_cast<const header_element_t*>(data);
const auto strings_count_signed = header[0];
OPENVINO_ASSERT(strings_count_signed >= 0, "Incorrect packed string tensor format: negative number of strings");

const auto strings_count = static_cast<size_t>(strings_count_signed);

constexpr size_t strings_count_elements = 1; // Header size occupied by strings_count
constexpr size_t last_end_elements = 1; // Header size occupied by last end offset

const size_t header_elems =
strings_count + (strings_count == 0 ? strings_count_elements : strings_count_elements + last_end_elements);

constexpr size_t element_size = sizeof(header_element_t);

size_t header_size = 0;
const bool is_overflow = ov::util::mul_overflow(header_elems, element_size, header_size);
OPENVINO_ASSERT(!is_overflow, "Incorrect packed string tensor format: header size overflow detected");

OPENVINO_ASSERT(header_size <= size, "Incorrect packed string tensor format: header exceeds provided buffer size");

const auto data_region_size = size - header_size;

// Allocate StringAlignedBuffer to store unpacked strings in std::string objects
// SharedBuffer to read byte stream is not applicable because we need unpacked format for strings
string_buffer = std::make_shared<ov::StringAlignedBuffer>(
num_strings,
ov::element::string.size() * num_strings,
64, // host alignment used the same as in creation of buffer for Constant
true);
std::string* src_strings = static_cast<std::string*>(string_buffer->get_ptr());
for (int32_t idx = 0; idx < num_strings; ++idx) {
src_strings[idx] = std::string(symbols + begin_ids[idx], symbols + end_ids[idx]);
constexpr size_t alignment = 64; // host alignment used the same as in creation of buffer for Constant
constexpr bool initialize = true; // initialize std::string objects to be able to assign to them later
string_buffer = std::make_shared<ov::StringAlignedBuffer>(strings_count,
ov::element::string.size() * strings_count,
alignment,
initialize);

std::string* strings = static_cast<std::string*>(string_buffer->get_ptr());

const header_element_t* begin_offsets = header + 1;
const header_element_t* end_offsets = header + 2;
const char* const data_region = reinterpret_cast<const char*>(header + header_elems);

for (size_t idx = 0; idx < strings_count; ++idx, ++strings, ++begin_offsets, ++end_offsets) {
const auto begin_signed = *begin_offsets;
const auto end_signed = *end_offsets;

OPENVINO_ASSERT(begin_signed >= 0 && end_signed >= 0,
"Incorrect packed string tensor format: negative string offset in the packed string tensor");

OPENVINO_ASSERT(begin_signed <= end_signed,
"Incorrect packed string tensor format: begin offset greater than end offset");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

                        "Incorrect packed string tensor format: begin offset greater than end offset");

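The message "begin offset greater than end offset" gives no additional information, because the failed condition itself is already serialized into the assertion output.
Optional
Always keep the number and length of assert messages as small as possible (while still explaining the error), so that nothing is added that is not usually needed.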


const size_t begin = static_cast<size_t>(begin_signed);
const size_t end = static_cast<size_t>(end_signed);

OPENVINO_ASSERT(end <= data_region_size,
"Incorrect packed string tensor format: string offset exceeds buffer bounds");

strings->assign(data_region + begin, data_region + end);
}
}

void aux_get_header(const std::shared_ptr<ov::StringAlignedBuffer>& string_aligned_buffer_ptr,
std::shared_ptr<uint8_t>& header,
std::shared_ptr<uint8_t>& data,
size_t& header_size) {
OPENVINO_ASSERT(string_aligned_buffer_ptr, "StringAlignedBuffer pointer is nullptr");
// packed format is the following:
// <num_string>, <1st string offset>,..., <nth string offset>, <1st string raw format>,..., <nth rawformat>
auto num_elements = string_aligned_buffer_ptr->get_num_elements();
auto strings = reinterpret_cast<std::string*>(string_aligned_buffer_ptr->get_ptr());

// first run over all elements: calculate total memory required to hold all strings
header_size = sizeof(int32_t) * (1 + 1 + num_elements);
header = std::shared_ptr<uint8_t>(new uint8_t[header_size], std::default_delete<uint8_t[]>());

int32_t* pindices = reinterpret_cast<int32_t*>(header.get());
pindices[0] = int32_t(num_elements);
pindices[1] = 0;
pindices += 2;
size_t current_symbols_pos = 0;

for (size_t ind = 0; ind < num_elements; ++ind) {
const auto& str = strings[ind];
current_symbols_pos += str.size();
*pindices = int32_t(current_symbols_pos);
++pindices;
// Packed format is the following:
// <strings_count>, <1st string offset>,..., <nth string offset>, <1st string raw contents>,..., <nth string raw
// contents>
using header_element_t = int32_t; // Type of a single element in the header (strings_count and offsets)

const auto strings_count = string_aligned_buffer_ptr->get_num_elements();

constexpr size_t strings_count_elements = 1; // Header size occupied by strings_count
constexpr size_t last_end_elements = 1; // Header size occupied by last end offset

OPENVINO_ASSERT(strings_count <= std::numeric_limits<size_t>::max() - strings_count_elements - last_end_elements,
"Too many strings: header element count overflow");

const size_t header_elems =
strings_count + (strings_count == 0 ? strings_count_elements : strings_count_elements + last_end_elements);

constexpr size_t element_size = sizeof(header_element_t);

size_t header_size_bytes = 0;
const bool is_overflow = ov::util::mul_overflow(header_elems, element_size, header_size_bytes);
OPENVINO_ASSERT(!is_overflow, "Too many strings: header size overflow detected");

header_size = header_size_bytes;
data = std::shared_ptr<uint8_t>(new uint8_t[header_size], std::default_delete<uint8_t[]>());

header_element_t* header = reinterpret_cast<header_element_t*>(data.get());
header[0] = static_cast<header_element_t>(strings_count);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both strings_count (from get_num_elements()) and current_string_end are size_t values cast to int32_t via static_cast without overflow validation. If either value exceeds INT32_MAX, this is signed integer overflow — undefined behavior in C++. Since this is a security-focused PR, the packing side should also be hardened:

OPENVINO_ASSERT(strings_count <= static_cast<size_t>(std::numeric_limits<header_element_t>::max()),
"Too many strings to pack: strings_count exceeds int32_t range");
Similarly, current_string_end should be validated before each cast:

OPENVINO_ASSERT(current_string_end <= static_cast<size_t>(std::numeric_limits<header_element_t>::max()),
"Cumulative string size exceeds int32_t range");


if (strings_count > 0) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: what is the wanted buffer shape in case of zero strings provided?

Do we still want to create and fill the tail (i.e. the begin and end offsets of a string that doesn't exist)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a Constant has shape [2] and its string count is zero, the constant will be incorrect, because the shape and the number of data elements do not match.

It looks like some kind of error case

header[1] = 0;
header += 2;
size_t current_string_end = 0;

auto strings = reinterpret_cast<std::string*>(string_aligned_buffer_ptr->get_ptr());

for (size_t idx = 0; idx < strings_count; ++idx, ++header, ++strings) {
current_string_end += strings->size();
*header = static_cast<header_element_t>(current_string_end);
}
}
}

Expand Down
179 changes: 179 additions & 0 deletions src/core/tests/string_unpack_tensor_test.cpp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests can be put into the string_align_buffer_test.cpp file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually this test file grew a little. Is it OK if I keep it separate?

I'll move content to string_align_buffer_test.cpp if you say so ;)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is better to keep all of them in the same file.

Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

#include "common_test_utils/test_assertions.hpp"
#include "openvino/runtime/string_aligned_buffer.hpp"

using header_element_t = int32_t; // Type of a single element in the header (strings_count and offsets)

namespace {
Comment on lines +10 to +12
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move all inside ov::test


std::vector<uint8_t> pack_string_tensor(const std::vector<header_element_t>& header,
const std::vector<uint8_t>& strings = {}) {
std::vector<uint8_t> tensor;
tensor.reserve(header.size() * sizeof(header_element_t) + strings.size());
for (const auto value : header) {
const uint8_t* characters = reinterpret_cast<const uint8_t*>(&value);
tensor.insert(tensor.end(), characters, characters + sizeof(header_element_t));
}
tensor.insert(tensor.end(), strings.begin(), strings.end());
return tensor;
}

/// @brief Overwrites the first header element of a packed tensor in place.
/// @param tensor         Packed tensor bytes. No size check is performed, so the caller must provide at least
///                       sizeof(header_element_t) bytes.
/// @param elements_count Value whose in-memory bytes replace the first header element.
void tamper_with_elements_count(std::vector<uint8_t>& tensor, const header_element_t elements_count) {
    constexpr size_t count_offset = 0;  // byte position of the element being overwritten
    const auto* const source = reinterpret_cast<const uint8_t*>(&elements_count);
    std::copy_n(source, sizeof(header_element_t), tensor.begin() + count_offset);
}

} // namespace

namespace ov {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
namespace ov {
namespace ov::test {


/// @brief A packed tensor whose header declares zero strings unpacks to an empty buffer.
/// Packed input: header = [strings_count = 0], no offsets and no string data.
TEST(StringUnpackTensorTest, ZeroNumberOfStringsYieldsEmptyBuffer) {
    constexpr header_element_t strings_count = 0;

    const auto tensor = pack_string_tensor(std::vector<header_element_t>{strings_count});
    const auto result = AttributeAdapter<std::shared_ptr<StringAlignedBuffer>>::unpack_string_tensor(
        reinterpret_cast<const char*>(tensor.data()),
        tensor.size());

    // Report a test failure instead of dereferencing a null pointer if no buffer was produced.
    ASSERT_NE(result, nullptr);
    // Unsigned literal avoids a signed/unsigned comparison inside the gtest comparison helper.
    EXPECT_EQ(result->get_num_elements(), 0u);
}

/// @brief Test case for missing number of strings in packed string tensor.
/// Expecting AssertFailure with message about missing strings count.
/// num_strings = <missing>
Comment on lines +50 to +52
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These comments are not required.

TEST(StringUnpackTensorTest, MissingNumberOfStringsFails) {
    using Unpacker = AttributeAdapter<std::shared_ptr<StringAlignedBuffer>>;

    // Two payload bytes and an empty header: the stream is shorter than one header element.
    const std::vector<uint8_t> payload = {'0', '1'};
    const auto packed = pack_string_tensor(std::vector<header_element_t>{}, payload);
    const auto* const bytes = reinterpret_cast<const char*>(packed.data());

    OV_EXPECT_THROW(Unpacker::unpack_string_tensor(bytes, packed.size()),
                    AssertFailure,
                    testing::HasSubstr("no strings count in the packed string tensor"));
}

/// @brief A negative strings count in the header must be rejected.
/// Packed input: header = [strings_count = -1].
TEST(StringUnpackTensorTest, NegativeNumberOfStringsFails) {
    using Unpacker = AttributeAdapter<std::shared_ptr<StringAlignedBuffer>>;

    constexpr header_element_t negative_count = -1;

    const auto packed = pack_string_tensor(std::vector<header_element_t>{negative_count});
    const auto* const bytes = reinterpret_cast<const char*>(packed.data());

    OV_EXPECT_THROW(Unpacker::unpack_string_tensor(bytes, packed.size()),
                    AssertFailure,
                    testing::HasSubstr("negative number of strings"));
}

/// @brief A strings count so large that the header byte size cannot be represented must be rejected.
/// Skipped on platforms where the header size calculation does not overflow.
/// Packed input: a valid single-string tensor whose strings count is overwritten with the maximum header value.
TEST(StringUnpackTensorTest, HeaderSizeOverflowFails) {
    using Unpacker = AttributeAdapter<std::shared_ptr<StringAlignedBuffer>>;

    constexpr auto huge_count = std::numeric_limits<header_element_t>::max();

    if (huge_count < std::numeric_limits<size_t>::max() / sizeof(header_element_t)) {
        GTEST_SKIP() << "Header size calculation does not overflow on this platform";
    }

    // Start from a well-formed tensor and then corrupt only the strings count.
    constexpr header_element_t real_count = 1;
    const std::vector<uint8_t> payload = {'0', '1', '2', '3', '4'};
    auto packed = pack_string_tensor(std::vector<header_element_t>{real_count, 0, 5}, payload);
    tamper_with_elements_count(packed, huge_count);

    const auto* const bytes = reinterpret_cast<const char*>(packed.data());
    OV_EXPECT_THROW(Unpacker::unpack_string_tensor(bytes, packed.size()),
                    AssertFailure,
                    testing::HasSubstr("header size overflow detected"));
}

/// @brief A strings count that implies a header larger than the whole buffer must be rejected.
/// Packed input: header = [2, 0, end0=3, end1=5] with 5 bytes of string data, then the strings count is
/// overwritten with 10.
TEST(StringUnpackTensorTest, HeaderSizeBeyondBufferFails) {
    using Unpacker = AttributeAdapter<std::shared_ptr<StringAlignedBuffer>>;

    constexpr header_element_t real_count = 2;
    constexpr header_element_t claimed_count = 10;

    // Start from a well-formed tensor and then corrupt only the strings count.
    const std::vector<uint8_t> payload = {'0', '1', '2', '3', '4'};
    auto packed = pack_string_tensor(std::vector<header_element_t>{real_count, 0, 3, 5}, payload);
    tamper_with_elements_count(packed, claimed_count);

    const auto* const bytes = reinterpret_cast<const char*>(packed.data());
    OV_EXPECT_THROW(Unpacker::unpack_string_tensor(bytes, packed.size()),
                    AssertFailure,
                    testing::HasSubstr("header exceeds provided buffer size"));
}

/// @brief Test case for negative offsets in packed string tensor.
/// Expecting AssertFailure with message about negative string offset.
/// num_strings = 2, header: [2, 0, end0=-3, end1=5]
TEST(StringUnpackTensorTest, NegativeOffsetsFails) {
using testing::HasSubstr;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This using-declaration can be written once for the whole file.


constexpr auto strings_count = 2;

const std::vector<uint8_t> strings = {'0', '1', '2', '3', '4'};
const auto tensor = pack_string_tensor(std::vector<header_element_t>{strings_count, 0, -3, 5}, strings);
OV_EXPECT_THROW(AttributeAdapter<std::shared_ptr<StringAlignedBuffer>>::unpack_string_tensor(
reinterpret_cast<const char*>(tensor.data()),
tensor.size()),
AssertFailure,
HasSubstr("negative string offset in the packed string tensor"));
}

/// @brief Offsets that decrease from one string to the next must be rejected.
/// Packed input: header = [2, 0, end0=5, end1=3] with 5 bytes of string data.
TEST(StringUnpackTensorTest, DecreasingOffsetsFails) {
    using Unpacker = AttributeAdapter<std::shared_ptr<StringAlignedBuffer>>;

    constexpr header_element_t count = 2;

    const std::vector<uint8_t> payload = {'0', '1', '2', '3', '4'};
    const auto packed = pack_string_tensor(std::vector<header_element_t>{count, 0, 5, 3}, payload);
    const auto* const bytes = reinterpret_cast<const char*>(packed.data());

    OV_EXPECT_THROW(Unpacker::unpack_string_tensor(bytes, packed.size()),
                    AssertFailure,
                    testing::HasSubstr("begin offset greater than end offset"));
}

/// @brief String offsets that point past the end of the string data must be rejected.
/// Packed input: header = [2, 0, end0=10, end1=20] with only 5 bytes of string data.
TEST(StringUnpackTensorTest, OffsetBeyondBufferFails) {
    using Unpacker = AttributeAdapter<std::shared_ptr<StringAlignedBuffer>>;

    constexpr header_element_t count = 2;

    const std::vector<uint8_t> payload = {'0', '1', '2', '3', '4'};
    const auto packed = pack_string_tensor(std::vector<header_element_t>{count, 0, 10, 20}, payload);
    const auto* const bytes = reinterpret_cast<const char*>(packed.data());

    OV_EXPECT_THROW(Unpacker::unpack_string_tensor(bytes, packed.size()),
                    AssertFailure,
                    testing::HasSubstr("string offset exceeds buffer bounds"));
}

} // namespace ov
Loading