Skip to content

Commit d497184

Browse files
committed
Fix access violation exception
1 parent 247abb4 commit d497184

File tree

2 files changed

+46
-4
lines changed

2 files changed

+46
-4
lines changed

operators/tokenizer/case_encoder.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ class CaseEncoder {
9797
if (state_ == 0) {
9898
buffer_.clear();
9999
buffer_queue_.clear();
100+
buffer_queue_.reserve(32); // Reserve space to prevent reallocation during collection
100101
offset_ = 0;
101102
}
102103

operators/tokenizer/ugm_kernels.hpp

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -521,8 +521,25 @@ struct SpmUgmTokenizer {
521521
if (longest_prefix_offset >= prefix_replacements_size_) {
522522
ORTX_CXX_API_THROW("[UgmTok]Index out of array bounds in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
523523
}
524+
// Validate that we have valid prefix_replacements_ data
525+
if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0) {
526+
ORTX_CXX_API_THROW("[UgmTok]prefix_replacements_ is null or empty!", ORT_RUNTIME_EXCEPTION);
527+
}
524528
const char* prefix_replacement = &prefix_replacements_[longest_prefix_offset];
525-
return {prefix_replacement, static_cast<int>(longest_prefix_length)};
529+
// Calculate safe length: find null terminator within remaining buffer
530+
size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
531+
size_t replacement_len = 0;
532+
while (replacement_len < max_len && prefix_replacement[replacement_len] != '\0') {
533+
replacement_len++;
534+
}
535+
if (replacement_len == max_len) {
536+
ORTX_CXX_API_THROW("[UgmTok]Replacement string not null-terminated in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
537+
}
538+
// Additional safety: if empty replacement, return original input segment
539+
if (replacement_len == 0) {
540+
return {input_view.substr(0, longest_prefix_length), static_cast<int>(longest_prefix_length)};
541+
}
542+
return {std::string_view(prefix_replacement, replacement_len), static_cast<int>(longest_prefix_length)};
526543
} else {
527544
// if yes, return this sequence unmodified
528545
size_t prefix_offset = ustring::UTF8Len(input_view[0]);
@@ -555,9 +572,12 @@ struct SpmUgmTokenizer {
555572

556573
while (!input_view.empty()) {
557574
auto p = case_encoder_->NormalizePrefix(input_view);
575+
576+
// Copy the string_view data to avoid dangling pointer issues
577+
std::string normalized_str(p.first);
558578

559-
for (size_t i = 0; i < p.first.size(); i++) {
560-
char c = p.first[i];
579+
for (size_t i = 0; i < normalized_str.size(); i++) {
580+
char c = normalized_str[i];
561581
if (c != ' ') {
562582
if (!processing_non_ws) {
563583
processing_non_ws = true;
@@ -674,8 +694,29 @@ struct SpmUgmTokenizer {
674694
if (longest_prefix_offset >= prefix_replacements_size_) {
675695
ORTX_CXX_API_THROW("[UgmTok]Index out of array bounds in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
676696
}
697+
// Validate that we have valid prefix_replacements_ data
698+
if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0) {
699+
ORTX_CXX_API_THROW("[UgmTok]prefix_replacements_ is null or empty!", ORT_RUNTIME_EXCEPTION);
700+
}
677701
const char* prefix_replacement = &prefix_replacements_[longest_prefix_offset];
678-
return {prefix_replacement, strlen(prefix_replacement), longest_prefix_length};
702+
// Calculate safe length: find null terminator within remaining buffer
703+
size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
704+
size_t replacement_len = 0;
705+
while (replacement_len < max_len && prefix_replacement[replacement_len] != '\0') {
706+
replacement_len++;
707+
}
708+
if (replacement_len == max_len) {
709+
ORTX_CXX_API_THROW("[UgmTok]Replacement string not null-terminated in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
710+
}
711+
// Additional safety: if empty replacement, return original input segment
712+
if (replacement_len == 0) {
713+
size_t prefix_offset = input_offset + ustring::UTF8Len(input[input_offset]);
714+
if (prefix_offset <= input.size()) {
715+
return {&input[input_offset], prefix_offset - input_offset, longest_prefix_length};
716+
}
717+
return {"\xEF\xBF\xBD", 3, 1};
718+
}
719+
return {prefix_replacement, replacement_len, longest_prefix_length};
679720
} else {
680721
// if yes, return this sequence unmodified
681722
size_t prefix_offset = input_offset + ustring::UTF8Len(input[input_offset]);

0 commit comments

Comments
 (0)