Skip to content

Commit 28a18a4

Browse files
committed
Fix access violation exception
1 parent 247abb4 commit 28a18a4

File tree

2 files changed

+54
-6
lines changed

2 files changed

+54
-6
lines changed

operators/tokenizer/case_encoder.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,7 @@ class CaseEncoder {
9797
if (state_ == 0) {
9898
buffer_.clear();
9999
buffer_queue_.clear();
100+
buffer_queue_.reserve(32); // Reserve space to prevent reallocation during collection
100101
offset_ = 0;
101102
}
102103

operators/tokenizer/ugm_kernels.hpp

Lines changed: 53 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -449,8 +449,14 @@ struct SpmUgmTokenizer {
449449

450450
for (size_t input_offset = 0; input_offset < input_len;) {
451451
auto norm_res = NormalizePrefix(input, input_offset);
452-
for (size_t i = 0; i < norm_res.normalized_len; i++) {
453-
char c = norm_res.normalized[i];
452+
// Validate the result before using it
453+
if (norm_res.normalized == nullptr && norm_res.normalized_len > 0) {
454+
ORTX_CXX_API_THROW("[UgmTok]NormalizePrefix returned null pointer with non-zero length!", ORT_RUNTIME_EXCEPTION);
455+
}
456+
// Copy normalized data immediately to be safe
457+
std::string normalized_segment(norm_res.normalized, norm_res.normalized_len);
458+
for (size_t i = 0; i < normalized_segment.size(); i++) {
459+
char c = normalized_segment[i];
454460
if (c != ' ') {
455461
if (!processing_non_ws) {
456462
processing_non_ws = true;
@@ -521,8 +527,25 @@ struct SpmUgmTokenizer {
521527
if (longest_prefix_offset >= prefix_replacements_size_) {
522528
ORTX_CXX_API_THROW("[UgmTok]Index out of array bounds in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
523529
}
530+
// Validate that we have valid prefix_replacements_ data
531+
if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0) {
532+
ORTX_CXX_API_THROW("[UgmTok]prefix_replacements_ is null or empty!", ORT_RUNTIME_EXCEPTION);
533+
}
524534
const char* prefix_replacement = &prefix_replacements_[longest_prefix_offset];
525-
return {prefix_replacement, static_cast<int>(longest_prefix_length)};
535+
// Calculate safe length: find null terminator within remaining buffer
536+
size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
537+
size_t replacement_len = 0;
538+
while (replacement_len < max_len && prefix_replacement[replacement_len] != '\0') {
539+
replacement_len++;
540+
}
541+
if (replacement_len == max_len) {
542+
ORTX_CXX_API_THROW("[UgmTok]Replacement string not null-terminated in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
543+
}
544+
// Additional safety: if empty replacement, return original input segment
545+
if (replacement_len == 0) {
546+
return {input_view.substr(0, longest_prefix_length), static_cast<int>(longest_prefix_length)};
547+
}
548+
return {std::string_view(prefix_replacement, replacement_len), static_cast<int>(longest_prefix_length)};
526549
} else {
527550
// if yes, return this sequence unmodified
528551
size_t prefix_offset = ustring::UTF8Len(input_view[0]);
@@ -555,9 +578,12 @@ struct SpmUgmTokenizer {
555578

556579
while (!input_view.empty()) {
557580
auto p = case_encoder_->NormalizePrefix(input_view);
581+
582+
// Copy the string_view data to avoid dangling pointer issues
583+
std::string normalized_str(p.first);
558584

559-
for (size_t i = 0; i < p.first.size(); i++) {
560-
char c = p.first[i];
585+
for (size_t i = 0; i < normalized_str.size(); i++) {
586+
char c = normalized_str[i];
561587
if (c != ' ') {
562588
if (!processing_non_ws) {
563589
processing_non_ws = true;
@@ -674,8 +700,29 @@ struct SpmUgmTokenizer {
674700
if (longest_prefix_offset >= prefix_replacements_size_) {
675701
ORTX_CXX_API_THROW("[UgmTok]Index out of array bounds in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
676702
}
703+
// Validate that we have valid prefix_replacements_ data
704+
if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0) {
705+
ORTX_CXX_API_THROW("[UgmTok]prefix_replacements_ is null or empty!", ORT_RUNTIME_EXCEPTION);
706+
}
677707
const char* prefix_replacement = &prefix_replacements_[longest_prefix_offset];
678-
return {prefix_replacement, strlen(prefix_replacement), longest_prefix_length};
708+
// Calculate safe length: find null terminator within remaining buffer
709+
size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
710+
size_t replacement_len = 0;
711+
while (replacement_len < max_len && prefix_replacement[replacement_len] != '\0') {
712+
replacement_len++;
713+
}
714+
if (replacement_len == max_len) {
715+
ORTX_CXX_API_THROW("[UgmTok]Replacement string not null-terminated in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
716+
}
717+
// Additional safety: if empty replacement, return original input segment
718+
if (replacement_len == 0) {
719+
size_t prefix_offset = input_offset + ustring::UTF8Len(input[input_offset]);
720+
if (prefix_offset <= input.size()) {
721+
return {&input[input_offset], prefix_offset - input_offset, longest_prefix_length};
722+
}
723+
return {"\xEF\xBF\xBD", 3, 1};
724+
}
725+
return {prefix_replacement, replacement_len, longest_prefix_length};
679726
} else {
680727
// if yes, return this sequence unmodified
681728
size_t prefix_offset = input_offset + ustring::UTF8Len(input[input_offset]);

0 commit comments

Comments
 (0)