@@ -521,8 +521,25 @@ struct SpmUgmTokenizer {
521521 if (longest_prefix_offset >= prefix_replacements_size_) {
522522 ORTX_CXX_API_THROW (" [UgmTok]Index out of array bounds in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
523523 }
524+ // Validate that we have valid prefix_replacements_ data
525+ if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0 ) {
526+ ORTX_CXX_API_THROW (" [UgmTok]prefix_replacements_ is null or empty!" , ORT_RUNTIME_EXCEPTION);
527+ }
524528 const char * prefix_replacement = &prefix_replacements_[longest_prefix_offset];
525- return {prefix_replacement, static_cast <int >(longest_prefix_length)};
529+ // Calculate safe length: find null terminator within remaining buffer
530+ size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
531+ size_t replacement_len = 0 ;
532+ while (replacement_len < max_len && prefix_replacement[replacement_len] != ' \0 ' ) {
533+ replacement_len++;
534+ }
535+ if (replacement_len == max_len) {
536+ ORTX_CXX_API_THROW (" [UgmTok]Replacement string not null-terminated in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
537+ }
538+ // Additional safety: if empty replacement, return original input segment
539+ if (replacement_len == 0 ) {
540+ return {input_view.substr (0 , longest_prefix_length), static_cast <int >(longest_prefix_length)};
541+ }
542+ return {std::string_view (prefix_replacement, replacement_len), static_cast <int >(longest_prefix_length)};
526543 } else {
527544 // if yes, return this sequence unmodified
528545 size_t prefix_offset = ustring::UTF8Len (input_view[0 ]);
@@ -555,9 +572,12 @@ struct SpmUgmTokenizer {
555572
556573 while (!input_view.empty ()) {
557574 auto p = case_encoder_->NormalizePrefix (input_view);
575+
576+ // Copy the string_view data to avoid dangling pointer issues
577+ std::string normalized_str (p.first );
558578
559- for (size_t i = 0 ; i < p. first .size (); i++) {
560- char c = p. first [i];
579+ for (size_t i = 0 ; i < normalized_str .size (); i++) {
580+ char c = normalized_str [i];
561581 if (c != ' ' ) {
562582 if (!processing_non_ws) {
563583 processing_non_ws = true ;
@@ -674,8 +694,29 @@ struct SpmUgmTokenizer {
674694 if (longest_prefix_offset >= prefix_replacements_size_) {
675695 ORTX_CXX_API_THROW (" [UgmTok]Index out of array bounds in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
676696 }
697+ // Validate that we have valid prefix_replacements_ data
698+ if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0 ) {
699+ ORTX_CXX_API_THROW (" [UgmTok]prefix_replacements_ is null or empty!" , ORT_RUNTIME_EXCEPTION);
700+ }
677701 const char * prefix_replacement = &prefix_replacements_[longest_prefix_offset];
678- return {prefix_replacement, strlen (prefix_replacement), longest_prefix_length};
702+ // Calculate safe length: find null terminator within remaining buffer
703+ size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
704+ size_t replacement_len = 0 ;
705+ while (replacement_len < max_len && prefix_replacement[replacement_len] != ' \0 ' ) {
706+ replacement_len++;
707+ }
708+ if (replacement_len == max_len) {
709+ ORTX_CXX_API_THROW (" [UgmTok]Replacement string not null-terminated in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
710+ }
711+ // Additional safety: if empty replacement, return original input segment
712+ if (replacement_len == 0 ) {
713+ size_t prefix_offset = input_offset + ustring::UTF8Len (input[input_offset]);
714+ if (prefix_offset <= input.size ()) {
715+ return {&input[input_offset], prefix_offset - input_offset, longest_prefix_length};
716+ }
717+ return {" \xEF\xBF\xBD " , 3 , 1 };
718+ }
719+ return {prefix_replacement, replacement_len, longest_prefix_length};
679720 } else {
680721 // if yes, return this sequence unmodified
681722 size_t prefix_offset = input_offset + ustring::UTF8Len (input[input_offset]);
0 commit comments