@@ -449,8 +449,14 @@ struct SpmUgmTokenizer {
449449
450450 for (size_t input_offset = 0 ; input_offset < input_len;) {
451451 auto norm_res = NormalizePrefix (input, input_offset);
452- for (size_t i = 0 ; i < norm_res.normalized_len ; i++) {
453- char c = norm_res.normalized [i];
452+ // Validate the result before using it
453+ if (norm_res.normalized == nullptr && norm_res.normalized_len > 0 ) {
454+ ORTX_CXX_API_THROW (" [UgmTok]NormalizePrefix returned null pointer with non-zero length!" , ORT_RUNTIME_EXCEPTION);
455+ }
456+ // Copy normalized data immediately to be safe
457+ std::string normalized_segment (norm_res.normalized , norm_res.normalized_len );
458+ for (size_t i = 0 ; i < normalized_segment.size (); i++) {
459+ char c = normalized_segment[i];
454460 if (c != ' ' ) {
455461 if (!processing_non_ws) {
456462 processing_non_ws = true ;
@@ -521,8 +527,25 @@ struct SpmUgmTokenizer {
521527 if (longest_prefix_offset >= prefix_replacements_size_) {
522528 ORTX_CXX_API_THROW (" [UgmTok]Index out of array bounds in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
523529 }
530+ // Validate that we have valid prefix_replacements_ data
531+ if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0 ) {
532+ ORTX_CXX_API_THROW (" [UgmTok]prefix_replacements_ is null or empty!" , ORT_RUNTIME_EXCEPTION);
533+ }
524534 const char * prefix_replacement = &prefix_replacements_[longest_prefix_offset];
525- return {prefix_replacement, static_cast <int >(longest_prefix_length)};
535+ // Calculate safe length: find null terminator within remaining buffer
536+ size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
537+ size_t replacement_len = 0 ;
538+ while (replacement_len < max_len && prefix_replacement[replacement_len] != ' \0 ' ) {
539+ replacement_len++;
540+ }
541+ if (replacement_len == max_len) {
542+ ORTX_CXX_API_THROW (" [UgmTok]Replacement string not null-terminated in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
543+ }
544+ // Additional safety: if empty replacement, return original input segment
545+ if (replacement_len == 0 ) {
546+ return {input_view.substr (0 , longest_prefix_length), static_cast <int >(longest_prefix_length)};
547+ }
548+ return {std::string_view (prefix_replacement, replacement_len), static_cast <int >(longest_prefix_length)};
526549 } else {
527550 // if yes, return this sequence unmodified
528551 size_t prefix_offset = ustring::UTF8Len (input_view[0 ]);
@@ -555,9 +578,12 @@ struct SpmUgmTokenizer {
555578
556579 while (!input_view.empty ()) {
557580 auto p = case_encoder_->NormalizePrefix (input_view);
581+
582+ // Copy the string_view data to avoid dangling pointer issues
583+ std::string normalized_str (p.first );
558584
559- for (size_t i = 0 ; i < p. first .size (); i++) {
560- char c = p. first [i];
585+ for (size_t i = 0 ; i < normalized_str .size (); i++) {
586+ char c = normalized_str [i];
561587 if (c != ' ' ) {
562588 if (!processing_non_ws) {
563589 processing_non_ws = true ;
@@ -674,8 +700,29 @@ struct SpmUgmTokenizer {
674700 if (longest_prefix_offset >= prefix_replacements_size_) {
675701 ORTX_CXX_API_THROW (" [UgmTok]Index out of array bounds in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
676702 }
703+ // Validate that we have valid prefix_replacements_ data
704+ if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0 ) {
705+ ORTX_CXX_API_THROW (" [UgmTok]prefix_replacements_ is null or empty!" , ORT_RUNTIME_EXCEPTION);
706+ }
677707 const char * prefix_replacement = &prefix_replacements_[longest_prefix_offset];
678- return {prefix_replacement, strlen (prefix_replacement), longest_prefix_length};
708+ // Calculate safe length: find null terminator within remaining buffer
709+ size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
710+ size_t replacement_len = 0 ;
711+ while (replacement_len < max_len && prefix_replacement[replacement_len] != ' \0 ' ) {
712+ replacement_len++;
713+ }
714+ if (replacement_len == max_len) {
715+ ORTX_CXX_API_THROW (" [UgmTok]Replacement string not null-terminated in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
716+ }
717+ // Additional safety: if empty replacement, return original input segment
718+ if (replacement_len == 0 ) {
719+ size_t prefix_offset = input_offset + ustring::UTF8Len (input[input_offset]);
720+ if (prefix_offset <= input.size ()) {
721+ return {&input[input_offset], prefix_offset - input_offset, longest_prefix_length};
722+ }
723+ return {" \xEF\xBF\xBD " , 3 , 1 };
724+ }
725+ return {prefix_replacement, replacement_len, longest_prefix_length};
679726 } else {
680727 // if yes, return this sequence unmodified
681728 size_t prefix_offset = input_offset + ustring::UTF8Len (input[input_offset]);
0 commit comments