@@ -521,8 +521,21 @@ struct SpmUgmTokenizer {
521521 if (longest_prefix_offset >= prefix_replacements_size_) {
522522 ORTX_CXX_API_THROW (" [UgmTok]Index out of array bounds in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
523523 }
524+ // Validate that we have valid prefix_replacements_ data
525+ if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0 ) {
526+ ORTX_CXX_API_THROW (" [UgmTok]prefix_replacements_ is null or empty!" , ORT_RUNTIME_EXCEPTION);
527+ }
524528 const char * prefix_replacement = &prefix_replacements_[longest_prefix_offset];
525- return {prefix_replacement, static_cast <int >(longest_prefix_length)};
529+ // Calculate safe length: find null terminator within remaining buffer
530+ size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
531+ size_t replacement_len = 0 ;
532+ while (replacement_len < max_len && prefix_replacement[replacement_len] != ' \0 ' ) {
533+ replacement_len++;
534+ }
535+ if (replacement_len == max_len) {
536+ ORTX_CXX_API_THROW (" [UgmTok]Replacement string not null-terminated in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
537+ }
538+ return {std::string_view (prefix_replacement, replacement_len), static_cast <int >(longest_prefix_length)};
526539 } else {
527540 // if yes, return this sequence unmodified
528541 size_t prefix_offset = ustring::UTF8Len (input_view[0 ]);
@@ -674,8 +687,21 @@ struct SpmUgmTokenizer {
674687 if (longest_prefix_offset >= prefix_replacements_size_) {
675688 ORTX_CXX_API_THROW (" [UgmTok]Index out of array bounds in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
676689 }
690+ // Validate that we have valid prefix_replacements_ data
691+ if (prefix_replacements_ == nullptr || prefix_replacements_size_ == 0 ) {
692+ ORTX_CXX_API_THROW (" [UgmTok]prefix_replacements_ is null or empty!" , ORT_RUNTIME_EXCEPTION);
693+ }
677694 const char * prefix_replacement = &prefix_replacements_[longest_prefix_offset];
678- return {prefix_replacement, strlen (prefix_replacement), longest_prefix_length};
695+ // Calculate safe length: find null terminator within remaining buffer
696+ size_t max_len = prefix_replacements_size_ - longest_prefix_offset;
697+ size_t replacement_len = 0 ;
698+ while (replacement_len < max_len && prefix_replacement[replacement_len] != ' \0 ' ) {
699+ replacement_len++;
700+ }
701+ if (replacement_len == max_len) {
702+ ORTX_CXX_API_THROW (" [UgmTok]Replacement string not null-terminated in precompiled charsmap!" , ORT_RUNTIME_EXCEPTION);
703+ }
704+ return {prefix_replacement, replacement_len, longest_prefix_length};
679705 } else {
680706 // if yes, return this sequence unmodified
681707 size_t prefix_offset = input_offset + ustring::UTF8Len (input[input_offset]);
0 commit comments