@@ -896,20 +896,27 @@ public void TestLargeInputConsistency(int length)
         // Verify that large and small inputs with same content produce identical tokens
         // This ensures the heap-based algorithm produces the same results as the original

-        string input = new string('z', length);
-        IReadOnlyList<int> ids = GPT4.EncodeToIds(input);
+        // Test with repeated character
+        string inputRepeated = new string('z', length);
+        IReadOnlyList<int> idsRepeated = GPT4.EncodeToIds(inputRepeated);

         // Verify round-trip
-        string decoded = GPT4.Decode(ids);
-        Assert.Equal(input, decoded);
+        string decodedRepeated = GPT4.Decode(idsRepeated);
+        Assert.Equal(inputRepeated, decodedRepeated);
+
+        // Test with mixed content (more realistic scenario)
+        string inputMixed = string.Join(" ", Enumerable.Repeat("Hello World! Test123", length / 20 + 1)).Substring(0, length);
+        IReadOnlyList<int> idsMixed = GPT4.EncodeToIds(inputMixed);
+        string decodedMixed = GPT4.Decode(idsMixed);
+        Assert.Equal(inputMixed, decodedMixed);

         // Verify with EncodeToTokens as well
-        IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(input, out string? normalizedText);
+        IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(inputRepeated, out string? normalizedText);
         Assert.Null(normalizedText); // No normalization expected

         // Reconstruct from tokens
         var reconstructed = string.Concat(tokens.Select(t => t.Value));
-        Assert.Equal(input, reconstructed);
+        Assert.Equal(inputRepeated, reconstructed);
     }

     [Fact]
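For context, a minimal standalone sketch of the round-trip property these assertions exercise, runnable outside the xUnit harness. It reuses only the calls that appear in the diff (EncodeToIds, Decode, EncodeToTokens, EncodedToken.Value); the GPT4 fixture's construction is not shown in this hunk, so TiktokenTokenizer.CreateForModel("gpt-4") is an assumed way to build an equivalent Microsoft.ML.Tokenizers tokenizer.

using System;
using System.Linq;
using Microsoft.ML.Tokenizers;

class RoundTripSketch
{
    static void Main()
    {
        // Assumption: stands in for the test fixture's GPT4 tokenizer; fetches the
        // tiktoken vocabulary on first use.
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");

        string input = new string('z', 5000);

        // Encode to ids and decode back; the test asserts this round-trip is lossless.
        var ids = tokenizer.EncodeToIds(input);
        var decoded = tokenizer.Decode(ids);
        Console.WriteLine(decoded == input); // expected: True

        // Encode to tokens and reconstruct the text by concatenating token values.
        var tokens = tokenizer.EncodeToTokens(input, out string? normalizedText);
        Console.WriteLine(normalizedText is null);                              // expected: True (no normalization)
        Console.WriteLine(string.Concat(tokens.Select(t => t.Value)) == input); // expected: True
    }
}

The same two checks (id round-trip and token-value concatenation) are what the test applies to both the repeated-character and mixed-content inputs above.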