Skip to content

Commit dbd8ad2

Browse files
Copilot and stephentoub committed
Address code review feedback - improve test coverage and optimize heap capacity
Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com>
1 parent d47b417 commit dbd8ad2

File tree

2 files changed

+14
-7
lines changed

2 files changed

+14
-7
lines changed

src/Microsoft.ML.Tokenizers/Utils/BytePairEncoder.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,7 @@ private static (int Id, int TokenIndex, int TokenLength)[] BytePairEncodeLarge(R
166166
CurRank = int.MaxValue
167167
};
168168

169-
var heap = new PriorityQueue<MergeEntry>(mergingBytes.Length);
169+
var heap = new PriorityQueue<MergeEntry>(mergingBytes.Length - 1);
170170

171171
for (int i = 0; i < mergingBytes.Length - 1; i++)
172172
{

test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -896,20 +896,27 @@ public void TestLargeInputConsistency(int length)
896896
// Verify that large and small inputs with same content produce identical tokens
897897
// This ensures the heap-based algorithm produces the same results as the original
898898

899-
string input = new string('z', length);
900-
IReadOnlyList<int> ids = GPT4.EncodeToIds(input);
899+
// Test with repeated character
900+
string inputRepeated = new string('z', length);
901+
IReadOnlyList<int> idsRepeated = GPT4.EncodeToIds(inputRepeated);
901902

902903
// Verify round-trip
903-
string decoded = GPT4.Decode(ids);
904-
Assert.Equal(input, decoded);
904+
string decodedRepeated = GPT4.Decode(idsRepeated);
905+
Assert.Equal(inputRepeated, decodedRepeated);
906+
907+
// Test with mixed content (more realistic scenario)
908+
string inputMixed = string.Join(" ", Enumerable.Repeat("Hello World! Test123", length / 20 + 1)).Substring(0, length);
909+
IReadOnlyList<int> idsMixed = GPT4.EncodeToIds(inputMixed);
910+
string decodedMixed = GPT4.Decode(idsMixed);
911+
Assert.Equal(inputMixed, decodedMixed);
905912

906913
// Verify with EncodeToTokens as well
907-
IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(input, out string? normalizedText);
914+
IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(inputRepeated, out string? normalizedText);
908915
Assert.Null(normalizedText); // No normalization expected
909916

910917
// Reconstruct from tokens
911918
var reconstructed = string.Concat(tokens.Select(t => t.Value));
912-
Assert.Equal(input, reconstructed);
919+
Assert.Equal(inputRepeated, reconstructed);
913920
}
914921

915922
[Fact]

0 commit comments

Comments
 (0)