Skip to content

Commit d47b417

Browse files
Copilot and stephentoub committed
Add tests for large input BPE optimization
Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com>
1 parent d2d5f40 commit d47b417

File tree

9 files changed

+98
-15
lines changed

9 files changed

+98
-15
lines changed

src/Microsoft.ML.Tokenizers/Utils/ByteToUnicodeEncoding.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

src/Microsoft.ML.Tokenizers/Utils/Helpers.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

src/Microsoft.ML.Tokenizers/Utils/Helpers.netcoreapp.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
using System.Diagnostics;
88
using System.Globalization;
99
using System.IO;
10+
using System.Net.Http;
1011
using System.Text;
11-
using System.Threading.Tasks;
1212
using System.Threading;
13-
using System.Net.Http;
13+
using System.Threading.Tasks;
1414

1515
#if Test
1616
namespace Microsoft.ML.Tokenizers.Tests

src/Microsoft.ML.Tokenizers/Utils/OrdinalUtf8StringComparer.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

test/Microsoft.ML.Tokenizers.Tests/EnglishRobertaTests.cs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6-
using System.IO;
6+
using System.Buffers;
77
using System.Collections.Generic;
8+
using System.IO;
89
using System.Linq;
9-
1010
using Xunit;
11-
using System.Buffers;
1211

1312
namespace Microsoft.ML.Tokenizers.Tests
1413
{

test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
using Microsoft.ML.Tokenizers;
65
using System;
76
using System.Buffers;
87
using System.Collections.Generic;
@@ -12,6 +11,7 @@
1211
using System.Reflection;
1312
using System.Runtime.CompilerServices;
1413
using System.Text;
14+
using Microsoft.ML.Tokenizers;
1515
using Xunit;
1616

1717
namespace Microsoft.ML.Tokenizers.Tests

test/Microsoft.ML.Tokenizers.Tests/NormalizerTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
using Microsoft.ML.Tokenizers;
65
using System;
76
using System.Collections.Generic;
87
using System.Linq;
98
using System.Text;
9+
using Microsoft.ML.Tokenizers;
1010
using Xunit;
1111

1212
namespace Microsoft.ML.Tokenizers.Tests

test/Microsoft.ML.Tokenizers.Tests/PreTokenizerTests.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
using Microsoft.ML.Tokenizers;
65
using System;
7-
using System.Linq;
86
using System.Collections.Generic;
7+
using System.Linq;
8+
using Microsoft.ML.Tokenizers;
99
using Xunit;
1010

1111
namespace Microsoft.ML.Tokenizers.Tests

test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
using Microsoft.DotNet.RemoteExecutor;
65
using System;
76
using System.Buffers;
87
using System.Collections.Generic;
@@ -13,6 +12,7 @@
1312
using System.Text;
1413
using System.Text.Json;
1514
using System.Threading.Tasks;
15+
using Microsoft.DotNet.RemoteExecutor;
1616
using Xunit;
1717

1818
namespace Microsoft.ML.Tokenizers.Tests
@@ -848,6 +848,90 @@ public void TestOss()
848848

849849
private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken)
850850
=> typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;
851+
852+
[Fact]
853+
public void TestLargeInputOptimization()
854+
{
855+
// Test that large inputs (>128 bytes) are handled correctly and produce same results as small input path
856+
// This tests the heap-based algorithm added for performance
857+
858+
// Test with repeated characters - this is the adversarial case that caused O(n^2) behavior
859+
string largeRepeatedInput = new string('a', 1000);
860+
IReadOnlyList<int> ids = GPT4.EncodeToIds(largeRepeatedInput);
861+
string decoded = GPT4.Decode(ids);
862+
Assert.Equal(largeRepeatedInput, decoded);
863+
864+
// Test with a more realistic large input
865+
string largeMixedInput = string.Join(" ", Enumerable.Repeat("Hello World! This is a test.", 50));
866+
IReadOnlyList<int> mixedIds = GPT4.EncodeToIds(largeMixedInput);
867+
string mixedDecoded = GPT4.Decode(mixedIds);
868+
Assert.Equal(largeMixedInput, mixedDecoded);
869+
870+
// Test boundary case - exactly at threshold (128)
871+
string boundaryInput = new string('x', 128);
872+
IReadOnlyList<int> boundaryIds = GPT4.EncodeToIds(boundaryInput);
873+
string boundaryDecoded = GPT4.Decode(boundaryIds);
874+
Assert.Equal(boundaryInput, boundaryDecoded);
875+
876+
// Test just below threshold (127)
877+
string belowThresholdInput = new string('x', 127);
878+
IReadOnlyList<int> belowIds = GPT4.EncodeToIds(belowThresholdInput);
879+
string belowDecoded = GPT4.Decode(belowIds);
880+
Assert.Equal(belowThresholdInput, belowDecoded);
881+
882+
// Test just above threshold (129)
883+
string aboveThresholdInput = new string('x', 129);
884+
IReadOnlyList<int> aboveIds = GPT4.EncodeToIds(aboveThresholdInput);
885+
string aboveDecoded = GPT4.Decode(aboveIds);
886+
Assert.Equal(aboveThresholdInput, aboveDecoded);
887+
}
888+
889+
[Theory]
890+
[InlineData(200)]
891+
[InlineData(500)]
892+
[InlineData(1000)]
893+
[InlineData(2000)]
894+
public void TestLargeInputConsistency(int length)
895+
{
896+
// Verify that large and small inputs with same content produce identical tokens
897+
// This ensures the heap-based algorithm produces the same results as the original
898+
899+
string input = new string('z', length);
900+
IReadOnlyList<int> ids = GPT4.EncodeToIds(input);
901+
902+
// Verify round-trip
903+
string decoded = GPT4.Decode(ids);
904+
Assert.Equal(input, decoded);
905+
906+
// Verify with EncodingToTokens as well
907+
IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(input, out string? normalizedText);
908+
Assert.Null(normalizedText); // No normalization expected
909+
910+
// Reconstruct from tokens
911+
var reconstructed = string.Concat(tokens.Select(t => t.Value));
912+
Assert.Equal(input, reconstructed);
913+
}
914+
915+
[Fact]
916+
public void TestLargeInputPerformance()
917+
{
918+
// Test that very large inputs complete in reasonable time
919+
// This would timeout or take extremely long with O(n^2) algorithm
920+
921+
string veryLargeInput = new string('a', 5000);
922+
var stopwatch = System.Diagnostics.Stopwatch.StartNew();
923+
IReadOnlyList<int> ids = GPT4.EncodeToIds(veryLargeInput);
924+
stopwatch.Stop();
925+
926+
// Should complete in well under a second even for 5000 chars
927+
// With O(n^2) this could take several seconds
928+
Assert.True(stopwatch.ElapsedMilliseconds < 5000,
929+
$"Large input encoding took {stopwatch.ElapsedMilliseconds}ms, expected < 5000ms");
930+
931+
// Verify correctness
932+
string decoded = GPT4.Decode(ids);
933+
Assert.Equal(veryLargeInput, decoded);
934+
}
851935
}
852936
}
853937

0 commit comments

Comments (0)