Skip to content

Commit d47b417

Browse files
Copilot and stephentoub committed
Add tests for large input BPE optimization
Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com>
1 parent d2d5f40 commit d47b417

File tree

9 files changed

+98
-15
lines changed

9 files changed

+98
-15
lines changed

src/Microsoft.ML.Tokenizers/Utils/ByteToUnicodeEncoding.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

src/Microsoft.ML.Tokenizers/Utils/Helpers.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

src/Microsoft.ML.Tokenizers/Utils/Helpers.netcoreapp.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
using System.Diagnostics;
88
using System.Globalization;
99
using System.IO;
10+
using System.Net.Http;
1011
using System.Text;
11-
using System.Threading.Tasks;
1212
using System.Threading;
13-
using System.Net.Http;
13+
using System.Threading.Tasks;
1414

1515
#if Test
1616
namespace Microsoft.ML.Tokenizers.Tests

src/Microsoft.ML.Tokenizers/Utils/OrdinalUtf8StringComparer.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

test/Microsoft.ML.Tokenizers.Tests/EnglishRobertaTests.cs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6-
using System.IO;
6+
using System.Buffers;
77
using System.Collections.Generic;
8+
using System.IO;
89
using System.Linq;
9-
1010
using Xunit;
11-
using System.Buffers;
1211

1312
namespace Microsoft.ML.Tokenizers.Tests
1413
{

test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
using Microsoft.ML.Tokenizers;
65
using System;
76
using System.Buffers;
87
using System.Collections.Generic;
@@ -12,6 +11,7 @@
1211
using System.Reflection;
1312
using System.Runtime.CompilerServices;
1413
using System.Text;
14+
using Microsoft.ML.Tokenizers;
1515
using Xunit;
1616

1717
namespace Microsoft.ML.Tokenizers.Tests

test/Microsoft.ML.Tokenizers.Tests/NormalizerTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
using Microsoft.ML.Tokenizers;
65
using System;
76
using System.Collections.Generic;
87
using System.Linq;
98
using System.Text;
9+
using Microsoft.ML.Tokenizers;
1010
using Xunit;
1111

1212
namespace Microsoft.ML.Tokenizers.Tests

test/Microsoft.ML.Tokenizers.Tests/PreTokenizerTests.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
using Microsoft.ML.Tokenizers;
65
using System;
7-
using System.Linq;
86
using System.Collections.Generic;
7+
using System.Linq;
8+
using Microsoft.ML.Tokenizers;
99
using Xunit;
1010

1111
namespace Microsoft.ML.Tokenizers.Tests

test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
using Microsoft.DotNet.RemoteExecutor;
65
using System;
76
using System.Buffers;
87
using System.Collections.Generic;
@@ -13,6 +12,7 @@
1312
using System.Text;
1413
using System.Text.Json;
1514
using System.Threading.Tasks;
15+
using Microsoft.DotNet.RemoteExecutor;
1616
using Xunit;
1717

1818
namespace Microsoft.ML.Tokenizers.Tests
@@ -848,6 +848,90 @@ public void TestOss()
848848

849849
private static IReadOnlyDictionary<string, int>? GetVocabulary(TiktokenTokenizer tiktoken)
850850
=> typeof(TiktokenTokenizer).GetProperty("Vocabulary", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<string, int>;
851+
852+
[Fact]
853+
public void TestLargeInputOptimization()
854+
{
855+
// Test that large inputs (>128 bytes) are handled correctly and produce same results as small input path
856+
// This tests the heap-based algorithm added for performance
857+
858+
// Test with repeated characters - this is the adversarial case that caused O(n^2) behavior
859+
string largeRepeatedInput = new string('a', 1000);
860+
IReadOnlyList<int> ids = GPT4.EncodeToIds(largeRepeatedInput);
861+
string decoded = GPT4.Decode(ids);
862+
Assert.Equal(largeRepeatedInput, decoded);
863+
864+
// Test with a more realistic large input
865+
string largeMixedInput = string.Join(" ", Enumerable.Repeat("Hello World! This is a test.", 50));
866+
IReadOnlyList<int> mixedIds = GPT4.EncodeToIds(largeMixedInput);
867+
string mixedDecoded = GPT4.Decode(mixedIds);
868+
Assert.Equal(largeMixedInput, mixedDecoded);
869+
870+
// Test boundary case - exactly at threshold (128)
871+
string boundaryInput = new string('x', 128);
872+
IReadOnlyList<int> boundaryIds = GPT4.EncodeToIds(boundaryInput);
873+
string boundaryDecoded = GPT4.Decode(boundaryIds);
874+
Assert.Equal(boundaryInput, boundaryDecoded);
875+
876+
// Test just below threshold (127)
877+
string belowThresholdInput = new string('x', 127);
878+
IReadOnlyList<int> belowIds = GPT4.EncodeToIds(belowThresholdInput);
879+
string belowDecoded = GPT4.Decode(belowIds);
880+
Assert.Equal(belowThresholdInput, belowDecoded);
881+
882+
// Test just above threshold (129)
883+
string aboveThresholdInput = new string('x', 129);
884+
IReadOnlyList<int> aboveIds = GPT4.EncodeToIds(aboveThresholdInput);
885+
string aboveDecoded = GPT4.Decode(aboveIds);
886+
Assert.Equal(aboveThresholdInput, aboveDecoded);
887+
}
888+
889+
[Theory]
890+
[InlineData(200)]
891+
[InlineData(500)]
892+
[InlineData(1000)]
893+
[InlineData(2000)]
894+
public void TestLargeInputConsistency(int length)
895+
{
896+
// Verify that large and small inputs with same content produce identical tokens
897+
// This ensures the heap-based algorithm produces the same results as the original
898+
899+
string input = new string('z', length);
900+
IReadOnlyList<int> ids = GPT4.EncodeToIds(input);
901+
902+
// Verify round-trip
903+
string decoded = GPT4.Decode(ids);
904+
Assert.Equal(input, decoded);
905+
906+
// Verify with EncodingToTokens as well
907+
IReadOnlyList<EncodedToken> tokens = GPT4.EncodeToTokens(input, out string? normalizedText);
908+
Assert.Null(normalizedText); // No normalization expected
909+
910+
// Reconstruct from tokens
911+
var reconstructed = string.Concat(tokens.Select(t => t.Value));
912+
Assert.Equal(input, reconstructed);
913+
}
914+
915+
[Fact]
916+
public void TestLargeInputPerformance()
917+
{
918+
// Test that very large inputs complete in reasonable time
919+
// This would timeout or take extremely long with O(n^2) algorithm
920+
921+
string veryLargeInput = new string('a', 5000);
922+
var stopwatch = System.Diagnostics.Stopwatch.StartNew();
923+
IReadOnlyList<int> ids = GPT4.EncodeToIds(veryLargeInput);
924+
stopwatch.Stop();
925+
926+
// Should complete in well under a second even for 5000 chars
927+
// With O(n^2) this could take several seconds
928+
Assert.True(stopwatch.ElapsedMilliseconds < 5000,
929+
$"Large input encoding took {stopwatch.ElapsedMilliseconds}ms, expected < 5000ms");
930+
931+
// Verify correctness
932+
string decoded = GPT4.Decode(ids);
933+
Assert.Equal(veryLargeInput, decoded);
934+
}
851935
}
852936
}
853937

0 commit comments

Comments (0)