Add explanatory comments for threshold and heap capacity choices

Copilot · stephentoub · Copilot · commit 8c0543d251f1 · 2026-02-13T01:47:47.000Z
Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com>
diff --git a/src/Microsoft.ML.Tokenizers/Utils/BytePairEncoder.cs b/src/Microsoft.ML.Tokenizers/Utils/BytePairEncoder.cs
@@ -20,6 +20,10 @@ public static (int Id, int TokenIndex, int TokenLength)[] BytePairEncode(ReadOnl
                 return [(ranks[mergingBytes], 0, 1)];
             }
 
+            // For large inputs, use the heap-based algorithm to avoid the O(n²) behavior of the linear scan.
+            // Threshold of 128 chosen empirically: the linear scan is cache-friendly for small inputs,
+            // while the heap's overhead (O(log n) per operation) only becomes worthwhile for larger inputs.
+            // Upstream tiktoken uses 100; the value is adjusted upward here for C#'s efficient span operations.
             if (mergingBytes.Length > 128)
             {
                 return BytePairEncodeLarge(mergingBytes, ranks, indexMappingSpan);
@@ -166,6 +170,9 @@ private static (int Id, int TokenIndex, int TokenLength)[] BytePairEncodeLarge(R
                 CurRank = int.MaxValue
             };
 
+            // Initial capacity: in the worst case, every adjacent pair is a valid merge candidate.
+            // In practice, many pairs won't be in the vocabulary, so this over-allocates slightly;
+            // that is accepted because presizing avoids repeated List reallocations during initialization.
             var heap = new PriorityQueue<MergeEntry>(mergingBytes.Length - 1);
 
             for (int i = 0; i < mergingBytes.Length - 1; i++)