Commit 907997a

- Implemented Tokenizer.IDsToTokens()
- TempFixedAllocator's memory is now 128-byte aligned
- Moved output data structures to the Outputs directory

1 parent: d44b3f0

File tree: 11 files changed (+191 −10 lines)
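
For orientation, here is a minimal sketch of how the new managed API is meant to be used, pieced together from the overloads and the test added in this commit. The tokenizer instance and its configuration are assumed (not shown), and "Hello world" is just an illustrative input:

    using System;
    using Tokenizers.NET;

    // Sketch only: assumes an already-configured Tokenizer named "tokenizer".
    using var tokenizeResult = tokenizer.Tokenize("Hello world", addSpecialTokens: false);

    // Maps each token ID back to its vocabulary token as a managed string.
    string[] tokens = tokenizer.IDsToTokens(tokenizeResult.IDs);

    foreach (var token in tokens)
    {
        // BPE vocabularies typically mark a leading space with 'Ġ' (see Tests/DecodeTests.cs below).
        Console.WriteLine(token.Replace('Ġ', ' '));
    }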

Codegen/Program.cs

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 using System.Threading;
 using Tokenizers.NET;
 using Tokenizers.NET.Collections;
+using Tokenizers.NET.Outputs;
 
 namespace Codegen
 {

Native/src/lib.rs

Lines changed: 29 additions & 1 deletion
@@ -1,9 +1,9 @@
+use std::string::String;
 use std::marker::PhantomData;
 use std::ptr::{ null, null_mut };
 use std::slice;
 use tokenizers::tokenizer::Tokenizer;
 use tokenizers::Encoding;
-
 // #[inline(always)] is used aggressively - Realistically we only have a few callsites.
 
 #[repr(C)]
@@ -445,6 +445,34 @@ pub unsafe extern "C" fn tokenizer_decode_core(
     return DecodeOutput::from_text(text);
 }
 
+#[no_mangle]
+#[inline(always)]
+pub unsafe extern "C" fn ids_to_tokens(
+    tokenizer_ptr: *mut Tokenizer,
+    id_buffer: NativeBuffer<u32>,
+    token_buffer: NativeBuffer<NativeBuffer<u8>>)
+    -> *mut DropHandle<Vec<String>>
+{
+    let tokenizer = &*tokenizer_ptr;
+
+    let mut token_buffers = Vec::with_capacity(id_buffer.length);
+
+    let mut current_token_ptr = token_buffer.ptr.mutable;
+
+    for id in id_buffer.as_slice()
+    {
+        let mut token = tokenizer.id_to_token(*id).unwrap();
+
+        *current_token_ptr = NativeBuffer::from_mutable_vec(token.as_mut_vec());
+
+        current_token_ptr = current_token_ptr.add(1);
+
+        token_buffers.push(token);
+    }
+
+    return DropHandle::from_value_and_allocate_box(token_buffers);
+}
+
 #[no_mangle]
 #[inline(always)]
 pub unsafe extern "C" fn free_with_handle(handle: *mut DropHandle<()>)
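
On the managed side, ids_to_tokens is reached through the FreeHandle-returning IDsToTokens overloads added in Tokenizers.NET/Tokenizer.cs below. The Rust function fills the caller-supplied token_buffer with UTF-8 views into Strings owned by the returned DropHandle, so those views stay valid only until the handle is freed. A rough sketch of that calling pattern (assuming an existing Tokenizer named tokenizer, a NativeBuffer<uint> named ids, and an unsafe context; not code from this commit):

    using System;
    using System.Text;
    using Tokenizers.NET;
    using Tokenizers.NET.Collections;
    using Tokenizers.NET.Outputs;

    // One NativeBuffer<byte> slot per input ID; the native side writes a view into each slot.
    Span<NativeBuffer<byte>> u8Tokens = stackalloc NativeBuffer<byte>[(int) ids.Length];

    // The returned FreeHandle keeps the Rust-side Vec<String> alive while we read the views.
    using (tokenizer.IDsToTokens(ids, u8Tokens))
    {
        foreach (var u8Token in u8Tokens)
        {
            // Decode (or copy) the bytes before Dispose() releases the backing strings.
            Console.WriteLine(Encoding.UTF8.GetString(u8Token.Ptr, (int) u8Token.Length));
        }
    }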

Sample/Program.cs

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ private static void Main(string[] args)
 
             foreach (var token in outputSpan)
             {
-                const bool TEST_OVERFLOW = true;
+                const bool TEST_OVERFLOW = false;
 
                 if (TEST_OVERFLOW)
                 {

Tests/DecodeTests.cs

Lines changed: 29 additions & 0 deletions
@@ -1,3 +1,4 @@
+using System.Text;
 using Allure.NUnit;
 using FluentAssertions;
 using Tokenizers.NET;
@@ -107,5 +108,33 @@ public void DecodeMutatingStressTest()
                 x.Should().Be(text);
             }
         }
+
+        [Test]
+        public void IDsToTokens()
+        {
+            ref var tokenizer = ref FlorenceTokenizer;
+
+            const nuint MAX_VALUE = 500;
+
+            var stringBuilder = new StringBuilder();
+
+            for (nuint i = 1; i <= MAX_VALUE; i++)
+            {
+                var text = AllocateStringWithRandomChars((int) i);
+
+                using var tokenizeResult = tokenizer.Tokenize(text, addSpecialTokens: false);
+
+                var tokens = tokenizer.IDsToTokens(tokenizeResult.IDs);
+
+                foreach (var token in tokens)
+                {
+                    stringBuilder.Append(token.Replace('Ġ', ' '));
+                }
+
+                stringBuilder.ToString().Should().Be(text);
+
+                stringBuilder.Clear();
+            }
+        }
     }
 }

Tests/EncodeTests.cs

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 using FluentAssertions;
 using Tokenizers.NET;
 using Tokenizers.NET.Collections;
+using Tokenizers.NET.Outputs;
 
 namespace Tests
 {

Tokenizers.NET/Helpers/ThrowHelpers.cs

Lines changed: 6 additions & 0 deletions
@@ -32,5 +32,11 @@ public static void UTF8EncodingPirated_GetMaxCharCount_OutOfRange()
         {
             throw new InvalidOperationException("Too many bytes. The resulting number of chars is larger than what can be returned as an int.");
         }
+
+        [DoesNotReturn]
+        public static void IDsToTokens_LengthCheckFailed()
+        {
+            throw new ArgumentException("Output Span / Buffer length must be more than or equal to the input length.");
+        }
     }
 }

Tokenizers.NET/DecodeOutput.cs renamed to Tokenizers.NET/Outputs/DecodeOutput.cs

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 using System.Text;
 using Tokenizers.NET.Collections;
 
-namespace Tokenizers.NET
+namespace Tokenizers.NET.Outputs
 {
     [StructLayout(LayoutKind.Sequential)]
     public readonly struct DecodeOutput: IDisposable

Tokenizers.NET/Outputs/FreeHandle.cs

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+using System;
+using System.Runtime.CompilerServices;
+
+namespace Tokenizers.NET.Outputs
+{
+    public readonly struct FreeHandle(nint handle): IDisposable
+    {
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void Dispose()
+        {
+            TokenizerNativeMethods.FreeWithHandle(handle);
+        }
+    }
+}

Tokenizers.NET/TokenizeOutput.cs renamed to Tokenizers.NET/Outputs/TokenizeOutput.cs

Lines changed: 1 addition & 4 deletions
@@ -2,11 +2,8 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using Tokenizers.NET.Collections;
-#if DEBUG
-using System.Diagnostics;
-#endif
 
-namespace Tokenizers.NET
+namespace Tokenizers.NET.Outputs
 {
     public interface ITokenizeOutput
     {

Tokenizers.NET/Tokenizer.cs

Lines changed: 99 additions & 3 deletions
@@ -9,6 +9,7 @@
 using Tokenizers.NET.Collections;
 using Tokenizers.NET.Enumerators;
 using Tokenizers.NET.Helpers;
+using Tokenizers.NET.Outputs;
 
 namespace Tokenizers.NET
 {
@@ -171,15 +172,28 @@ private static readonly int
 
         private readonly int Count;
 
+        // Modern cacheline size is either 64 or 128 bytes,
+        // reducing cross-cacheline reads for SIMD instructions.
+        // This should also satisfy the alignment for NativeBuffer<NativeBuffer<byte>>,
+        // enabling us to reinterpret the memory in IDsToTokens() to avoid allocation.
+        private const int ALIGNMENT = 128;
+
+        static TempFixedAllocator()
+        {
+            Debug.Assert(ALIGNMENT % sizeof(NativeBuffer<NativeBuffer<byte>>) == 0);
+        }
+
         public TempFixedAllocator()
         {
             var maxExpectedBatches = Config.ExpectedMaxBatches.ToSignedUnchecked();
 
-            var buffers = Buffers = AllocationHelpers.AllocatePinnedUninitialized<byte>(
-                TOTAL_BUFFER_SIZE
+            var buffers = Buffers = AllocationHelpers.AllocatePinnedUninitializedAligned<byte>(
+                TOTAL_BUFFER_SIZE,
+                ALIGNMENT,
+                out var buffersPtr
             );
 
-            BuffersPtr = buffers.PinnedArrayToPointer();
+            BuffersPtr = buffersPtr;
 
             Count = maxExpectedBatches;
 
@@ -525,6 +539,88 @@ public DecodeOutput DecodeMutating(NativeBuffer<ulong> ids, bool skipSpecialTokens
                 skipSpecialTokens
             );
         }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public FreeHandle IDsToTokens(NativeBuffer<uint> ids, Span<NativeBuffer<byte>> u8Strings)
+        {
+            fixed (NativeBuffer<byte>* ptr = &MemoryMarshal.GetReference(u8Strings))
+            {
+                var u8StringsBuffer = new NativeBuffer<NativeBuffer<byte>>(ptr, (nuint) u8Strings.Length);
+
+                return IDsToTokens(ids, u8StringsBuffer);
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public FreeHandle IDsToTokens(
+            NativeBuffer<uint> ids,
+            NativeBuffer<NativeBuffer<byte>> tokens,
+            bool performSizeCheck = true)
+        {
+            if (performSizeCheck && tokens.Length < ids.Length)
+            {
+                ThrowHelpers.IDsToTokens_LengthCheckFailed();
+            }
+
+            var tokenizerHandle = TokenizerHandle;
+
+            return new(TokenizerNativeMethods.IDsToTokens(tokenizerHandle, ids, tokens));
+        }
+
+        public string[] IDsToTokens(NativeBuffer<uint> ids)
+        {
+            var tokens = new string[ids.Length];
+
+            IDsToTokens(ids, tokens, performSizeCheck: false);
+
+            return tokens;
+        }
+
+        public void IDsToTokens(NativeBuffer<uint> ids, Span<string> tokens, bool performSizeCheck = true)
+        {
+            var inputLength = ids.Length;
+
+            if (performSizeCheck && (nuint) tokens.Length < inputLength)
+            {
+                ThrowHelpers.IDsToTokens_LengthCheckFailed();
+            }
+
+            var allocationSizeInBytes = (int) inputLength * sizeof(NativeBuffer<NativeBuffer<byte>>);
+
+            var allocateNative = allocationSizeInBytes > (Config.ExpectedMaxInputLength * Config.ExpectedMaxBatches);
+
+            NativeBuffer<NativeBuffer<byte>> allocation;
+
+            if (!allocateNative)
+            {
+                var ptr = Allocator.GetFullAllocationUnsafely().Ptr;
+
+                allocation = new((NativeBuffer<byte>*) ptr, inputLength);
+            }
+
+            else
+            {
+                allocation = new NativeMemory<NativeBuffer<byte>>(inputLength).Buffer;
+            }
+
+            using var freeHandle = IDsToTokens(ids, allocation, performSizeCheck: false);
+
+            ref var currentToken = ref MemoryMarshal.GetReference(tokens);
+
+            foreach (var buffer in allocation)
+            {
+                // In theory, we could intern the tokenizer's vocab and greatly reduce string allocs,
+                // but it is what it is for now...
+                currentToken = Encoding.UTF8.GetString(buffer.Ptr, (int) buffer.Length);
+
+                currentToken = ref Unsafe.Add(ref currentToken, 1);
+            }
+
+            if (allocateNative)
+            {
+                NativeMemory<NativeBuffer<byte>>.FreeWithPtrUnsafely(allocation.Ptr);
+            }
+        }
 
         public void Dispose()
         {
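
Worth noting about the Span<string> overload above: it picks its scratch buffer with a simple size check. If inputLength * sizeof(NativeBuffer<NativeBuffer<byte>>) fits within Config.ExpectedMaxInputLength * Config.ExpectedMaxBatches bytes, it reinterprets the TempFixedAllocator's pinned block (the new 128-byte alignment and the Debug.Assert in the static constructor are there to make that reinterpretation safe); otherwise it takes a one-off NativeMemory allocation and frees it once the managed strings have been created. A worked example of that threshold, using hypothetical config values rather than anything from this commit:

    using System;

    // Hypothetical values, for illustration only.
    const int expectedMaxInputLength = 1024;
    const int expectedMaxBatches = 16;

    // Assuming NativeBuffer<NativeBuffer<byte>> is a pointer plus a length,
    // i.e. 16 bytes per entry on a 64-bit runtime.
    const int entrySizeInBytes = 16;

    int fixedBudgetInBytes = expectedMaxInputLength * expectedMaxBatches; // 16,384 bytes
    int maxIdsOnFixedPath = fixedBudgetInBytes / entrySizeInBytes;        // 1,024 IDs

    Console.WriteLine($"IDs served by the fixed allocator path: {maxIdsOnFixedPath}");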

Tokenizers.NET/TokenizerNativeMethods.cs

Lines changed: 9 additions & 0 deletions
@@ -1,6 +1,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using Tokenizers.NET.Collections;
+using Tokenizers.NET.Outputs;
 
 namespace Tokenizers.NET
 {
@@ -118,6 +119,14 @@ public static DecodeOutput TokenizerDecode(
         [LibraryImport(DLL_NAME, EntryPoint = "tokenizer_decode_skip_special_tokens")]
         private static partial DecodeOutput TokenizerDecodeSkipSpecialTokens(nint tokenizerPtr, NativeBuffer<uint> idBuffer);
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [LibraryImport(DLL_NAME, EntryPoint = "ids_to_tokens")]
+        public static partial nint IDsToTokens(
+            nint tokenizerPtr,
+            NativeBuffer<uint> idBuffer,
+            NativeBuffer<NativeBuffer<byte>> tokenBuffer
+        );
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [LibraryImport(DLL_NAME, EntryPoint = "free_with_handle")]
         public static partial void FreeWithHandle(nint handle);
