forked from SciSharp/LLamaSharp
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathNativeApi.Mtmd.cs
More file actions
367 lines (293 loc) · 17.4 KB
/
Copy pathNativeApi.Mtmd.cs
File metadata and controls
367 lines (293 loc) · 17.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
using System;
using static LLama.Native.SafeMtmdInputChunk;
namespace LLama.Native;
/// <summary>
/// P/Invoke surface for MTMD (multimodal) helpers exposed by llama.cpp.
/// </summary>
public static partial class NativeApi
{
/// <summary>
/// Native context parameters returned by <see cref="mtmd_context_params_default"/>.
/// </summary>
/// <remarks>
/// Declared with sequential layout, so the field order below is the marshalled order.
/// NOTE(review): it presumably mirrors the native <c>mtmd_context_params</c> struct field for field;
/// confirm against the mtmd header shipped with the native library before reordering or adding fields.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
internal struct mtmd_context_params
{
// Boolean flags are marshalled as single-byte values (UnmanagedType.I1).
[MarshalAs(UnmanagedType.I1)] public bool use_gpu;
[MarshalAs(UnmanagedType.I1)] public bool print_timings;
public int n_threads;
// Marker strings are kept as raw pointers; no managed string conversion happens in this struct.
public IntPtr image_marker;
public IntPtr media_marker;
public LLamaFlashAttentionType flash_attn_type;
[MarshalAs(UnmanagedType.I1)] public bool warmup;
public int image_min_tokens;
public int image_max_tokens;
// Callback slots are private: they occupy space in the layout but cannot be set from outside this struct.
private IntPtr /* ggml_backend_sched_eval_callback */ cb_eval;
private IntPtr cb_eval_user_data;
}
/// <summary>
/// Raw binding for the native <c>mtmd_default_marker</c> entry point.
/// </summary>
/// <returns>Pointer to the marker text; <see cref="MtmdDefaultMarker"/> converts it to a managed string.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_default_marker", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_default_marker();

/// <summary>
/// Retrieve the default multimodal marker text.
/// </summary>
/// <returns>The marker as a managed string, or <c>null</c> if the pointer conversion produces no string.</returns>
public static string? MtmdDefaultMarker()
{
    var markerPtr = mtmd_default_marker();
    return markerPtr.PtrToString();
}
/// <summary>
/// Binds the native <c>mtmd_context_params_default</c> entry point.
/// </summary>
/// <returns>A <see cref="mtmd_context_params"/> value returned by value from the native library.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_context_params_default", CallingConvention = CallingConvention.Cdecl)]
internal static extern mtmd_context_params mtmd_context_params_default();
/// <summary>
/// Whether a non-causal mask needs to be set before llama_decode.
/// If <paramref name="chunk"/> is a null pointer, the default case is assumed where the chunk is an image chunk.
/// </summary>
/// <param name="ctx">MTMD model handle to query.</param>
/// <param name="chunk">
/// Raw native input chunk pointer to ask about. Defaults to <see cref="IntPtr.Zero"/> (nullptr), so
/// existing single-argument call sites keep compiling and explicitly request the default case
/// instead of leaving the second native argument undefined.
/// </param>
/// <returns>The native boolean result, marshalled from a single byte.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_decode_use_non_causal", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_decode_use_non_causal(SafeMtmdModelHandle ctx, IntPtr chunk = default);
/// <summary>
/// Whether the current model uses M-RoPE for llama_decode.
/// </summary>
/// <param name="ctx">MTMD model handle to query.</param>
/// <returns>The native boolean result, marshalled from a single byte.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_decode_use_mrope", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_decode_use_mrope(SafeMtmdModelHandle ctx);
/// <summary>
/// Whether the current model supports vision input.
/// </summary>
/// <param name="ctx">MTMD model handle to query.</param>
/// <returns>The native boolean result, marshalled from a single byte.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_support_vision", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_support_vision(SafeMtmdModelHandle ctx);
/// <summary>
/// Whether the current model supports audio input.
/// </summary>
/// <param name="ctx">MTMD model handle to query.</param>
/// <returns>The native boolean result, marshalled from a single byte.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_support_audio", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_support_audio(SafeMtmdModelHandle ctx);
/// <summary>
/// Get the audio sample rate in Hz, for example 16000 for Whisper.
/// </summary>
/// <param name="ctx">MTMD model handle to query.</param>
/// <returns>The integer value reported by the native library.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_get_audio_sample_rate", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_get_audio_sample_rate(SafeMtmdModelHandle ctx);
// bitmap ------------------------------------------------------------
/// <summary>
/// Binds the native <c>mtmd_bitmap_init</c> entry point.
/// </summary>
/// <param name="nx">First dimension forwarded to the native call (presumably the width — TODO confirm).</param>
/// <param name="ny">Second dimension forwarded to the native call (presumably the height — TODO confirm).</param>
/// <param name="data">Unmanaged pointer to the source bytes; the required buffer length is not visible here.</param>
/// <returns>Raw native bitmap pointer.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_init", CallingConvention = CallingConvention.Cdecl)]
internal static extern unsafe IntPtr mtmd_bitmap_init(uint nx, uint ny, byte* data);
/// <summary>
/// Binds the native <c>mtmd_bitmap_init_from_audio</c> entry point.
/// </summary>
/// <param name="n_samples">Number of samples, passed as a 64-bit unsigned value.</param>
/// <param name="data">Unmanaged pointer to the float samples.</param>
/// <returns>Raw native bitmap pointer.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_init_from_audio", CallingConvention = CallingConvention.Cdecl)]
internal static extern unsafe IntPtr mtmd_bitmap_init_from_audio(ulong n_samples, float* data);
/// <summary>
/// Binds the native <c>mtmd_bitmap_get_nx</c> entry point.
/// </summary>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_get_nx", CallingConvention = CallingConvention.Cdecl)]
internal static extern uint mtmd_bitmap_get_nx(SafeMtmdEmbed bitmap);
/// <summary>
/// Binds the native <c>mtmd_bitmap_get_ny</c> entry point.
/// </summary>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_get_ny", CallingConvention = CallingConvention.Cdecl)]
internal static extern uint mtmd_bitmap_get_ny(SafeMtmdEmbed bitmap);
/// <summary>
/// Binds the native <c>mtmd_bitmap_get_data</c> entry point.
/// </summary>
/// <returns>Unmanaged pointer to the bitmap bytes; see <see cref="mtmd_bitmap_get_n_bytes"/> for the byte count.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_get_data", CallingConvention = CallingConvention.Cdecl)]
internal static extern unsafe byte* mtmd_bitmap_get_data(SafeMtmdEmbed bitmap);
/// <summary>
/// Binds the native <c>mtmd_bitmap_get_n_bytes</c> entry point.
/// </summary>
/// <returns>Pointer-sized unsigned byte count.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_get_n_bytes", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_bitmap_get_n_bytes(SafeMtmdEmbed bitmap);
/// <summary>
/// Binds the native <c>mtmd_bitmap_is_audio</c> entry point.
/// </summary>
/// <returns>The native boolean result, marshalled from a single byte.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_is_audio", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_bitmap_is_audio(SafeMtmdEmbed bitmap);
/// <summary>
/// Binds the native <c>mtmd_bitmap_free</c> entry point.
/// Takes the raw pointer rather than a <c>SafeMtmdEmbed</c>.
/// </summary>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_free", CallingConvention = CallingConvention.Cdecl)]
internal static extern void mtmd_bitmap_free(IntPtr bitmap);
/// <summary>
/// Binds the native <c>mtmd_bitmap_get_id</c> entry point.
/// </summary>
/// <returns>Raw pointer to the identifier; no string conversion is performed by this binding.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_get_id", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_bitmap_get_id(SafeMtmdEmbed bitmap);
/// <summary>
/// Raw binding for the native <c>mtmd_bitmap_set_id</c> entry point; takes the identifier as a byte pointer.
/// </summary>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_bitmap_set_id", CallingConvention = CallingConvention.Cdecl)]
private static extern unsafe void mtmd_bitmap_set_id_native(SafeMtmdEmbed bitmap, byte* id);

/// <summary>
/// Assign an identifier to a bitmap using a UTF-8 encoded string.
/// </summary>
/// <param name="bitmap">Bitmap handle that receives the identifier.</param>
/// <param name="id">Identifier text; <c>null</c> is forwarded to the native call as a null pointer.</param>
/// <exception cref="ArgumentNullException">Thrown when the pinned UTF-8 copy of <paramref name="id"/> cannot be created.</exception>
internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id)
{
    if (id is not null)
    {
        // The pinned buffer only has to outlive the native call; it is disposed when the scope exits.
        using var utf8Id = PinnedUtf8String.Create(id) ?? throw new ArgumentNullException(nameof(id));
        mtmd_bitmap_set_id_native(bitmap, (byte*)utf8Id.Pointer);
        return;
    }

    mtmd_bitmap_set_id_native(bitmap, null);
}
// input_chunks ------------------------------------------------------
/// <summary>
/// Binds the native <c>mtmd_input_chunks_init</c> entry point.
/// </summary>
/// <returns>Raw pointer to the native chunk collection.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunks_init", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_input_chunks_init();
/// <summary>
/// Binds the native <c>mtmd_input_chunks_size</c> entry point.
/// </summary>
/// <param name="chunks">Raw pointer to the native chunk collection.</param>
/// <returns>Pointer-sized unsigned count.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunks_size", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_input_chunks_size(IntPtr chunks);
/// <summary>
/// Binds the native <c>mtmd_input_chunks_get</c> entry point.
/// </summary>
/// <param name="chunks">Raw pointer to the native chunk collection.</param>
/// <param name="idx">Index of the requested chunk.</param>
/// <returns>Raw pointer to the chunk at <paramref name="idx"/>.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunks_get", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_input_chunks_get(IntPtr chunks, UIntPtr idx);
/// <summary>
/// Binds the native <c>mtmd_input_chunks_free</c> entry point.
/// </summary>
/// <param name="chunks">Raw pointer to the native chunk collection.</param>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunks_free", CallingConvention = CallingConvention.Cdecl)]
internal static extern void mtmd_input_chunks_free(IntPtr chunks);
// input_chunk -------------------------------------------------------
/// <summary>
/// Binds the native <c>mtmd_input_chunk_get_type</c> entry point.
/// </summary>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunk_get_type", CallingConvention = CallingConvention.Cdecl)]
internal static extern SafeMtmdInputChunkType mtmd_input_chunk_get_type(SafeMtmdInputChunk chunk);
/// <summary>
/// Binds the native <c>mtmd_input_chunk_get_tokens_text</c> entry point.
/// </summary>
/// <param name="chunk">Chunk handle to read from.</param>
/// <param name="n_tokens">Receives the token count written by the native call.</param>
/// <returns>Raw pointer to the token data.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunk_get_tokens_text", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_input_chunk_get_tokens_text(SafeMtmdInputChunk chunk, out UIntPtr n_tokens);
/// <summary>
/// Binds the native <c>mtmd_input_chunk_get_tokens_image</c> entry point.
/// Unlike its neighbours, this overload takes the chunk as a raw pointer.
/// </summary>
/// <returns>Raw pointer to the native image tokens object.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunk_get_tokens_image", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_input_chunk_get_tokens_image(IntPtr chunk);
/// <summary>
/// Binds the native <c>mtmd_input_chunk_get_n_tokens</c> entry point.
/// </summary>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunk_get_n_tokens", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_input_chunk_get_n_tokens(SafeMtmdInputChunk chunk);
/// <summary>
/// Binds the native <c>mtmd_input_chunk_get_id</c> entry point.
/// </summary>
/// <returns>Raw pointer to the identifier; no string conversion is performed by this binding.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunk_get_id", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_input_chunk_get_id(SafeMtmdInputChunk chunk);
/// <summary>
/// Binds the native <c>mtmd_input_chunk_get_n_pos</c> entry point.
/// </summary>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunk_get_n_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_input_chunk_get_n_pos(SafeMtmdInputChunk chunk);
/// <summary>
/// Binds the native <c>mtmd_input_chunk_copy</c> entry point.
/// </summary>
/// <returns>Raw pointer to the copy. NOTE(review): presumably released via <see cref="mtmd_input_chunk_free"/> — confirm.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunk_copy", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_input_chunk_copy(SafeMtmdInputChunk chunk);
/// <summary>
/// Binds the native <c>mtmd_input_chunk_free</c> entry point.
/// </summary>
/// <param name="chunk">Raw pointer to the native chunk.</param>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_input_chunk_free", CallingConvention = CallingConvention.Cdecl)]
internal static extern void mtmd_input_chunk_free(IntPtr chunk);
// image_tokens ------------------------------------------------------
/// <summary>
/// Binds the native <c>mtmd_image_tokens_get_n_tokens</c> entry point.
/// </summary>
/// <param name="image_tokens">Raw pointer to the native image tokens object.</param>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_n_tokens", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_image_tokens_get_n_tokens(IntPtr image_tokens);
/// <summary>
/// Binds the native <c>mtmd_image_tokens_get_nx</c> entry point. Marked obsolete in this binding.
/// </summary>
/// <param name="image_tokens">Raw pointer to the native image tokens object.</param>
[Obsolete("use mtmd_image_tokens_get_decoder_pos() instead")]
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_nx", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_image_tokens_get_nx(IntPtr image_tokens);
/// <summary>
/// Binds the native <c>mtmd_image_tokens_get_ny</c> entry point. Marked obsolete in this binding.
/// </summary>
/// <param name="image_tokens">Raw pointer to the native image tokens object.</param>
[Obsolete("use mtmd_image_tokens_get_decoder_pos() instead")]
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_ny", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_image_tokens_get_ny(IntPtr image_tokens);
/// <summary>
/// Binds the native <c>mtmd_image_tokens_get_id</c> entry point.
/// </summary>
/// <param name="image_tokens">Raw pointer to the native image tokens object.</param>
/// <returns>Raw pointer to the identifier; no string conversion is performed by this binding.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_id", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_image_tokens_get_id(IntPtr image_tokens);
/// <summary>
/// Binds the native <c>mtmd_image_tokens_get_n_pos</c> entry point.
/// </summary>
/// <param name="image_tokens">Raw pointer to the native image tokens object.</param>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_n_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_image_tokens_get_n_pos(IntPtr image_tokens);
/// <summary>
/// Decoder position value returned by <see cref="mtmd_image_tokens_get_decoder_pos"/>.
/// </summary>
/// <remarks>
/// Four 32-bit unsigned components at explicit byte offsets 0, 4, 8 and 12 (16 bytes in total).
/// The fields are public so that callers can actually read the returned position; with the
/// implicit private accessibility the value was opaque to every consumer.
/// </remarks>
[StructLayout(LayoutKind.Explicit)]
internal struct mtmd_decoder_pos
{
    /// <summary>Component stored at byte offset 0.</summary>
    [FieldOffset(0)]
    public uint t;

    /// <summary>Component stored at byte offset 4.</summary>
    [FieldOffset(4)]
    public uint x;

    /// <summary>Component stored at byte offset 8.</summary>
    [FieldOffset(8)]
    public uint y;

    /// <summary>Component stored at byte offset 12.</summary>
    [FieldOffset(12)]
    public uint z;
}
/// <summary>
/// Get the position for decoder attention, to be used by M-RoPE models.
/// </summary>
/// <param name="image_tokens">Raw pointer to the native image tokens object.</param>
/// <param name="pos_0">pos_0 is the absolute position of the first token</param>
/// <param name="i">i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1</param>
/// <returns>The relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position).</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, LLamaPos pos_0, nuint i);
// tokenize ----------------------------------------------------------
/// <summary>
/// Native text structure consumed by <see cref="NativeApi.mtmd_tokenize(LLama.Native.SafeMtmdModelHandle,System.IntPtr,in LLama.Native.NativeApi.mtmd_input_text_native,System.IntPtr[],System.UIntPtr)"/>.
/// </summary>
/// <remarks>
/// Instances are handed to the native tokenizer by pointer, so the field order is significant.
/// NOTE(review): presumably mirrors the native <c>mtmd_input_text</c> struct — confirm against the header.
/// </remarks>
internal unsafe struct mtmd_input_text_native
{
// Pointer to the text bytes; the caller keeps the buffer alive for the duration of the native call.
public byte* text;
[MarshalAs(UnmanagedType.I1)] public bool add_special;
[MarshalAs(UnmanagedType.I1)] public bool parse_special;
}
/// <summary>
/// Disposable scope that keeps a pinned UTF-8 copy of managed text alive and exposes the
/// matching <see cref="mtmd_input_text_native"/> value for the native tokenizer.
/// </summary>
internal readonly unsafe ref struct MtmdInputTextScope
{
    /// <summary>Native view of the pinned text together with the tokenizer flags.</summary>
    public readonly mtmd_input_text_native Value;

    // Owns the pinned buffer that Value.text points into.
    private readonly PinnedUtf8String _text;

    /// <summary>
    /// Pin <paramref name="text"/> and build the native structure that refers to it.
    /// </summary>
    /// <param name="text">Managed text to expose to native code.</param>
    /// <param name="addSpecial">Copied into <c>add_special</c>.</param>
    /// <param name="parseSpecial">Copied into <c>parse_special</c>.</param>
    /// <exception cref="ArgumentNullException">Thrown when no pinned string can be created from <paramref name="text"/>.</exception>
    public MtmdInputTextScope(string text, bool addSpecial, bool parseSpecial)
    {
        _text = PinnedUtf8String.Create(text) ?? throw new ArgumentNullException(nameof(text));

        mtmd_input_text_native nativeText = default;
        nativeText.text = (byte*)_text.Pointer;
        nativeText.add_special = addSpecial;
        nativeText.parse_special = parseSpecial;
        Value = nativeText;
    }

    /// <summary>Release the pinned text; <see cref="Value"/> must not be used afterwards.</summary>
    public void Dispose()
    {
        _text.Dispose();
    }
}
/// <summary>
/// Raw binding for the native <c>mtmd_tokenize</c> entry point, taking the bitmaps as raw pointers.
/// </summary>
/// <param name="ctx">MTMD model handle.</param>
/// <param name="output">Raw pointer to the native chunk collection that receives the result.</param>
/// <param name="text">Pointer to the native text structure.</param>
/// <param name="bitmaps">Raw bitmap pointers.</param>
/// <param name="n_bitmaps">Number of entries in <paramref name="bitmaps"/> to pass to the native call.</param>
/// <returns>Integer status code from the native call.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_tokenize", CallingConvention = CallingConvention.Cdecl)]
private static extern unsafe int mtmd_tokenize_native(
SafeMtmdModelHandle ctx,
IntPtr output,
mtmd_input_text_native* text,
IntPtr[] bitmaps,
UIntPtr n_bitmaps);
/// <summary>
/// Raw binding for the native <c>mtmd_tokenize</c> entry point, taking the bitmaps as safe handles.
/// </summary>
/// <remarks>
/// NOTE(review): the interop marshaller may not support arrays of SafeHandle-derived types; if
/// <c>SafeMtmdEmbed</c> derives from SafeHandle this overload could fail at call time. Neither
/// wrapper in this file uses it — verify before relying on it.
/// </remarks>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_tokenize", CallingConvention = CallingConvention.Cdecl)]
private static extern unsafe int mtmd_tokenize_native(
SafeMtmdModelHandle ctx,
IntPtr output,
mtmd_input_text_native* text,
SafeMtmdEmbed[] bitmaps,
UIntPtr n_bitmaps);
/// <summary>
/// Tokenize using an already-built native text structure.
/// </summary>
/// <param name="ctx">MTMD model handle.</param>
/// <param name="output">Raw pointer to the native chunk collection that receives the result.</param>
/// <param name="text">Native text structure; copied locally before the call.</param>
/// <param name="bitmaps">Raw bitmap pointers.</param>
/// <param name="n_bitmaps">Number of bitmaps to pass to the native call.</param>
/// <returns>Integer status code from the native call.</returns>
internal static unsafe int mtmd_tokenize(SafeMtmdModelHandle ctx, IntPtr output, in mtmd_input_text_native text, IntPtr[] bitmaps, nuint n_bitmaps)
{
    // Work on a stack copy so a plain pointer can be taken without pinning the `in` argument.
    mtmd_input_text_native nativeText = text;
    mtmd_input_text_native* nativeTextPtr = &nativeText;
    return mtmd_tokenize_native(ctx, output, nativeTextPtr, bitmaps, n_bitmaps);
}
/// <summary>
/// Tokenize managed text: pins a UTF-8 copy of <paramref name="text"/> for the duration of the native call.
/// </summary>
/// <param name="ctx">MTMD model handle.</param>
/// <param name="output">Raw pointer to the native chunk collection that receives the result.</param>
/// <param name="text">Managed text to tokenize.</param>
/// <param name="addSpecial">Forwarded as the native <c>add_special</c> flag.</param>
/// <param name="parseSpecial">Forwarded as the native <c>parse_special</c> flag.</param>
/// <param name="bitmaps">Raw bitmap pointers.</param>
/// <param name="n_bitmaps">Number of bitmaps to pass to the native call.</param>
/// <returns>Integer status code from the native call.</returns>
/// <exception cref="ArgumentNullException">Thrown when the text cannot be pinned.</exception>
internal static unsafe int mtmd_tokenize(SafeMtmdModelHandle ctx, IntPtr output, string text, bool addSpecial, bool parseSpecial, IntPtr[] bitmaps, nuint n_bitmaps)
{
    using (var textScope = new MtmdInputTextScope(text, addSpecial, parseSpecial))
    {
        // The scope is released only after the native call has returned.
        return mtmd_tokenize_native(ctx, output, &textScope.Value, bitmaps, n_bitmaps);
    }
}
/// <summary>
/// Binds the native <c>mtmd_encode</c> entry point.
/// </summary>
/// <param name="ctx">Raw pointer to the native MTMD context.</param>
/// <param name="image_tokens">Raw pointer to the native image tokens object.</param>
/// <returns>Integer status code from the native call.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_encode", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_encode(IntPtr ctx, IntPtr image_tokens);
/// <summary>
/// Binds the native <c>mtmd_encode_chunk</c> entry point.
/// </summary>
/// <param name="ctx">Raw pointer to the native MTMD context.</param>
/// <param name="chunk">Raw pointer to the native input chunk.</param>
/// <returns>Integer status code from the native call.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_encode_chunk", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_encode_chunk(IntPtr ctx, IntPtr chunk);
/// <summary>
/// Binds the native <c>mtmd_get_output_embd</c> entry point.
/// </summary>
/// <param name="ctx">Raw pointer to the native MTMD context.</param>
/// <returns>Raw pointer to the output embeddings; element type and length are not visible from this binding.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_get_output_embd", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_get_output_embd(IntPtr ctx);
// helper ------------------------------------------------------------
/// <summary>
/// Binds the native <c>mtmd_test_create_input_chunks</c> entry point.
/// </summary>
/// <returns>Raw pointer to a native chunk collection.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_test_create_input_chunks", CallingConvention = CallingConvention.Cdecl)]
internal static extern IntPtr mtmd_test_create_input_chunks();
/// <summary>
/// Raw binding for the native <c>mtmd_helper_bitmap_init_from_file</c> entry point; takes the file name as a byte pointer.
/// </summary>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_bitmap_init_from_file", CallingConvention = CallingConvention.Cdecl)]
private static extern unsafe IntPtr mtmd_helper_bitmap_init_from_file_native(SafeMtmdModelHandle ctx, byte* fname);

/// <summary>
/// Create a native bitmap from a file, passing the file name as pinned UTF-8 text.
/// </summary>
/// <param name="ctx">MTMD model handle.</param>
/// <param name="fname">File name handed to the native helper.</param>
/// <returns>Raw pointer returned by the native helper.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="fname"/> cannot be pinned.</exception>
internal static unsafe IntPtr mtmd_helper_bitmap_init_from_file(SafeMtmdModelHandle ctx, string fname)
{
    using (var utf8Path = PinnedUtf8String.Create(fname) ?? throw new ArgumentNullException(nameof(fname)))
    {
        // The pinned buffer stays alive until the native helper has returned.
        var pathPtr = (byte*)utf8Path.Pointer;
        return mtmd_helper_bitmap_init_from_file_native(ctx, pathPtr);
    }
}
/// <summary>
/// Binds the native <c>mtmd_helper_bitmap_init_from_buf</c> entry point.
/// </summary>
/// <param name="ctx">MTMD model handle.</param>
/// <param name="buf">Unmanaged pointer to the input bytes.</param>
/// <param name="len">Length passed alongside <paramref name="buf"/>.</param>
/// <returns>Raw pointer returned by the native helper.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_bitmap_init_from_buf", CallingConvention = CallingConvention.Cdecl)]
internal static extern unsafe IntPtr mtmd_helper_bitmap_init_from_buf(SafeMtmdModelHandle ctx, byte* buf, nuint len);
/// <summary>
/// Binds the native <c>mtmd_helper_get_n_tokens</c> entry point.
/// </summary>
/// <param name="chunks">Chunk collection handle.</param>
/// <returns>Pointer-sized unsigned count.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_get_n_tokens", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_helper_get_n_tokens(SafeMtmdInputChunks chunks);
/// <summary>
/// Binds the native <c>mtmd_helper_get_n_pos</c> entry point.
/// </summary>
/// <param name="chunks">Chunk collection handle.</param>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_get_n_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_helper_get_n_pos(SafeMtmdInputChunks chunks);
/// <summary>
/// Helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE.
/// </summary>
/// <param name="image">Raw pointer to the native image tokens object.</param>
/// <param name="pos_0">Position value forwarded to the native helper.</param>
/// <param name="out_pos">Raw pointer to the output buffer; out_pos must have length == mtmd_helper_get_n_tokens(image).</param>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_image_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern void mtmd_helper_image_get_decoder_pos(
IntPtr /* mtmd_image_tokens* */ image,
LLamaPos pos_0,
IntPtr /* mtmd_decoder_pos* */ out_pos
);
/// <summary>
/// Binds the native <c>mtmd_helper_eval_chunks</c> entry point.
/// </summary>
/// <param name="ctx">MTMD model handle.</param>
/// <param name="lctx">llama context handle.</param>
/// <param name="chunks">Chunk collection handle.</param>
/// <param name="n_past">Position value forwarded to the native helper.</param>
/// <param name="seq_id">Sequence id forwarded to the native helper.</param>
/// <param name="n_batch">Batch size forwarded to the native helper.</param>
/// <param name="logits_last">Boolean flag, marshalled as a single byte.</param>
/// <param name="new_n_past">Passed by reference so the native helper can write the updated value.</param>
/// <returns>Integer status code from the native call.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_eval_chunks", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_helper_eval_chunks(
SafeMtmdModelHandle ctx,
SafeLLamaContextHandle lctx,
SafeMtmdInputChunks chunks,
int n_past,
int seq_id,
int n_batch,
[MarshalAs(UnmanagedType.I1)] bool logits_last,
ref int new_n_past);
/// <summary>
/// Binds the native <c>mtmd_helper_eval_chunk_single</c> entry point; same parameter shape as
/// <see cref="mtmd_helper_eval_chunks"/> but takes one chunk as a raw pointer.
/// </summary>
/// <param name="ctx">MTMD model handle.</param>
/// <param name="lctx">llama context handle.</param>
/// <param name="chunk">Raw pointer to the native input chunk.</param>
/// <param name="n_past">Position value forwarded to the native helper.</param>
/// <param name="seq_id">Sequence id forwarded to the native helper.</param>
/// <param name="n_batch">Batch size forwarded to the native helper.</param>
/// <param name="logits_last">Boolean flag, marshalled as a single byte.</param>
/// <param name="new_n_past">Passed by reference so the native helper can write the updated value.</param>
/// <returns>Integer status code from the native call.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_eval_chunk_single", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_helper_eval_chunk_single(
SafeMtmdModelHandle ctx,
SafeLLamaContextHandle lctx,
IntPtr chunk,
int n_past,
int seq_id,
int n_batch,
[MarshalAs(UnmanagedType.I1)] bool logits_last,
ref int new_n_past);
/// <summary>
/// Binds the native <c>mtmd_helper_decode_image_chunk</c> entry point.
/// </summary>
/// <param name="ctx">MTMD model handle.</param>
/// <param name="lctx">llama context handle.</param>
/// <param name="chunk">Raw pointer to the native input chunk.</param>
/// <param name="encoded_embd">Raw pointer to the encoded embeddings; element type and length are not visible from this binding.</param>
/// <param name="n_past">Position value forwarded to the native helper.</param>
/// <param name="seq_id">Sequence id forwarded to the native helper.</param>
/// <param name="n_batch">Batch size forwarded to the native helper.</param>
/// <param name="new_n_past">Passed by reference so the native helper can write the updated value.</param>
/// <returns>Integer status code from the native call.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_decode_image_chunk", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_helper_decode_image_chunk(
SafeMtmdModelHandle ctx,
SafeLLamaContextHandle lctx,
IntPtr chunk,
IntPtr encoded_embd,
int n_past,
int seq_id,
int n_batch,
ref int new_n_past);
/*
 * Reference only: the native declaration below has no managed binding in this file.
 *
 * // EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
 * // This is only intended to be used by llama-server; breaking changes are expected
 * struct mtmd_caps {
 *     bool inp_vision;
 *     bool inp_audio;
 * };
 * MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
 */
}