|
220 | 220 | # LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
221 | 221 | # LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
222 | 222 | # LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
| 223 | +# LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, |
223 | 224 | # };
|
224 | 225 | LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
|
225 | 226 | LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
|
|
247 | 248 | LLAMA_VOCAB_PRE_TYPE_BLOOM = 23
|
248 | 249 | LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24
|
249 | 250 | LLAMA_VOCAB_PRE_TYPE_EXAONE = 25
|
| 251 | +LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 |
250 | 252 |
|
251 | 253 |
|
252 | 254 | # // note: these values should be synchronized with ggml_rope
|
|
404 | 406 | # LLAMA_POOLING_TYPE_MEAN = 1,
|
405 | 407 | # LLAMA_POOLING_TYPE_CLS = 2,
|
406 | 408 | # LLAMA_POOLING_TYPE_LAST = 3,
|
| 409 | +# LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph |
407 | 410 | # };
|
408 | 411 | LLAMA_POOLING_TYPE_UNSPECIFIED = -1
|
409 | 412 | LLAMA_POOLING_TYPE_NONE = 0
|
410 | 413 | LLAMA_POOLING_TYPE_MEAN = 1
|
411 | 414 | LLAMA_POOLING_TYPE_CLS = 2
|
412 | 415 | LLAMA_POOLING_TYPE_LAST = 3
|
| 416 | +LLAMA_POOLING_TYPE_RANK = 4 |
413 | 417 |
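The new RANK pooling value is what reranking models select so the classification head gets attached to the graph. A minimal sketch of picking it through these low-level bindings, assuming a reranking-capable GGUF (the `reranker.gguf` path is a placeholder, not part of this change):

```python
import llama_cpp

llama_cpp.llama_backend_init()

# Placeholder model path; assumes the GGUF ships a classification head for reranking.
model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"reranker.gguf", model_params)

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_RANK  # new value 4 from this change
ctx_params.embeddings = True  # rank scores are read back through the embeddings API
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
```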
|
414 | 418 | # enum llama_attention_type {
|
415 | 419 | # LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
|
|
420 | 424 | LLAMA_ATTENTION_TYPE_CAUSAL = 0
|
421 | 425 | LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
|
422 | 426 |
|
| 427 | + |
423 | 428 | # enum llama_split_mode {
|
424 | -# LLAMA_SPLIT_MODE_NONE = 0, // single GPU |
425 | -# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs |
426 | -# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs |
| 429 | +# LLAMA_SPLIT_MODE_NONE = 0, // single GPU |
| 430 | +# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs |
| 431 | +# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs |
427 | 432 | # };
|
428 | 433 | LLAMA_SPLIT_MODE_NONE = 0
|
429 | 434 | LLAMA_SPLIT_MODE_LAYER = 1
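For context, a sketch of how one of these split modes might be chosen when loading a model on a multi-GPU build; the path and layer count are placeholders:

```python
import llama_cpp

params = llama_cpp.llama_model_default_params()
params.split_mode = llama_cpp.LLAMA_SPLIT_MODE_LAYER  # split layers and KV across GPUs
params.n_gpu_layers = 99  # placeholder: offload as many layers as fit
model = llama_cpp.llama_load_model_from_file(b"model.gguf", params)
```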
|
@@ -2520,7 +2525,8 @@ def llama_get_embeddings_ith(
|
2520 | 2525 |
|
2521 | 2526 | # // Get the embeddings for a sequence id
|
2522 | 2527 | # // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
2523 | -# // shape: [n_embd] (1-dimensional) |
| 2528 | +# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence |
| 2529 | +# // otherwise: float[n_embd] (1-dimensional) |
2524 | 2530 | # LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
2525 | 2531 | @ctypes_function(
|
2526 | 2532 | "llama_get_embeddings_seq",
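Given the updated return-shape note, a sketch of reading the score back out; it assumes `ctx` was created with LLAMA_POOLING_TYPE_RANK and the sequence has already been run through llama_decode:

```python
import llama_cpp

def rank_score(ctx: llama_cpp.llama_context_p, seq_id: int) -> float | None:
    # Assumes ctx uses LLAMA_POOLING_TYPE_RANK and seq_id has been decoded.
    ptr = llama_cpp.llama_get_embeddings_seq(ctx, seq_id)
    if not ptr:  # NULL when pooling_type is LLAMA_POOLING_TYPE_NONE
        return None
    return ptr[0]  # float[1]: the rank (relevance) score for this sequence
```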
|
@@ -2672,6 +2678,8 @@ def llama_token_eot(model: llama_model_p, /) -> int:
|
2672 | 2678 | # //
|
2673 | 2679 | # // Tokenization
|
2674 | 2680 | # //
|
| 2681 | +# // The API is thread-safe. |
| 2682 | +# // |
2675 | 2683 |
|
2676 | 2684 |
|
2677 | 2685 | # /// @details Convert the provided text into tokens.
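The new thread-safety note is what makes concurrent tokenization of request batches safe. A sketch under the assumption that `model` is an already-loaded llama_model_p; the 256-token buffer size is an arbitrary choice:

```python
from concurrent.futures import ThreadPoolExecutor

import llama_cpp

def count_tokens(model: llama_cpp.llama_model_p, text: bytes) -> int:
    buf = (llama_cpp.llama_token * 256)()  # arbitrary fixed-size token buffer
    # Returns the number of tokens written, or a negative count if buf is too small.
    return llama_cpp.llama_tokenize(model, text, len(text), buf, 256, True, False)

def count_all(model: llama_cpp.llama_model_p, texts: list[bytes]) -> list[int]:
    # Tokenization can run from several threads at once, per the note above.
    with ThreadPoolExecutor(max_workers=4) as pool:
        return list(pool.map(lambda t: count_tokens(model, t), texts))
```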
|
|