janhq
diff --git a/common/jinja/parser.cpp b/common/jinja/parser.cpp
Lines changed: 35 additions & 27 deletions
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
Lines changed: 5 additions & 6 deletions
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
Lines changed: 10 additions & 0 deletions
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
Lines changed: 48 additions & 0 deletions
diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh
Lines changed: 20 additions & 6 deletions
@@ -53,6 +53,13 @@ class parser {
         return tokens[current + offset];
     }
 
+    const token & next() { // Return the token at `current` and advance `current` past it.
+        if (current >= tokens.size()) { // nothing left to consume
+            throw parser_exception("Parser Error: Unexpected EOF", source, tokens.empty() ? 0 : tokens.back().pos); // position reported is the last token's pos, or 0 when there are no tokens
+        }
+        return tokens[current++]; // post-increment: the returned reference is to the token that was current on entry
+    }
+
     token expect(token::type type, const std::string&  error) {
         const auto & t = peek();
         if (t.t != type) {
@@ -90,9 +97,9 @@ class parser {
         size_t start_pos = current;
         switch (peek().t) {
             case token::comment:
-                return mk_stmt<comment_statement>(start_pos, tokens[current++].value);
+                return mk_stmt<comment_statement>(start_pos, next().value);
             case token::text:
-                return mk_stmt<string_literal>(start_pos, tokens[current++].value);
+                return mk_stmt<string_literal>(start_pos, next().value);
             case token::open_statement:
                 return parse_jinja_statement();
             case token::open_expression:
@@ -119,8 +126,7 @@ class parser {
         }
 
         size_t start_pos = current;
-        std::string name = peek().value;
-        current++; // consume identifier
+        std::string name = next().value;
 
         statement_ptr result;
         if (name == "set") {
@@ -202,7 +208,7 @@ class parser {
             // Ignore generation blocks (transformers-specific)
             // See https://github.com/huggingface/transformers/pull/30650 for more information.
             result = mk_stmt<noop_statement>(start_pos);
-            current++;
+            ++current;
 
         } else {
             throw std::runtime_error("Unknown statement: " + name);
@@ -217,7 +223,7 @@ class parser {
         statements body;
 
         if (is(token::equals)) {
-            current++;
+            ++current;
             value = parse_expression_sequence();
         } else {
             // parsing multiline set here
@@ -280,7 +286,7 @@ class parser {
         exprs.push_back(primary ? parse_primary_expression() : parse_expression());
         bool is_tuple = is(token::comma);
         while (is(token::comma)) {
-            current++; // consume comma
+            ++current; // consume comma
             exprs.push_back(primary ? parse_primary_expression() : parse_expression());
         }
         return is_tuple ? mk_stmt<tuple_literal>(start_pos, std::move(exprs)) : std::move(exprs[0]);
@@ -290,7 +296,7 @@ class parser {
         // e.g., `message` in `for message in messages`
         auto loop_var = parse_expression_sequence(true); // should be an identifier/tuple
         if (!is_identifier("in")) throw std::runtime_error("Expected 'in'");
-        current++;
+        ++current; // consume 'in'
 
         // `messages` in `for message in messages`
         auto iterable = parse_expression();
@@ -305,7 +311,8 @@ class parser {
         }
 
         if (is_statement({"else"})) {
-            current += 2;
+            ++current; // consume {%
+            ++current; // consume 'else'
             expect(token::close_statement, "Expected %}");
             while (!is_statement({"endfor"})) {
                 alternate.push_back(parse_any());
@@ -347,7 +354,7 @@ class parser {
         auto left = parse_logical_and_expression();
         while (is_identifier("or")) {
             size_t start_pos = current;
-            token op = tokens[current++];
+            token op = next();
             left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_logical_and_expression());
         }
         return left;
@@ -357,7 +364,7 @@ class parser {
         auto left = parse_logical_negation_expression();
         while (is_identifier("and")) {
             size_t start_pos = current;
-            auto op = tokens[current++];
+            auto op = next();
             left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_logical_negation_expression());
         }
         return left;
@@ -367,7 +374,7 @@ class parser {
         // Try parse unary operators
         if (is_identifier("not")) {
             size_t start_pos = current;
-            auto op = tokens[current++];
+            auto op = next();
             return mk_stmt<unary_expression>(start_pos, op, parse_logical_negation_expression());
         }
         return parse_comparison_expression();
@@ -382,11 +389,12 @@ class parser {
             size_t start_pos = current;
             if (is_identifier("not") && peek(1).t == token::identifier && peek(1).value == "in") {
                 op = {token::identifier, "not in", tokens[current].pos};
-                current += 2;
+                ++current; // consume 'not'
+                ++current; // consume 'in'
             } else if (is_identifier("in")) {
-                op = tokens[current++];
+                op = next();
             } else if (is(token::comparison_binary_operator)) {
-                op = tokens[current++];
+                op = next();
             } else break;
             left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_additive_expression());
         }
@@ -397,7 +405,7 @@ class parser {
         auto left = parse_multiplicative_expression();
         while (is(token::additive_binary_operator)) {
             size_t start_pos = current;
-            auto op = tokens[current++];
+            auto op = next();
             left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_multiplicative_expression());
         }
         return left;
@@ -407,7 +415,7 @@ class parser {
         auto left = parse_test_expression();
         while (is(token::multiplicative_binary_operator)) {
             size_t start_pos = current;
-            auto op = tokens[current++];
+            auto op = next();
             left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_test_expression());
         }
         return left;
@@ -417,9 +425,9 @@ class parser {
         auto operand = parse_filter_expression();
         while (is_identifier("is")) {
             size_t start_pos = current;
-            current++;
+            ++current; // consume 'is'
             bool negate = false;
-            if (is_identifier("not")) { current++; negate = true; }
+            if (is_identifier("not")) { ++current; negate = true; }
             auto test_id = parse_primary_expression();
             // FIXME: tests can also be expressed like this: if x is eq 3
             if (is(token::open_paren)) test_id = parse_call_expression(std::move(test_id));
@@ -432,7 +440,7 @@ class parser {
         auto operand = parse_call_member_expression();
         while (is(token::pipe)) {
             size_t start_pos = current;
-            current++;
+            ++current; // consume pipe
             auto filter = parse_primary_expression();
             if (is(token::open_paren)) filter = parse_call_expression(std::move(filter));
             operand = mk_stmt<filter_expression>(start_pos, std::move(operand), std::move(filter));
@@ -490,7 +498,7 @@ class parser {
     statement_ptr parse_member_expression(statement_ptr object) {
         size_t start_pos = current;
         while (is(token::dot) || is(token::open_square_bracket)) {
-            auto op = tokens[current++];
+            auto op = next();
             bool computed = op.t == token::open_square_bracket;
             statement_ptr prop;
             if (computed) {
@@ -536,7 +544,7 @@ class parser {
 
     statement_ptr parse_primary_expression() {
         size_t start_pos = current;
-        auto t = tokens[current++];
+        auto t = next();
         switch (t.t) {
             case token::numeric_literal:
                 if (t.value.find('.') != std::string::npos) {
@@ -547,7 +555,7 @@ class parser {
             case token::string_literal: {
                 std::string val = t.value;
                 while (is(token::string_literal)) {
-                    val += tokens[current++].value;
+                    val += next().value;
                 }
                 return mk_stmt<string_literal>(start_pos, val);
             }
@@ -562,9 +570,9 @@ class parser {
                 statements vals;
                 while (!is(token::close_square_bracket)) {
                     vals.push_back(parse_expression());
-                    if (is(token::comma)) current++;
+                    if (is(token::comma)) ++current;
                 }
-                current++;
+                ++current;
                 return mk_stmt<array_literal>(start_pos, std::move(vals));
             }
             case token::open_curly_bracket: {
@@ -573,9 +581,9 @@ class parser {
                     auto key = parse_expression();
                     expect(token::colon, "Expected :");
                     pairs.push_back({std::move(key), parse_expression()});
-                    if (is(token::comma)) current++;
+                    if (is(token::comma)) ++current;
                 }
-                current++;
+                ++current;
                 return mk_stmt<object_literal>(start_pos, std::move(pairs));
             }
             default:
 
@@ -116,12 +116,11 @@ if (CUDAToolkit_FOUND)
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
         add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
     else()
-        file(GLOB   SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB   SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB   SRCS "template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        list(APPEND GGML_SOURCES_CUDA
+            template-instances/fattn-vec-instance-f16-f16.cu
+            template-instances/fattn-vec-instance-q4_0-q4_0.cu
+            template-instances/fattn-vec-instance-q8_0-q8_0.cu
+            template-instances/fattn-vec-instance-bf16-bf16.cu)
     endif()
 
     ggml_add_backend_library(ggml-cuda
 
@@ -41,6 +41,16 @@ template<typename dst_t, typename src_t>
         return __bfloat162float(x);
     } else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
         return __float22half2_rn(x);
+    } else if constexpr(std::is_same_v<src_t, nv_bfloat162> && std::is_same_v<dst_t, float2>) {
+#ifdef GGML_USE_HIP
+        return make_float2(__bfloat162float(__low2bfloat16(x)), __bfloat162float(__high2bfloat16(x)));
+#else
+#if __CUDA_ARCH__ >= 800
+        return __bfloat1622float2(x);
+#else
+        return make_float2(__bfloat162float(x.x), __bfloat162float(x.y));
+#endif // __CUDA_ARCH__ >= 800
+#endif // GGML_USE_HIP
     } else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
         // bypass compile error on cuda 12.0.1
 #ifdef GGML_USE_HIP
 
@@ -74,6 +74,37 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(
     return sum;
 }
 
+template <int D, int nthreads> // D/2 bounds the outer loop over value pairs; nthreads sets the per-thread offset and stride below
+static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_bf16( // Accumulates products of BF16 K pairs with Q pairs into a float sum and returns it.
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
+
+    const nv_bfloat162 * K_bf16 = (const nv_bfloat162 *) K_c; // K is read as packed BF16 pairs
+    GGML_UNUSED(Q_q8); // Q_q8 and Q_ds_v are not read by this function
+    GGML_UNUSED(Q_ds_v);
+
+    constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
+    constexpr int cpy_ne = cpy_nb / 4; // number of nv_bfloat162 entries in tmp per copy (presumably 4 bytes each - confirm)
+
+    float sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) { // each step advances by nthreads*cpy_ne pairs
+        __align__(16) nv_bfloat162 tmp[cpy_ne];
+        ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_bf16 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne); // this thread's slice is selected by threadIdx.x % nthreads
+#pragma unroll
+        for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) { // K pairs are widened to float2 in both branches; only the Q_v element type differs
+#ifdef V_DOT2_F32_F16_AVAILABLE
+            // FIXME replace macros in vector FA kernel with templating and use FP32 for BF16
+            ggml_cuda_mad(sum, ggml_cuda_cast<float2>(tmp[k_KQ_1]), __half22float2(((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]));
+#else
+            ggml_cuda_mad(sum, ggml_cuda_cast<float2>(tmp[k_KQ_1]), ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
+#endif // V_DOT2_F32_F16_AVAILABLE
+        }
+    }
+
+    return sum;
+}
+
 template<int D, int nthreads>
 static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
@@ -321,6 +352,19 @@ static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict_
     }
 }
 
+template <typename T, int ne> // T must be float; ne is the number of BF16 values converted per call and must be even
+static __device__ __forceinline__ void dequantize_V_bf16(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) { // Copies ne BF16 values starting at element i0 of vx and writes them to dst as float2 pairs.
+    static_assert(std::is_same_v<T, float>, "BF16 V dequantization only supports float output");
+    static_assert(ne % 2 == 0, "bad ne");
+    __align__(16) nv_bfloat162 tmp[ne/2]; // staging buffer: ne values held as ne/2 packed pairs
+    ggml_cuda_memcpy_1<ne*sizeof(nv_bfloat16)>(tmp, (const nv_bfloat16 *) vx + i0); // i0 indexes vx viewed as single nv_bfloat16 elements
+    float2 * dst_f2 = (float2 *) dst; // dst is written as ne/2 float2 entries
+#pragma unroll
+    for (int l = 0; l < ne/2; ++l) {
+        dst_f2[l] = ggml_cuda_cast<float2>(tmp[l]); // widen each BF16 pair to float2
+    }
+}
+
 template <typename T, int ne>
 static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
     const block_q4_0 * x = (const block_q4_0 *) vx;
@@ -547,6 +591,8 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
         return vec_dot_fattn_vec_KQ_q5_1<D, nthreads>;
     } else if constexpr (type_K == GGML_TYPE_Q8_0) {
         return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_BF16) {
+        return vec_dot_fattn_vec_KQ_bf16<D, nthreads>;
     } else {
         static_assert(type_K == -1, "bad type");
         return nullptr;
@@ -567,6 +613,8 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
         return dequantize_V_q5_1<T, ne>;
     } else if constexpr (type_V == GGML_TYPE_Q8_0) {
         return dequantize_V_q8_0<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_BF16) {
+        return dequantize_V_bf16<float, ne>;
     } else {
         static_assert(type_V == -1, "bad type");
         return nullptr;
 
@@ -75,17 +75,17 @@ static __global__ void flash_attn_ext_vec(
 #endif // GGML_USE_HIP
 
     constexpr int nthreads    = ggml_cuda_fattn_vec_get_nthreads_device();
-    constexpr int nthreads_KQ = type_K == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_KQ_q;
-    constexpr int nthreads_V  = type_V == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_V_q;
+    constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q;
+    constexpr int nthreads_V  = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q;
 
     static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
     static_assert(WARP_SIZE % nthreads_V  == 0, "bad nthreads_V");
 
-    constexpr int V_rows_per_thread = type_V == GGML_TYPE_F16 ? 2*cpy_ne : 4;
+    constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4;
     constexpr int V_cols_per_iter   = WARP_SIZE / nthreads_V;
 
     constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
-    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
+    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16;
 #ifdef V_DOT2_F32_F16_AVAILABLE
     constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half,  V_rows_per_thread>();
 #else
@@ -323,8 +323,18 @@ static __global__ void flash_attn_ext_vec(
 #pragma unroll
             for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
                 half2 tmp[V_rows_per_thread/2];
-                dequantize_V(V + k*nb21, tmp,
-                    2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
+                if constexpr (type_V == GGML_TYPE_BF16) {
+                    float2 tmp_f[V_rows_per_thread/2];
+                    dequantize_V(V + k*nb21, tmp_f,
+                        2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
+#pragma unroll
+                    for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
+                        tmp[i_VKQ_1] = __float22half2_rn(tmp_f[i_VKQ_1]);
+                    }
+                } else {
+                    dequantize_V(V + k*nb21, tmp,
+                        2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
+                }
 #pragma unroll
                 for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
 #pragma unroll
@@ -563,24 +573,28 @@ void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_ten
     extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_0); \
     extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_1); \
     extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q8_0); \
+    extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_BF16); \
 
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_F16)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_0)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_1)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_0)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_1)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q8_0)
+EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_BF16)
 
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_F16)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_0)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_1)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_0)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_1)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q8_0)
+EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_BF16)
 
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_F16)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_0)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_1)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
+EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16)