/*
 * Copyright (c) 2025 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Extern template declarations to prevent implicit instantiation in the dispatcher.
// Explicit instantiations are in separate generated files for parallel compilation.
//
// NOTE(review): an explicit instantiation declaration requires the primary template
// `launch_delta_rule_prefill_kernel_gbai` to be declared already. This header does not
// include a declaration of it, so presumably the including file pulls that in first --
// confirm against the dispatcher's include order.

#pragma once

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>  // cudaStream_t

#include <cstdint>  // int32_t, int64_t, uint8_t

#include "cutlass/arch/arch.h"

namespace flat {

// clang-format off

// Invokes MACRO once for each of the 16 combinations of four boolean literals.
// Each invocation receives the four booleans as its leading arguments, followed by
// whatever extra arguments were passed to FOR_EACH_BOOL_4 after MACRO.
#define FOR_EACH_BOOL_4(MACRO, ...) \
  MACRO(false, false, false, false, __VA_ARGS__) \
  MACRO(false, false, false, true,  __VA_ARGS__) \
  MACRO(false, false, true,  false, __VA_ARGS__) \
  MACRO(false, false, true,  true,  __VA_ARGS__) \
  MACRO(false, true,  false, false, __VA_ARGS__) \
  MACRO(false, true,  false, true,  __VA_ARGS__) \
  MACRO(false, true,  true,  false, __VA_ARGS__) \
  MACRO(false, true,  true,  true,  __VA_ARGS__) \
  MACRO(true,  false, false, false, __VA_ARGS__) \
  MACRO(true,  false, false, true,  __VA_ARGS__) \
  MACRO(true,  false, true,  false, __VA_ARGS__) \
  MACRO(true,  false, true,  true,  __VA_ARGS__) \
  MACRO(true,  true,  false, false, __VA_ARGS__) \
  MACRO(true,  true,  false, true,  __VA_ARGS__) \
  MACRO(true,  true,  true,  false, __VA_ARGS__) \
  MACRO(true,  true,  true,  true,  __VA_ARGS__)

// Emits one `extern template` (explicit instantiation declaration) of
// launch_delta_rule_prefill_kernel_gbai for the given four boolean template arguments
// and element type `ctype`. The architecture tag is fixed to cutlass::arch::Sm90,
// `ctype` is passed for two of the type arguments, and `float` for the last one.
// The parenthesized list spells out the function's parameter types, unnamed, in order.
#define DECLARE_TEMPLATE_INSTANCE(is_gva, needs_beta, needs_alpha, init_state, ctype) \
  extern template void launch_delta_rule_prefill_kernel_gbai< \
      is_gva, needs_beta, needs_alpha, init_state, cutlass::arch::Sm90, ctype, ctype, float>( \
      cudaStream_t, ctype*, float*, ctype const*, ctype const*, ctype const*, \
      float const*, float const*, float const*, int64_t const*, uint8_t*, int32_t, int32_t, \
      int32_t, int32_t, int32_t, int32_t, int64_t, float, int32_t);

// Extern template declarations for half
FOR_EACH_BOOL_4(DECLARE_TEMPLATE_INSTANCE, half)

// Extern template declarations for nv_bfloat16
FOR_EACH_BOOL_4(DECLARE_TEMPLATE_INSTANCE, nv_bfloat16)

// The helper macros are local to this header; remove them so they do not leak to includers.
#undef DECLARE_TEMPLATE_INSTANCE
#undef FOR_EACH_BOOL_4

// clang-format on

}  // namespace flat