// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// In-place softmax over the rows of `x`.
//
// Each workgroup processes one row, starting at
// `workgroup_idx * uniforms.total_sequence_length_comp`. Within the row, every
// thread owns a slice of `uniforms.elements_per_thread` consecutive entries
// starting at `local_idx * uniforms.elements_per_thread`.
//
// The shader runs in three phases separated by workgroup barriers:
//   1. per-thread max, then combined into the row max;
//   2. per-thread sum of exp(value - max), then combined into the row sum;
//   3. every entry is overwritten with exp(value - max) / sum.
//
// Template parameters:
//   components         - lanes per entry of `x` (4, 2, anything else => scalar).
//   work_group_size    - length of the workgroup scratch arrays.
//   use_smooth_softmax - seed the max with 0 and add exp(-max) to the sum.
//   has_seqlen_k       - derive the causal length of the row from `seqlen_k`.
//   has_head_sink      - seed the max with `head_sink[head_idx]` and add
//                        exp(sink - max) to the sum.
//   has_sliding_window - restrict the softmax to the trailing
//                        `uniforms.local_window_size` positions.
//
// NOTE(review): `uniforms`, `x`, `seqlen_k`, `head_sink`, `workgroup_idx`,
// `local_idx`, `x_value_t` and `x_element_t` are not declared in this file;
// presumably they are emitted by the shader generator - confirm there.

#param components
#param work_group_size
#param use_smooth_softmax
#param has_seqlen_k
#param has_head_sink
#param has_sliding_window

// f32 type with the same lane count as one entry of `x`.
#if components == 4
alias f32_val_t = vec4<f32>;
#elif components == 2
alias f32_val_t = vec2<f32>;
#else
alias f32_val_t = f32;
#endif

// Per-thread partial results. Each thread writes its own slot, and after a
// barrier every thread reads all slots to form the row-wide value.
var<workgroup> thread_max: array<f32, work_group_size>;
var<workgroup> thread_sum: array<f32, work_group_size>;

$MAIN {
  // Seed for the max reduction.
  const f32_lowest = -3.4028234663852886e+38f;

  let sequence_length = uniforms.sequence_length;
  // Combined (batch, head) index of the row handled by this workgroup.
  let batch_head_idx = u32(workgroup_idx / sequence_length);
  let batch_idx = batch_head_idx / uniforms.num_heads;
  let head_idx = batch_head_idx % uniforms.num_heads;

#if has_seqlen_k
  let total_sequence_length = u32(seqlen_k[batch_idx]) + 1;
  // select(f, t, cond): the past length is 0 while processing the first prompt.
  let past_sequence_length: u32 = select(total_sequence_length - sequence_length, 0u, uniforms.is_first_prompt > 0);
  // Number of leading positions of this row that take part in the softmax.
  let seq_causal_length = past_sequence_length + workgroup_idx % sequence_length + 1;
#else
  let past_sequence_length = uniforms.past_sequence_length;
  let seq_causal_length = uniforms.total_sequence_length_comp;
#endif

  // First row-relative entry owned by this thread, and its absolute index in `x`.
  let local_offset = local_idx * uniforms.elements_per_thread;
  let offset = workgroup_idx * uniforms.total_sequence_length_comp + local_offset;

#if has_sliding_window
  // Sliding window: when the causal length exceeds the window size, only the
  // last `local_window_size` positions [start_offset, seq_causal_length) are used.
  let should_apply_local_window = uniforms.local_window_size >= 0 && seq_causal_length > uniforms.local_window_size;
  let start_offset = select(0, seq_causal_length - uniforms.local_window_size, should_apply_local_window);
  let effective_seq_length = select(seq_causal_length, uniforms.local_window_size, should_apply_local_window);
#else
  // No sliding window: the window code stays in the shader, but declaring
  // start_offset and should_apply_local_window as const lets the compiler
  // optimize it out.
  const start_offset = 0;
  const should_apply_local_window = false;
  let effective_seq_length = seq_causal_length;
#endif

  // Phase 1: maximum over this thread's slice.
  var thread_max_vector = f32_val_t(f32_lowest);
  for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < effective_seq_length; i++) {
    let actual_pos = local_offset + i + start_offset;
    if (!should_apply_local_window || actual_pos < seq_causal_length) {
      thread_max_vector = max(f32_val_t(x[offset + i + start_offset]), thread_max_vector);
    }
  }
  // Collapse the lanes to a single f32 and publish it.
#if components == 4
  thread_max[local_idx] = max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w));
#elif components == 2
  thread_max[local_idx] = max(thread_max_vector.x, thread_max_vector.y);
#else
  thread_max[local_idx] = thread_max_vector;
#endif
  workgroupBarrier();

  // Starting value of the row max depends on the softmax variant.
#if has_head_sink
  // Handle head sink: the sink value competes for the max.
  let sink_value: f32 = f32(head_sink[head_idx]);
  var max_value = sink_value;
#elif use_smooth_softmax
  var max_value: f32 = 0.0;
#else
  var max_value = f32_lowest;
#endif

  for (var i = 0u; i < work_group_size; i++) {
    max_value = max(thread_max[i], max_value);
  }

  // Phase 2: sum of exp(value - max) over this thread's slice.
  var sum_vector = f32_val_t(0);
  for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < effective_seq_length; i++) {
    let actual_pos = local_offset + i + start_offset;
    if (!should_apply_local_window || actual_pos < seq_causal_length) {
      sum_vector += exp(f32_val_t(x[offset + i + start_offset]) - max_value);
    }
  }
#if components == 4
  thread_sum[local_idx] = sum_vector.x + sum_vector.y + sum_vector.z + sum_vector.w;
#elif components == 2
  thread_sum[local_idx] = sum_vector.x + sum_vector.y;
#else
  thread_sum[local_idx] = sum_vector;
#endif
  workgroupBarrier();

  var sum: f32 = 0;
  for (var i = 0u; i < work_group_size; i++) {
    sum += thread_sum[i];
  }

  // The extra term of the sink / smooth variants is added to the denominator only.
#if has_head_sink
  sum += exp(sink_value - max_value);
#elif use_smooth_softmax
  sum += exp(-max_value);
#endif

  // Phase 3: write the normalized values back into `x`.
  if (sum == 0) {
    // Nothing contributed to the sum: write the same value everywhere instead
    // of dividing by zero.
    // NOTE(review): when !has_seqlen_k, effective_seq_length comes from
    // total_sequence_length_comp, i.e. it counts x_value_t entries; confirm that
    // 1 / effective_seq_length is the intended value when components > 1.
    for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < effective_seq_length; i++) {
      let actual_pos = local_offset + i + start_offset;
      // Same guard as the other passes. Without a window start_offset is 0 and
      // the loop bound already implies actual_pos < seq_causal_length.
      if (!should_apply_local_window || actual_pos < seq_causal_length) {
        x[offset + i + start_offset] = x_value_t(x_element_t(1.0)/x_element_t(effective_seq_length));
      }
    }
  } else {
    for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < effective_seq_length; i++) {
      let actual_pos = local_offset + i + start_offset;
      let pos = offset + i + start_offset;
      if (!should_apply_local_window || actual_pos < seq_causal_length) {
        var f32input = f32_val_t(x[pos]);
        x[pos] = x_value_t(exp(f32input - max_value) / sum);
      }
    }
  }

  // Zero out elements outside the sliding window, i.e. positions before
  // start_offset.
  if (should_apply_local_window) {
    for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < seq_causal_length; i++) {
      let global_pos = i + local_offset;
      if (global_pos < start_offset) {
        x[offset + i] = x_value_t(x_element_t(0));
      }
    }
  }

#if has_seqlen_k
  // Zero the rest of the row, from seq_causal_length to the end.
  // NOTE(review): `offset` already contains local_offset, so each thread clears
  // [seq_causal_length + local_offset, total_sequence_length_comp) and the
  // ranges of different threads overlap; confirm this redundancy is intended.
  for (var total_seq_id: u32 = seq_causal_length; total_seq_id + local_offset < uniforms.total_sequence_length_comp; total_seq_id++) {
    x[offset + total_seq_id] = x_value_t(x_element_t(0));
  }
#endif
} // MAIN
0 commit comments