|
| 1 | +/* |
| 2 | + * attack_method_kbc_list.hpp |
| 3 | + * |
| 4 | + * Created on: Nov 7, 2021 |
| 5 | + * Author: nick |
| 6 | + */ |
| 7 | + |
| 8 | +#ifndef ATTACK_METHOD_KBC_LIST_HPP_ |
| 9 | +#define ATTACK_METHOD_KBC_LIST_HPP_ |
| 10 | + |
| 11 | +#define ATTACK_FILTER_BITMASK(chacha_y,i) \ |
| 12 | +{ \ |
| 13 | + uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ |
| 14 | + int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ |
| 15 | + if (kbc_bucket_id_L > 0) { \ |
| 16 | + int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \ |
| 17 | + unsigned int kbc_bit_slot = kbc_bucket_id_L % 32; \ |
| 18 | + unsigned int kbc_mask = 1 << kbc_bit_slot; \ |
| 19 | + unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \ |
| 20 | + if ((kbc_mask & kbc_value) > 0) { \ |
| 21 | + int slot = atomicAdd(&count[0],1); \ |
| 22 | + xs[slot] = (x+i); \ |
| 23 | + chachas[slot] = chacha_y; \ |
| 24 | + } \ |
| 25 | + } \ |
| 26 | +} |
| 27 | + |
| 28 | +__global__ |
| 29 | +void gpu_chacha8_filter_rxs_by_kbc_bitmask(const uint32_t N, |
| 30 | + const __restrict__ uint32_t *input, |
| 31 | + const unsigned int* __restrict__ kbc_global_bitmask, |
| 32 | + uint32_t * __restrict__ xs, uint32_t * __restrict__ chachas, int *count) |
| 33 | +{ |
| 34 | + uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; |
| 35 | + |
| 36 | + int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; |
| 37 | + int stride = blockDim.x * gridDim.x; |
| 38 | + const uint32_t end_n = N / 16; // 16 x's in each group |
| 39 | + |
| 40 | + for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { |
| 41 | + uint32_t x = x_group << 4;// *16; |
| 42 | + uint32_t pos = x_group; |
| 43 | + |
| 44 | + x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; |
| 45 | + x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; |
| 46 | + x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; |
| 47 | + x14 = input[14];x15 = input[15]; |
| 48 | + |
| 49 | + #pragma unroll |
| 50 | + for (int i = 0; i < 4; i++) { |
| 51 | + QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); |
| 52 | + QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); |
| 53 | + } |
| 54 | + |
| 55 | + x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; |
| 56 | + x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; |
| 57 | + x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; |
| 58 | + x14 += input[14];x15 += input[15]; |
| 59 | + |
| 60 | + // convert to little endian/big endian whatever, chia needs it like this |
| 61 | + BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); |
| 62 | + BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); |
| 63 | + BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); |
| 64 | + |
| 65 | + //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 |
| 66 | + //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 |
| 67 | + ATTACK_FILTER_BITMASK(x0,0);ATTACK_FILTER_BITMASK(x1,1);ATTACK_FILTER_BITMASK(x2,2);ATTACK_FILTER_BITMASK(x3,3); |
| 68 | + ATTACK_FILTER_BITMASK(x4,4);ATTACK_FILTER_BITMASK(x5,5);ATTACK_FILTER_BITMASK(x6,6);ATTACK_FILTER_BITMASK(x7,7); |
| 69 | + ATTACK_FILTER_BITMASK(x8,8);ATTACK_FILTER_BITMASK(x9,9);ATTACK_FILTER_BITMASK(x10,10);ATTACK_FILTER_BITMASK(x11,11); |
| 70 | + ATTACK_FILTER_BITMASK(x12,12);ATTACK_FILTER_BITMASK(x13,13);ATTACK_FILTER_BITMASK(x14,14);ATTACK_FILTER_BITMASK(x15,15); |
| 71 | + } |
| 72 | +} |
| 73 | + |
| 74 | +__global__ |
| 75 | +void gpu_set_kbc_bitmask_from_kbc_list(const uint32_t N, |
| 76 | + uint32_t *kbc_list, unsigned int* kbc_bitmask) |
| 77 | +{ |
| 78 | + int i = blockIdx.x * blockDim.x + threadIdx.x; |
| 79 | + if (i < N) { |
| 80 | + uint32_t kbc_bucket_id = kbc_list[i]; |
| 81 | + int kbc_bitmask_bucket = kbc_bucket_id / 32; |
| 82 | + int kbc_bit_slot = kbc_bucket_id % 32; |
| 83 | + unsigned int kbc_mask = 1 << kbc_bit_slot; |
| 84 | + atomicOr(&kbc_bitmask[kbc_bitmask_bucket],kbc_mask); |
| 85 | + //printf("kbc slot %u value %u SET mask bucket: %u bitslot:%u\n",i, kbc_bucket_id, kbc_bitmask_bucket, kbc_bit_slot); |
| 86 | + // don't forget buckets needed for rx's. |
| 87 | + kbc_bitmask_bucket = (kbc_bucket_id+1) / 32; |
| 88 | + kbc_bit_slot = (kbc_bucket_id+1) % 32; |
| 89 | + kbc_mask = 1 << kbc_bit_slot; |
| 90 | + atomicOr(&kbc_bitmask[kbc_bitmask_bucket],kbc_mask); |
| 91 | + //printf("kbc %u SET mask bucket: %u bitslot:%u\n",kbc_bucket_id+1, kbc_bitmask_bucket, kbc_bit_slot); |
| 92 | + } |
| 93 | +} |
| 94 | + |
| 95 | +__global__ |
| 96 | +void gpu_count_kbc_mask_bits(unsigned int* kbc_bitmask) |
| 97 | +{ |
| 98 | + int count = 0; |
| 99 | + for (int kbc_bucket_id_L=0;kbc_bucket_id_L<kBC_NUM_BUCKETS;kbc_bucket_id_L++) { |
| 100 | + int kbc_bitmask_bucket = kbc_bucket_id_L / 32; |
| 101 | + int kbc_bit_slot = kbc_bucket_id_L % 32; |
| 102 | + unsigned int kbc_mask = 1 << kbc_bit_slot; |
| 103 | + unsigned int kbc_value = kbc_bitmask[kbc_bitmask_bucket]; |
| 104 | + if ((kbc_mask & kbc_value) > 0) { |
| 105 | + count++; |
| 106 | + } |
| 107 | + } |
| 108 | + printf("Counted kbc masks: %u\n",count); |
| 109 | +} |
| 110 | + |
| 111 | +#include <bits/stdc++.h> |
| 112 | + |
| 113 | +void attack_method_kbc_list(uint32_t bits) { |
| 114 | + |
| 115 | + const uint32_t NUM_L_KBCS = 208147; // T4 16-bit entry list size |
| 116 | + std::cout << "ATTACK METHOD KBC LIST NUM: " << NUM_L_KBCS << std::endl; |
| 117 | + |
| 118 | + /* Tried, really tried, but the bitmask slows it down too much, all those x's checking 4 billion times against |
| 119 | + * ram and then doing a simple xs/ys add, even so it's 109ms just to filter the xs compared to kbc bit scan method |
| 120 | + * that's done with that phase and sorted into buckets at 40ms tops. |
| 121 | + * DrPlotter v0.1d |
| 122 | +Attack it! |
| 123 | +ATTACK METHOD KBC LIST NUM: 208147 |
| 124 | + kbc list bytes size:832588 |
| 125 | + kbc_bitmask:832588 |
| 126 | + expected xs:106571264 size: 426285056 |
| 127 | + chachas:106571264 size: 426285056 |
| 128 | +Generating kbc list (step:87) |
| 129 | + num uniques:208146 duplicates: 0 |
| 130 | +setting kbc mask |
| 131 | + gpu_chacha8_set_Lxs_into_kbc_bitmask results: 1 ms |
| 132 | +Counted kbc masks: 411613 |
| 133 | +getting filtered xs/chachas list |
| 134 | + gpu_chacha8_filter_rxs_by_kbc_bitmask time: 109 ms |
| 135 | + xs count: 97190536 |
| 136 | +Freeing memory... |
| 137 | + compute only time: 287 ms |
| 138 | +end. |
| 139 | + * |
| 140 | + */ |
| 141 | + |
| 142 | + using milli = std::chrono::milliseconds; |
| 143 | + auto attack_start = std::chrono::high_resolution_clock::now(); |
| 144 | + |
| 145 | + // first we "read" the kbc list on host |
| 146 | + |
| 147 | + const uint32_t EXPECTED_XS = NUM_L_KBCS*2*256; |
| 148 | + uint32_t *kbc_list; |
| 149 | + unsigned int *kbc_bitmask; |
| 150 | + int *xs_count; |
| 151 | + uint32_t *xs; |
| 152 | + uint32_t *chachas; |
| 153 | + |
| 154 | + std::cout << " kbc list bytes size:" << (sizeof(uint32_t)*NUM_L_KBCS) << std::endl; |
| 155 | + CUDA_CHECK_RETURN(cudaMallocManaged(&kbc_list, sizeof(uint32_t)*NUM_L_KBCS)); |
| 156 | + std::cout << " kbc_bitmask:" << (sizeof(unsigned int)*NUM_L_KBCS) << std::endl; |
| 157 | + CUDA_CHECK_RETURN(cudaMalloc(&kbc_bitmask, kBC_NUM_BUCKETS*sizeof(unsigned int))); |
| 158 | + CUDA_CHECK_RETURN(cudaMemset(kbc_bitmask, 0, kBC_NUM_BUCKETS*sizeof(unsigned int))); |
| 159 | + std::cout << " expected xs:" << EXPECTED_XS << " size: " << (sizeof(uint32_t)*EXPECTED_XS) << std::endl; |
| 160 | + CUDA_CHECK_RETURN(cudaMalloc(&xs, EXPECTED_XS*sizeof(uint32_t))); |
| 161 | + std::cout << " chachas:" << EXPECTED_XS << " size: " << (sizeof(uint32_t)*EXPECTED_XS) << std::endl; |
| 162 | + CUDA_CHECK_RETURN(cudaMalloc(&chachas, EXPECTED_XS*sizeof(uint32_t))); |
| 163 | + CUDA_CHECK_RETURN(cudaMallocManaged(&xs_count, 1024)); // 1024 blocks maybe? |
| 164 | + |
| 165 | + auto compute_only_start = std::chrono::high_resolution_clock::now(); |
| 166 | + |
| 167 | + int step = kBC_NUM_BUCKETS / NUM_L_KBCS; |
| 168 | + std::cout << "Generating kbc list (step:" << step << ")" << std::endl; |
| 169 | + for (int i=0;i<NUM_L_KBCS;i++) { |
| 170 | + int value = rand() % kBC_NUM_BUCKETS;//i*step; |
| 171 | + //std::cout << " setting kbc " << value << std::endl; |
| 172 | + kbc_list[i] = value; // just set distribution but consistent for testing. |
| 173 | + } |
| 174 | + //std::sort(kbc_list, kbc_list + NUM_L_KBCS); |
| 175 | + int duplicates = 0; |
| 176 | + int uniques = 0; |
| 177 | + for (int i=1;i<NUM_L_KBCS;i++) { |
| 178 | + if (kbc_list[i] == kbc_list[i-1]) duplicates++; |
| 179 | + else uniques++; |
| 180 | + } |
| 181 | + std::cout << " num uniques:" << uniques << " duplicates: " << duplicates << std::endl; |
| 182 | + |
| 183 | + std::cout << "setting kbc mask" << std::endl; |
| 184 | + int blockSize = 256; // # of threads per block, maximum is 1024. |
| 185 | + uint64_t calc_N = NUM_L_KBCS; |
| 186 | + uint64_t calc_blockSize = blockSize; |
| 187 | + uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); |
| 188 | + int numBlocks = calc_numBlocks; |
| 189 | + |
| 190 | + auto time_start = std::chrono::high_resolution_clock::now(); |
| 191 | + gpu_set_kbc_bitmask_from_kbc_list<<<numBlocks,blockSize>>>(calc_N, kbc_list, kbc_bitmask); |
| 192 | + CUDA_CHECK_RETURN(cudaDeviceSynchronize()); |
| 193 | + auto time_finish = std::chrono::high_resolution_clock::now(); |
| 194 | + std::cout << " gpu_chacha8_set_Lxs_into_kbc_bitmask results: " << std::chrono::duration_cast<milli>(time_finish - time_start).count() << " ms\n"; |
| 195 | + |
| 196 | + gpu_count_kbc_mask_bits<<<1,1>>>(kbc_bitmask); |
| 197 | + CUDA_CHECK_RETURN(cudaDeviceSynchronize()); |
| 198 | + |
| 199 | + std::cout << "getting filtered xs/chachas list" << std::endl; |
| 200 | + blockSize = 256; // # of threads per block, maximum is 1024. |
| 201 | + calc_N = UINT_MAX; |
| 202 | + calc_blockSize = blockSize; |
| 203 | + calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); |
| 204 | + numBlocks = calc_numBlocks; |
| 205 | + xs_count[0] = 0; |
| 206 | + time_start = std::chrono::high_resolution_clock::now(); |
| 207 | + gpu_chacha8_filter_rxs_by_kbc_bitmask<<<numBlocks,blockSize>>>(calc_N,chacha_input, |
| 208 | + kbc_bitmask, xs, chachas, &xs_count[0]); |
| 209 | + CUDA_CHECK_RETURN(cudaDeviceSynchronize()); |
| 210 | + time_finish = std::chrono::high_resolution_clock::now(); |
| 211 | + std::cout << " gpu_chacha8_filter_rxs_by_kbc_bitmask time: " << std::chrono::duration_cast<milli>(time_finish - time_start).count() << " ms\n"; |
| 212 | + std::cout << " xs count: " << xs_count[0] << "\n"; |
| 213 | + |
| 214 | + |
| 215 | + auto compute_only_finish = std::chrono::high_resolution_clock::now(); |
| 216 | + |
| 217 | + std::cout << "Freeing memory..." << std::endl; |
| 218 | + CUDA_CHECK_RETURN(cudaFree(kbc_bitmask)); |
| 219 | + CUDA_CHECK_RETURN(cudaFree(xs)); |
| 220 | + CUDA_CHECK_RETURN(cudaFree(chachas)); |
| 221 | + |
| 222 | + std::cout << " compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n"; |
| 223 | + std::cout << "end." << std::endl; |
| 224 | + |
| 225 | +} |
| 226 | + |
| 227 | +#endif /* ATTACK_METHOD_KBC_LIST_HPP_ */ |
0 commit comments