Skip to content

Commit 0bc8385

Browse files
author
drnick23
authored
Merge pull request #1 from drnick23/full-plot
main
2 parents 6cb4804 + 8c397fa commit 0bc8385

14 files changed

Lines changed: 9967 additions & 46 deletions

attack.hpp

Lines changed: 1468 additions & 0 deletions
Large diffs are not rendered by default.

attack_method_1.hpp

Lines changed: 493 additions & 0 deletions
Large diffs are not rendered by default.

attack_method_2.hpp

Lines changed: 1460 additions & 0 deletions
Large diffs are not rendered by default.

attack_method_kbc_list.hpp

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
/*
2+
* attack_method_kbc_list.hpp
3+
*
4+
* Created on: Nov 7, 2021
5+
* Author: nick
6+
*/
7+
8+
#ifndef ATTACK_METHOD_KBC_LIST_HPP_
9+
#define ATTACK_METHOD_KBC_LIST_HPP_
10+
11+
#define ATTACK_FILTER_BITMASK(chacha_y,i) \
12+
{ \
13+
uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \
14+
int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \
15+
if (kbc_bucket_id_L > 0) { \
16+
int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \
17+
unsigned int kbc_bit_slot = kbc_bucket_id_L % 32; \
18+
unsigned int kbc_mask = 1 << kbc_bit_slot; \
19+
unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \
20+
if ((kbc_mask & kbc_value) > 0) { \
21+
int slot = atomicAdd(&count[0],1); \
22+
xs[slot] = (x+i); \
23+
chachas[slot] = chacha_y; \
24+
} \
25+
} \
26+
}
27+
28+
__global__
29+
void gpu_chacha8_filter_rxs_by_kbc_bitmask(const uint32_t N,
30+
const __restrict__ uint32_t *input,
31+
const unsigned int* __restrict__ kbc_global_bitmask,
32+
uint32_t * __restrict__ xs, uint32_t * __restrict__ chachas, int *count)
33+
{
34+
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
35+
36+
int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
37+
int stride = blockDim.x * gridDim.x;
38+
const uint32_t end_n = N / 16; // 16 x's in each group
39+
40+
for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
41+
uint32_t x = x_group << 4;// *16;
42+
uint32_t pos = x_group;
43+
44+
x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
45+
x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
46+
x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
47+
x14 = input[14];x15 = input[15];
48+
49+
#pragma unroll
50+
for (int i = 0; i < 4; i++) {
51+
QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
52+
QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
53+
}
54+
55+
x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
56+
x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
57+
x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
58+
x14 += input[14];x15 += input[15];
59+
60+
// convert to little endian/big endian whatever, chia needs it like this
61+
BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
62+
BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
63+
BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
64+
65+
//uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
66+
//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
67+
ATTACK_FILTER_BITMASK(x0,0);ATTACK_FILTER_BITMASK(x1,1);ATTACK_FILTER_BITMASK(x2,2);ATTACK_FILTER_BITMASK(x3,3);
68+
ATTACK_FILTER_BITMASK(x4,4);ATTACK_FILTER_BITMASK(x5,5);ATTACK_FILTER_BITMASK(x6,6);ATTACK_FILTER_BITMASK(x7,7);
69+
ATTACK_FILTER_BITMASK(x8,8);ATTACK_FILTER_BITMASK(x9,9);ATTACK_FILTER_BITMASK(x10,10);ATTACK_FILTER_BITMASK(x11,11);
70+
ATTACK_FILTER_BITMASK(x12,12);ATTACK_FILTER_BITMASK(x13,13);ATTACK_FILTER_BITMASK(x14,14);ATTACK_FILTER_BITMASK(x15,15);
71+
}
72+
}
73+
74+
__global__
75+
void gpu_set_kbc_bitmask_from_kbc_list(const uint32_t N,
76+
uint32_t *kbc_list, unsigned int* kbc_bitmask)
77+
{
78+
int i = blockIdx.x * blockDim.x + threadIdx.x;
79+
if (i < N) {
80+
uint32_t kbc_bucket_id = kbc_list[i];
81+
int kbc_bitmask_bucket = kbc_bucket_id / 32;
82+
int kbc_bit_slot = kbc_bucket_id % 32;
83+
unsigned int kbc_mask = 1 << kbc_bit_slot;
84+
atomicOr(&kbc_bitmask[kbc_bitmask_bucket],kbc_mask);
85+
//printf("kbc slot %u value %u SET mask bucket: %u bitslot:%u\n",i, kbc_bucket_id, kbc_bitmask_bucket, kbc_bit_slot);
86+
// don't forget buckets needed for rx's.
87+
kbc_bitmask_bucket = (kbc_bucket_id+1) / 32;
88+
kbc_bit_slot = (kbc_bucket_id+1) % 32;
89+
kbc_mask = 1 << kbc_bit_slot;
90+
atomicOr(&kbc_bitmask[kbc_bitmask_bucket],kbc_mask);
91+
//printf("kbc %u SET mask bucket: %u bitslot:%u\n",kbc_bucket_id+1, kbc_bitmask_bucket, kbc_bit_slot);
92+
}
93+
}
94+
95+
__global__
96+
void gpu_count_kbc_mask_bits(unsigned int* kbc_bitmask)
97+
{
98+
int count = 0;
99+
for (int kbc_bucket_id_L=0;kbc_bucket_id_L<kBC_NUM_BUCKETS;kbc_bucket_id_L++) {
100+
int kbc_bitmask_bucket = kbc_bucket_id_L / 32;
101+
int kbc_bit_slot = kbc_bucket_id_L % 32;
102+
unsigned int kbc_mask = 1 << kbc_bit_slot;
103+
unsigned int kbc_value = kbc_bitmask[kbc_bitmask_bucket];
104+
if ((kbc_mask & kbc_value) > 0) {
105+
count++;
106+
}
107+
}
108+
printf("Counted kbc masks: %u\n",count);
109+
}
110+
111+
#include <bits/stdc++.h>
112+
113+
void attack_method_kbc_list(uint32_t bits) {
114+
115+
const uint32_t NUM_L_KBCS = 208147; // T4 16-bit entry list size
116+
std::cout << "ATTACK METHOD KBC LIST NUM: " << NUM_L_KBCS << std::endl;
117+
118+
/* Tried, really tried, but the bitmask slows it down too much, all those x's checking 4 billion times against
119+
* ram and then doing a simple xs/ys add, even so it's 109ms just to filter the xs compared to kbc bit scan method
120+
* that's done with that phase and sorted into buckets at 40ms tops.
121+
* DrPlotter v0.1d
122+
Attack it!
123+
ATTACK METHOD KBC LIST NUM: 208147
124+
kbc list bytes size:832588
125+
kbc_bitmask:832588
126+
expected xs:106571264 size: 426285056
127+
chachas:106571264 size: 426285056
128+
Generating kbc list (step:87)
129+
num uniques:208146 duplicates: 0
130+
setting kbc mask
131+
gpu_chacha8_set_Lxs_into_kbc_bitmask results: 1 ms
132+
Counted kbc masks: 411613
133+
getting filtered xs/chachas list
134+
gpu_chacha8_filter_rxs_by_kbc_bitmask time: 109 ms
135+
xs count: 97190536
136+
Freeing memory...
137+
compute only time: 287 ms
138+
end.
139+
*
140+
*/
141+
142+
using milli = std::chrono::milliseconds;
143+
auto attack_start = std::chrono::high_resolution_clock::now();
144+
145+
// first we "read" the kbc list on host
146+
147+
const uint32_t EXPECTED_XS = NUM_L_KBCS*2*256;
148+
uint32_t *kbc_list;
149+
unsigned int *kbc_bitmask;
150+
int *xs_count;
151+
uint32_t *xs;
152+
uint32_t *chachas;
153+
154+
std::cout << " kbc list bytes size:" << (sizeof(uint32_t)*NUM_L_KBCS) << std::endl;
155+
CUDA_CHECK_RETURN(cudaMallocManaged(&kbc_list, sizeof(uint32_t)*NUM_L_KBCS));
156+
std::cout << " kbc_bitmask:" << (sizeof(unsigned int)*NUM_L_KBCS) << std::endl;
157+
CUDA_CHECK_RETURN(cudaMalloc(&kbc_bitmask, kBC_NUM_BUCKETS*sizeof(unsigned int)));
158+
CUDA_CHECK_RETURN(cudaMemset(kbc_bitmask, 0, kBC_NUM_BUCKETS*sizeof(unsigned int)));
159+
std::cout << " expected xs:" << EXPECTED_XS << " size: " << (sizeof(uint32_t)*EXPECTED_XS) << std::endl;
160+
CUDA_CHECK_RETURN(cudaMalloc(&xs, EXPECTED_XS*sizeof(uint32_t)));
161+
std::cout << " chachas:" << EXPECTED_XS << " size: " << (sizeof(uint32_t)*EXPECTED_XS) << std::endl;
162+
CUDA_CHECK_RETURN(cudaMalloc(&chachas, EXPECTED_XS*sizeof(uint32_t)));
163+
CUDA_CHECK_RETURN(cudaMallocManaged(&xs_count, 1024)); // 1024 blocks maybe?
164+
165+
auto compute_only_start = std::chrono::high_resolution_clock::now();
166+
167+
int step = kBC_NUM_BUCKETS / NUM_L_KBCS;
168+
std::cout << "Generating kbc list (step:" << step << ")" << std::endl;
169+
for (int i=0;i<NUM_L_KBCS;i++) {
170+
int value = rand() % kBC_NUM_BUCKETS;//i*step;
171+
//std::cout << " setting kbc " << value << std::endl;
172+
kbc_list[i] = value; // just set distribution but consistent for testing.
173+
}
174+
//std::sort(kbc_list, kbc_list + NUM_L_KBCS);
175+
int duplicates = 0;
176+
int uniques = 0;
177+
for (int i=1;i<NUM_L_KBCS;i++) {
178+
if (kbc_list[i] == kbc_list[i-1]) duplicates++;
179+
else uniques++;
180+
}
181+
std::cout << " num uniques:" << uniques << " duplicates: " << duplicates << std::endl;
182+
183+
std::cout << "setting kbc mask" << std::endl;
184+
int blockSize = 256; // # of threads per block, maximum is 1024.
185+
uint64_t calc_N = NUM_L_KBCS;
186+
uint64_t calc_blockSize = blockSize;
187+
uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize);
188+
int numBlocks = calc_numBlocks;
189+
190+
auto time_start = std::chrono::high_resolution_clock::now();
191+
gpu_set_kbc_bitmask_from_kbc_list<<<numBlocks,blockSize>>>(calc_N, kbc_list, kbc_bitmask);
192+
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
193+
auto time_finish = std::chrono::high_resolution_clock::now();
194+
std::cout << " gpu_chacha8_set_Lxs_into_kbc_bitmask results: " << std::chrono::duration_cast<milli>(time_finish - time_start).count() << " ms\n";
195+
196+
gpu_count_kbc_mask_bits<<<1,1>>>(kbc_bitmask);
197+
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
198+
199+
std::cout << "getting filtered xs/chachas list" << std::endl;
200+
blockSize = 256; // # of threads per block, maximum is 1024.
201+
calc_N = UINT_MAX;
202+
calc_blockSize = blockSize;
203+
calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
204+
numBlocks = calc_numBlocks;
205+
xs_count[0] = 0;
206+
time_start = std::chrono::high_resolution_clock::now();
207+
gpu_chacha8_filter_rxs_by_kbc_bitmask<<<numBlocks,blockSize>>>(calc_N,chacha_input,
208+
kbc_bitmask, xs, chachas, &xs_count[0]);
209+
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
210+
time_finish = std::chrono::high_resolution_clock::now();
211+
std::cout << " gpu_chacha8_filter_rxs_by_kbc_bitmask time: " << std::chrono::duration_cast<milli>(time_finish - time_start).count() << " ms\n";
212+
std::cout << " xs count: " << xs_count[0] << "\n";
213+
214+
215+
auto compute_only_finish = std::chrono::high_resolution_clock::now();
216+
217+
std::cout << "Freeing memory..." << std::endl;
218+
CUDA_CHECK_RETURN(cudaFree(kbc_bitmask));
219+
CUDA_CHECK_RETURN(cudaFree(xs));
220+
CUDA_CHECK_RETURN(cudaFree(chachas));
221+
222+
std::cout << " compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
223+
std::cout << "end." << std::endl;
224+
225+
}
226+
227+
#endif /* ATTACK_METHOD_KBC_LIST_HPP_ */

0 commit comments

Comments
 (0)