Skip to content

Commit 8c397fa

Browse files
author
drnick23
authored
Merge branch 'main' into full-plot
2 parents 10b5a7a + 6cb4804 commit 8c397fa

2 files changed

Lines changed: 28 additions & 5 deletions

File tree

drplotter.cu

Lines changed: 26 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@
3030
#include "phase2.hpp"
3131

3232

33+
3334
// Thread count used for matching (presumably threads per block for the matching kernel launch -- confirm at the launch site).
// Tuning note: of the values recorded in the trailing comment (386, 256, 237), 256 gave the lowest matching time.
const uint16_t THREADS_FOR_MATCHING = 256; // 386 is 10600ms matching. 256 is 9761ms matching. 237 is...10109
3435

3536
int cmd_read = 0;
@@ -54,6 +55,7 @@ char *host_criss_cross_blocks; // aka host_meta_blocks
5455
char *host_refdata_blocks;
5556
char *device_buffer_A;
5657
char *device_buffer_B;
58+
5759
char *device_buffer_C;
5860
char *device_buffer_T3_base;
5961
char *device_buffer_refdata;
@@ -589,11 +591,8 @@ void gpu_find_tx_matches(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L
589591
}
590592
__syncthreads(); // all written initialize data should sync
591593

592-
/*bool printandquit = ((global_kbc_L_bucket_id == 0));
593-
594-
595-
596594

595+
/*bool printandquit = ((global_kbc_L_bucket_id == 0));
597596
if (printandquit) {
598597
if (threadIdx.x == 0) {
599598
@@ -929,8 +928,10 @@ void gpu_find_tx_matches(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L
929928

930929
if ((doPrint >=1) && (threadIdx.x == 0)) {
931930
//if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches);
931+
932932
if ((global_kbc_L_bucket_id % 1000000 == 0) || (global_kbc_L_bucket_id < 10)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches);
933933

934+
934935
}
935936
/*
936937
kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219
@@ -2660,6 +2661,24 @@ void gpu_find_tx_matches_orig(uint16_t table, uint32_t batch_id, uint32_t start_
26602661
} } \
26612662
}
26622663

2664+
//if ((x + i) < 256) { printf("x: %u y:%llu kbc:%u\n", (x+i), y, kbc_bucket_id); }
2665+
/*
 * KBCFILTER(chacha_y, i)
 *
 * Filtered variant of kBC bucketing: the entry for x value (x + i) is stored
 * only when that x value appears in include_xs[0..63].
 *
 * Expands in a scope that must already provide: x, include_xs, kBC,
 * KBC_START, KBC_END, KBC_MAX_ENTRIES_PER_BUCKET, kbc_local_num_entries and
 * kbc_local_entries.
 *
 * Steps:
 *   1. y = (chacha_y << 6) + (x >> 26); the kBC bucket id is y / kBC.
 *   2. For every include_xs[j] equal to (x + i), and only when the bucket id
 *      lies in [KBC_START, KBC_END], a slot is reserved in that bucket with
 *      atomicAdd on kbc_local_num_entries and { x + i, y % kBC } is stored in
 *      kbc_local_entries.
 *
 * Overflow handling: when the reserved slot is >= KBC_MAX_ENTRIES_PER_BUCKET
 * the error is printed and the entry is dropped. The store is skipped because
 * the computed address would fall outside this bucket's slot range. The
 * bucket counter is left incremented, so it can exceed the maximum after an
 * overflow (same as before this guard was added).
 *
 * Macro arguments are parenthesized so compound expressions expand safely.
 * The brace-block form is kept so existing call sites (with or without a
 * trailing semicolon) continue to compile.
 */
#define KBCFILTER(chacha_y,i) \
{ \
	uint64_t y = (((uint64_t) (chacha_y)) << 6) + (x >> 26); \
	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
	for (int j=0;j<64;j++) { \
		if (include_xs[j] == (x+(i))) { printf("including x %u\n", (x+(i))); \
			if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
				uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
				/* atomically reserve the next free slot in this bucket */ \
				int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
				if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { \
					/* bucket is full: report and drop instead of writing out of range */ \
					printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); \
				} else { \
					F1_Bucketed_kBC_Entry entry = { (x+(i)), (uint32_t) (y % kBC) }; \
					uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
					kbc_local_entries[entries_address] = entry; \
				} \
			} \
		} } \
}
2681+
26632682
//if ((x + i) < 256) { printf("x: %u y:%llu kbc:%u\n", (x+i), y, kbc_bucket_id); }
26642683
//if (((x+i) % (1024*1024)) == 0) { printf("x: %u chacha: %u y:%llu kbc:%u\n", (x+i), chacha_y, y, kbc_bucket_id); }
26652684
//if (kbc_bucket_id == 0) { printf("x: %u chacha: %u y:%llu kbc:%u\n", (x+i), chacha_y, y, kbc_bucket_id); }
@@ -3451,7 +3470,9 @@ void doTxBatch(uint16_t table, uint32_t batch_id) {
34513470
transferBucketedBlocksFromDeviceToHost(table, batch_id, device_buffer_B, transfer_out_size, device_buffer_refdata, device_block_entry_counts);
34523471
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
34533472
finish = std::chrono::high_resolution_clock::now();
3473+
34543474
table_transfer_out_time_ms += std::chrono::duration_cast<milli>(finish - start).count();
3475+
34553476
//std::cout << " done. " << std::chrono::duration_cast<milli>(finish - start).count() << " ms\n";
34563477
//} else if (table == 6) {
34573478
// TODO: handle final T6 file...maybe this can write into hostmem instead of to file.
@@ -3553,6 +3574,7 @@ void setupMemory() {
35533574
std::cout << " device_buffer_B " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl;
35543575
CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_B, DEVICE_BUFFER_ALLOCATED_BYTES));
35553576

3577+
35563578
std::cout << " device_buffer_C " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl;
35573579
CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_C, DEVICE_BUFFER_ALLOCATED_BYTES));
35583580

@@ -3618,7 +3640,6 @@ int main(int argc, char *argv[])
36183640
auto total_start = std::chrono::high_resolution_clock::now();
36193641
doT1();
36203642
doTx(2);
3621-
//return;
36223643
doTx(3);
36233644
doTx(4);
36243645
doTx(5);

nick_globals.hpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,9 @@ const uint64_t HOST_MAX_BLOCK_ENTRIES = 1114112;//1114112; // MUST be multiple o
2828
const uint64_t HOST_ALLOCATED_ENTRIES = HOST_MAX_BLOCK_ENTRIES * BATCHES * BATCHES;
2929
const uint64_t HOST_ALLOCATED_BYTES = HOST_UNIT_BYTES * HOST_ALLOCATED_ENTRIES;
3030

31+
3132
// Bytes budgeted per entry when sizing the device buffers (multiplied by DEVICE_BUFFER_ALLOCATED_ENTRIES below).
// Currently 24, the size of Tx_pairing_chunk_meta4; the commented-out 32 is the size needed with a backref.
const uint64_t DEVICE_BUFFER_UNIT_BYTES = 24;//32; // Tx_pairing_chunk_meta4 is 24 bytes, w/ backref is 32 bytes
33+
3234
// Entry capacity of a device buffer: KBC_MAX_ENTRIES_PER_BUCKET slots for each of the KBC_LOCAL_NUM_BUCKETS buckets.
// The trailing comment preserves earlier sizing formulas that are no longer in effect.
const uint64_t DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_LOCAL_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET; // HOST_MAX_BLOCK_ENTRIES * BATCHES;// DEVICE_BUFFER_ALLOCATED_ENTRIES = 120 * ((uint64_t) 1 << 32) / (100*BATCHES);
3335
// Total byte size of one device buffer (entries * bytes per entry).
// drplotter.cu's setupMemory() passes this value to cudaMalloc for device_buffer_B and device_buffer_C.
const uint64_t DEVICE_BUFFER_ALLOCATED_BYTES = DEVICE_BUFFER_ALLOCATED_ENTRIES * DEVICE_BUFFER_UNIT_BYTES;
3436
const uint64_t BACKREF_UNIT_BYTES = 12; // backref w/y for last table is 12 bytes

0 commit comments

Comments
 (0)