3030#include " phase2.hpp"
3131
3232
33+
3334const uint16_t THREADS_FOR_MATCHING = 256 ; // 386 is 10600ms matching. 256 is 9761ms matching. 237 is...10109
3435
3536int cmd_read = 0 ;
@@ -54,6 +55,7 @@ char *host_criss_cross_blocks; // aka host_meta_blocks
// Global raw buffer pointers shared across the plotting phases.
// NOTE(review): the host_/device_ prefixes presumably denote host vs. GPU memory;
// only the device_buffer_B / device_buffer_C allocations are visible here — confirm the rest in setupMemory().
5455char *host_refdata_blocks;
5556char *device_buffer_A;
// device_buffer_B: allocated with cudaMalloc(DEVICE_BUFFER_ALLOCATED_BYTES) in setupMemory();
// doTxBatch() passes it to transferBucketedBlocksFromDeviceToHost().
5657char *device_buffer_B;
58+
// device_buffer_C: allocated with cudaMalloc(DEVICE_BUFFER_ALLOCATED_BYTES) in setupMemory().
5759char *device_buffer_C;
5860char *device_buffer_T3_base;
// device_buffer_refdata: passed alongside device_buffer_B to transferBucketedBlocksFromDeviceToHost() in doTxBatch().
5961char *device_buffer_refdata;
@@ -589,11 +591,8 @@ void gpu_find_tx_matches(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L
589591 }
590592 __syncthreads (); // all written initialize data should sync
591593
592- /* bool printandquit = ((global_kbc_L_bucket_id == 0));
593-
594-
595-
596594
595+ /* bool printandquit = ((global_kbc_L_bucket_id == 0));
597596 if (printandquit) {
598597 if (threadIdx.x == 0) {
599598
@@ -929,8 +928,10 @@ void gpu_find_tx_matches(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L
929928
930929 if ((doPrint >=1 ) && (threadIdx .x == 0 )) {
931930 // if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches);
931+
932932 if ((global_kbc_L_bucket_id % 1000000 == 0 ) || (global_kbc_L_bucket_id < 10 )) printf (" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n " , global_kbc_L_bucket_id, num_L, num_R, total_matches);
933933
934+
934935 }
935936 /*
936937 kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219
@@ -2660,6 +2661,24 @@ void gpu_find_tx_matches_orig(uint16_t table, uint32_t batch_id, uint32_t start_
26602661 } } \
26612662}
26622663
2664+ // if ((x + i) < 256) { printf("x: %u y:%llu kbc:%u\n", (x+i), y, kbc_bucket_id); }
// KBCFILTER(chacha_y, i)
//
// Statement macro: buckets the value (x + i) into its kBC bucket, but only if
// (x + i) appears in the 64-entry include_xs list.
//
// Parameters:
//   chacha_y - 32-bit value combined with the top bits of x to form y:
//              y = (chacha_y << 6) + (x >> 26).
//   i        - offset added to the base value x.
//
// Names that must be in scope at the expansion site (the macro does not
// declare them): x, kBC, include_xs, KBC_START, KBC_END,
// KBC_MAX_ENTRIES_PER_BUCKET, kbc_local_num_entries, kbc_local_entries,
// F1_Bucketed_kBC_Entry, atomicAdd, printf.
//
// Behavior:
//   - For every j in [0, 64) with include_xs[j] == x + i, prints an
//     "including x" line; duplicates in include_xs therefore insert the value
//     once per occurrence.
//   - If the bucket id (y / kBC) lies in [KBC_START, KBC_END], a slot is
//     reserved with atomicAdd on the bucket's counter and the entry
//     { x + i, y % kBC } is stored at
//     local_bucket * KBC_MAX_ENTRIES_PER_BUCKET + slot.
//   - If the reserved slot is >= KBC_MAX_ENTRIES_PER_BUCKET, the overflow is
//     reported and the entry is dropped. Previously the entry was still
//     written, landing in the next bucket's region or past the end of
//     kbc_local_entries. The counter is still incremented, so an overflow
//     remains detectable by inspecting kbc_local_num_entries afterwards.
//
// The body stays a plain { ... } block (not do/while(0)) so existing call
// sites expand exactly as before.
#define KBCFILTER(chacha_y, i) \
{ \
	uint64_t y = (((uint64_t) (chacha_y)) << 6) + (x >> 26); \
	uint32_t kbc_bucket_id = uint32_t(y / kBC); \
	for (int j = 0; j < 64; j++) { \
		if (include_xs[j] == (x + (i))) { \
			printf(" including x %u\n ", (x + (i))); \
			if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
				uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
				int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id], 1); \
				F1_Bucketed_kBC_Entry entry = { (x + (i)), (uint32_t) (y % kBC) }; \
				if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { \
					/* bucket is full: report and drop instead of writing out of bounds */ \
					printf(" ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); \
				} else { \
					uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
					kbc_local_entries[entries_address] = entry; \
				} \
			} \
		} \
	} \
}
2681+
26632682// if ((x + i) < 256) { printf("x: %u y:%llu kbc:%u\n", (x+i), y, kbc_bucket_id); }
26642683// if (((x+i) % (1024*1024)) == 0) { printf("x: %u chacha: %u y:%llu kbc:%u\n", (x+i), chacha_y, y, kbc_bucket_id); }
26652684// if (kbc_bucket_id == 0) { printf("x: %u chacha: %u y:%llu kbc:%u\n", (x+i), chacha_y, y, kbc_bucket_id); }
@@ -3451,7 +3470,9 @@ void doTxBatch(uint16_t table, uint32_t batch_id) {
34513470 transferBucketedBlocksFromDeviceToHost (table, batch_id, device_buffer_B, transfer_out_size, device_buffer_refdata, device_block_entry_counts);
34523471 CUDA_CHECK_RETURN (cudaDeviceSynchronize ());
34533472 finish = std::chrono::high_resolution_clock::now ();
3473+
34543474 table_transfer_out_time_ms += std::chrono::duration_cast<milli>(finish - start).count ();
3475+
34553476 // std::cout << " done. " << std::chrono::duration_cast<milli>(finish - start).count() << " ms\n";
34563477 // } else if (table == 6) {
34573478 // TODO: handle final T6 file...maybe this can write into hostmem instead of to file.
@@ -3553,6 +3574,7 @@ void setupMemory() {
35533574 std::cout << " device_buffer_B " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << " ) = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl;
35543575 CUDA_CHECK_RETURN (cudaMalloc (&device_buffer_B, DEVICE_BUFFER_ALLOCATED_BYTES ));
35553576
3577+
35563578 std::cout << " device_buffer_C " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << " ) = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl;
35573579 CUDA_CHECK_RETURN (cudaMalloc (&device_buffer_C, DEVICE_BUFFER_ALLOCATED_BYTES ));
35583580
@@ -3618,7 +3640,6 @@ int main(int argc, char *argv[])
36183640 auto total_start = std::chrono::high_resolution_clock::now ();
36193641 doT1 ();
36203642 doTx (2 );
3621- // return;
36223643 doTx (3 );
36233644 doTx (4 );
36243645 doTx (5 );
0 commit comments