diff --git a/include/gemmini.h b/include/gemmini.h
index d958a994..160c8a7b 100644
--- a/include/gemmini.h
+++ b/include/gemmini.h
@@ -19,6 +19,8 @@
 // Accelerator interface
 #include "rocc-software/src/xcustom.h"
 
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
 #define k_CONFIG 0
 #define k_MVIN2 1
 #define k_MVIN 2
@@ -45,6 +47,8 @@
 #define k_LOOP_CONV_WS_CONFIG_5 20
 #define k_LOOP_CONV_WS_CONFIG_6 21
 
+#define k_FENCE 127
+
 #define CONFIG_EX 0
 #define CONFIG_LD 1
 #define CONFIG_ST 2
@@ -182,18 +186,75 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) {
   return un.b;
 }
 
+
 #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) \
   ROCC_INSTRUCTION_0_R_R(x, rs1, rs2, funct)
 
+// flush
+#define gemmini_flush(skip) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, skip, 0, k_FLUSH)
+
+
+#define LABEL(i, f) LABEL1(i, f, __LINE__)
+#define LABEL1(i, f, l) LABEL2(i, f, l)
+#define LABEL2(i, f, l) LABEL_##i##_##f##_##l
+
+// Handle the exception reported by the return code of a gemmini instruction
+// LSB indicates if the page addr is load fault or store fault
+void handle_gemmini_xcpt(uint64_t rd) {
+  char is_st = (uint64_t) rd & 1;
+  char* addr = (char*) (rd & ~1ULL);
+  if (is_st) {
+    printf("[GEMMINI] Store page fault at %p\n", (void*) addr);
+    *addr = 0;
+  } else {
+    printf("[GEMMINI] Load page fault at %p\n", (void*) addr);
+    (void) *(volatile char*) addr;
+  }
+  gemmini_flush(0);
+}
+
+// When GEMMINI_XCPT_DEBUG is set, software will be compiled
+// to check exception status in gemmini. Page faults will attempt
+// to be resolved in the handle_gemmini_xcpt_routine.
+
+// When GEMMINI_XCPT_DEBUG is unset, exception status will not be
+// checked, so software must not cause page faults in gemmini
+#ifdef GEMMINI_XCPT_DEBUG
+#define ROCC_INSTRUCTION_RD_RS1_RS2(x, rs1, rs2, funct) \
+  ROCC_INSTRUCTION_RD_RS1_RS2_labeled(x, rs1, rs2, funct, 0)
+#else
+#define ROCC_INSTRUCTION_RD_RS1_RS2(x, rs1, rs2, funct) \
+  ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct)
+#endif
+
+
+
+// First, issue the instruction, and then check the return code
+// If the return code indicates a pending exception from gemmini, handle it,
+// then attempt to re-issue the instruction,
+// To maintain the illusion of precise restartable exceptions, all gemmini
+// instructions should be in this form (except flush)
+#define ROCC_INSTRUCTION_RD_RS1_RS2_labeled(x, rs1, rs2, funct, l) { \
+    uint64_t gemmini_rd_ = 0; \
+    LABEL(l, funct): \
+    ROCC_INSTRUCTION_R_R_R(x, gemmini_rd_, rs1, rs2, funct); \
+    if (unlikely(gemmini_rd_ != 0)) { \
+      handle_gemmini_xcpt(gemmini_rd_); \
+      goto LABEL(l, funct); \
+    } \
+  }
+
+
 
 // mvin and mvout
 #define gemmini_extended_mvin(dram_addr, spad_addr, cols, rows) \
-  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN)
+  ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN)
 
 #define gemmini_extended_mvin2(dram_addr, spad_addr, cols, rows) \
-  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN2)
+  ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN2)
 
 #define gemmini_extended_mvin3(dram_addr, spad_addr, cols, rows) \
-  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN3)
+  ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, dram_addr,
((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN3) #define gemmini_block_mvin(dram_addr, spad_addr, len) \ gemmini_extended_mvin(dram_addr, spad_addr, (len) * DIM, DIM) @@ -202,17 +263,17 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { gemmini_extended_mvin(dram_addr, spad_addr, DIM, DIM) #define gemmini_extended_mvout(dram_addr, spad_addr, cols, rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT) #define gemmini_mvout(dram_addr, spad_addr) \ gemmini_extended_mvout(dram_addr, spad_addr, DIM, DIM) // compute #define gemmini_extended_compute_preloaded(A, BD, A_cols, A_rows, BD_cols, BD_rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED) #define gemmini_extended_compute_accumulated(A, BD, A_cols, A_rows, BD_cols, BD_rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_ACCUMULATE) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | 
(uint64_t)(BD), k_COMPUTE_ACCUMULATE) #define gemmini_compute_preloaded(A, BD) \ gemmini_extended_compute_preloaded(A, BD, DIM, DIM, DIM, DIM) @@ -222,7 +283,7 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { // preload #define gemmini_extended_preload(BD, C, BD_cols, BD_rows, C_cols, C_rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD) #define gemmini_preload(BD, C) \ gemmini_extended_preload(BD, C, DIM, DIM, DIM, DIM) @@ -230,12 +291,21 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { #define gemmini_preload_zeros(C) \ gemmini_preload(GARBAGE_ADDR, C) +// weight-stationary matmul loop +#define gemmini_loop_ws(I, J, K, pad_I, pad_J, pad_K, A, B, D, C, A_stride, B_stride, D_stride, C_stride, A_transpose, B_transpose, full_C, low_D, ex_accumulate) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_ADDRS_AB) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, D, C, k_LOOP_WS_CONFIG_ADDRS_DC) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, A_stride, B_stride, k_LOOP_WS_CONFIG_STRIDES_AB) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, D_stride, C_stride, k_LOOP_WS_CONFIG_STRIDES_DC) \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) + // config #define gemmini_extended2_config_ex(dataflow, act, 
sys_shift, acc_scale, relu6_shift, A_stride, A_transpose, B_transpose, ocol, row_turn, kdim, stride, channel, row_left, kdim2, weight_double_bank, weight_triple_bank) \ { \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)acc_scale_t_to_acc_scale_t_bits((acc_scale_t)acc_scale) << 32) | ((uint64_t)(A_stride) << 16) | (B_transpose << 9) | (A_transpose << 8) | ((act) << 3) | ((dataflow) << 2) | CONFIG_EX, ((uint64_t)(relu6_shift) << 32) | (sys_shift), k_CONFIG); \ + ROCC_INSTRUCTION_RD_RS1_RS2_labeled(XCUSTOM_ACC, ((uint64_t)acc_scale_t_to_acc_scale_t_bits((acc_scale_t)acc_scale) << 32) | ((uint64_t)(A_stride) << 16) | (B_transpose << 9) | (A_transpose << 8) | ((act) << 3) | ((dataflow) << 2) | CONFIG_EX, ((uint64_t)(relu6_shift) << 32) | (sys_shift), k_CONFIG, 0); \ \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(weight_triple_bank) << 59) | ((uint64_t)(weight_double_bank) << 58) | ((uint64_t)(row_left) << 54) | ((uint64_t)(row_turn) << 42) | CONFIG_IM2COL, ((uint64_t)ocol << 56) | ((uint64_t)kdim2 << 48) | ((uint64_t)kdim << 44) | ((uint64_t)channel << 23) | ((uint64_t)stride << 20), k_CONFIG) \ + ROCC_INSTRUCTION_RD_RS1_RS2_labeled(XCUSTOM_ACC, ((uint64_t)(weight_triple_bank) << 59) | ((uint64_t)(weight_double_bank) << 58) | ((uint64_t)(row_left) << 54) | ((uint64_t)(row_turn) << 42) | CONFIG_IM2COL, ((uint64_t)ocol << 56) | ((uint64_t)kdim2 << 48) | ((uint64_t)kdim << 44) | ((uint64_t)channel << 23) | ((uint64_t)stride << 20), k_CONFIG, 1) \ } #define gemmini_extended_config_ex(dataflow, act, sys_shift, acc_scale, relu6_shift, A_stride, A_transpose, B_transpose) \ @@ -245,7 +315,7 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { gemmini_extended_config_ex(dataflow, act, sys_shift, acc_scale, relu6_shift, 1, 0, 0) #define gemmini_extended4_config_ld(stride, scale, shrunk, block_mvin_stride, id) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(scale_t_to_scale_t_bits(scale)) << 32) | ((uint64_t)(block_mvin_stride) << 16) | ((id) << 
3) | ((shrunk) << 2) | CONFIG_LD, stride, k_CONFIG) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(scale_t_to_scale_t_bits(scale)) << 32) | ((uint64_t)(block_mvin_stride) << 16) | ((id) << 3) | ((shrunk) << 2) | CONFIG_LD, stride, k_CONFIG) #define gemmini_extended3_config_ld(stride, scale, shrunk, id) \ gemmini_extended4_config_ld(stride, scale, shrunk, DIM, id) @@ -260,45 +330,36 @@ acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { gemmini_extended_config_ld(stride, MVIN_SCALE_IDENTITY) #define gemmini_extended_config_st(stride, pool_stride, pool_size, pool_out_dim, porows, pocols, orows, ocols, upad, lpad) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(ocols) << 56) | ((uint64_t)(orows) << 48) | ((uint64_t)(pocols) << 40) | ((uint64_t)(porows) << 32) | ((uint64_t)(pool_out_dim) << 24) | ((uint64_t)(lpad) << 10) | ((uint64_t)(upad) << 8) | ((uint64_t)(pool_size) << 6) | ((uint64_t)(pool_stride) << 4) | CONFIG_ST, stride, k_CONFIG) + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(ocols) << 56) | ((uint64_t)(orows) << 48) | ((uint64_t)(pocols) << 40) | ((uint64_t)(porows) << 32) | ((uint64_t)(pool_out_dim) << 24) | ((uint64_t)(lpad) << 10) | ((uint64_t)(upad) << 8) | ((uint64_t)(pool_size) << 6) | ((uint64_t)(pool_stride) << 4) | CONFIG_ST, stride, k_CONFIG) #define gemmini_config_st(stride) \ gemmini_extended_config_st(stride, 0, 0, 0, 0, 0, 0, 0, 0, 0) -// flush -#define gemmini_flush(skip) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, skip, 0, k_FLUSH) - // fence -#define gemmini_fence() asm volatile("fence") - -// weight-stationary matmul loop -#define gemmini_loop_ws(I, J, K, pad_I, pad_J, pad_K, A, B, D, C, A_stride, B_stride, D_stride, C_stride, A_transpose, B_transpose, full_C, low_D, ex_accumulate) \ - { \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \ - 
ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_ADDRS_AB) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D, C, k_LOOP_WS_CONFIG_ADDRS_DC) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A_stride, B_stride, k_LOOP_WS_CONFIG_STRIDES_AB) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D_stride, C_stride, k_LOOP_WS_CONFIG_STRIDES_DC) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) \ +// First, spin until no instructions are busy in gemmini, with k_FENCE +// Then, use the generic fence +#define gemmini_fence() { \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, 0, 0, k_FENCE); \ + asm volatile("fence"); \ } + // weight-stationary matmul loop #define gemmini_loop_conv_ws(batch_size, in_dim, in_channels, out_channels, out_dim, pool_out_dim, stride, padding, kernel_dim, pool_size, pool_stride, pool_padding, batches, porows, pocols, pochs, krows, kcols, kchs, lpad, rpad, upad, dpad, plpad, prpad, pupad, pdpad, orows, ocols, weights, output, bias, input, no_bias, no_pool) \ { \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(out_channels) << 48) | ((uint64_t)(in_channels) << 32) | ((uint64_t)(in_dim) << 16) | (uint64_t)(batch_size), \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(out_channels) << 48) | ((uint64_t)(in_channels) << 32) | ((uint64_t)(in_dim) << 16) | (uint64_t)(batch_size), \ ((uint64_t)(padding) << 48) | ((uint64_t)(stride) << 32) | ((uint64_t)(pool_out_dim) << 16) | (uint64_t)(out_dim), k_LOOP_CONV_WS_CONFIG_1) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(kernel_dim) << 48) | ((uint64_t)(pool_size) << 32) | ((uint64_t)(pool_stride) << 16) | (uint64_t)(pool_padding), \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(kernel_dim) << 48) | ((uint64_t)(pool_size) << 32) | ((uint64_t)(pool_stride) << 16) | (uint64_t)(pool_padding), \ ((uint64_t)(batches) << 48) | ((uint64_t)(porows) << 32) | ((uint64_t)(pocols) << 16) | (uint64_t)(pochs), 
k_LOOP_CONV_WS_CONFIG_2) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(krows) << 48) | ((uint64_t)(kcols) << 32) | ((uint64_t)(kchs) << 16) | (uint64_t)(lpad), \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(krows) << 48) | ((uint64_t)(kcols) << 32) | ((uint64_t)(kchs) << 16) | (uint64_t)(lpad), \ ((uint64_t)(rpad) << 48) | ((uint64_t)(upad) << 32) | ((uint64_t)(dpad) << 16) | (uint64_t)(plpad), k_LOOP_CONV_WS_CONFIG_3) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(orows) << 48) | ((uint64_t)(prpad) << 32) | ((uint64_t)(pupad) << 16) | (uint64_t)(pdpad), \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(orows) << 48) | ((uint64_t)(prpad) << 32) | ((uint64_t)(pupad) << 16) | (uint64_t)(pdpad), \ ocols, k_LOOP_CONV_WS_CONFIG_4) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, weights, \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, weights, \ output, k_LOOP_CONV_WS_CONFIG_5) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, bias, \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, bias, \ input, k_LOOP_CONV_WS_CONFIG_6) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, no_bias, \ + ROCC_INSTRUCTION_RD_RS1_RS2(XCUSTOM_ACC, no_bias, \ no_pool, k_LOOP_CONV_WS) \ } @@ -1347,7 +1408,7 @@ static void sp_tiled_conv_ds( // Calculate image dimensions const int irows = (orows - 1) * stride + 1; - const int icols = (ocols - 1) * stride + 1;//kcols; + const int icols = (ocols - 1) * stride + 1;//kcols; const int ichs = kchs; const int im2col_height = ocols*orows; @@ -1356,9 +1417,9 @@ static void sp_tiled_conv_ds( const int row_turn = row_left == 0 ? im2col_height/DIM - 1 : im2col_height/DIM; const int double_bank = weight_bank > 1 ? 1 : 0; const int triple_bank = weight_bank > 2 ? 
1 : 0; - + int odims = im2col_height; - + gemmini_extended2_config_ex(WEIGHT_STATIONARY, act, 0, scale, relu6_shift, 1, false, false, ocols, row_turn, 1, stride, kchs, row_left, 1, double_bank, triple_bank); //if want 2 banks for weight, last is 1 const uint32_t A_sp_addr_start = 0; @@ -1391,7 +1452,7 @@ static void sp_tiled_conv_ds( gemmini_config_ld(out_channels*sizeof(elem_t)); for (int och = 0; och < ochs; och += DIM) { const int J = ochs - och > DIM ? DIM : ochs - och; - const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs; + const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs; for (int kch = 0; kch < kchs; kch += DIM) { const int K = kchs - kch > DIM ? DIM : kchs - kch; gemmini_extended_mvin(weights + kch * out_channels + och, @@ -1402,7 +1463,7 @@ static void sp_tiled_conv_ds( } // gemmini_fence(); - int idims = irows*icols; + int idims = irows*icols; int bidims = batches*idims; // mvin input // printf("mvin inputs\n"); @@ -1428,15 +1489,15 @@ int bidims = batches*idims; if(odims > DIM){ //output dimension (row*col) bigger than DIM for (int b = 0; b < batches; b++){ for (int och = 0; och < ochs; och += DIM) { - const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs;// + kch; + const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs;// + kch; const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims; for (int kch = 0; kch < kchs; kch += DIM) { // gemmini_extended_mvin(weights + kch * out_channels + och, // B_sp_addr+kch, // DIM, DIM); - - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; gemmini_extended_preload(B_sp_addr+kch, C_sp_addr+odim, @@ -1445,7 +1506,7 @@ int bidims = batches*idims; } } // if(output!=NULL) gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels + och, C_sp_addr, DIM, 0); - + } } }else{//ds layer @@ -1462,7 +1523,7 @@ int bidims = batches*idims; DIM, DIM); } */ - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*bidims + b*idims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*bidims + b*idims; gemmini_extended_preload(B_sp_addr, C_sp_addr, DIM, DIM, DIM, odims); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, odims, DIM, odims); @@ -1495,7 +1556,7 @@ int bidims = batches*idims; DIM, DIM, DIM, odims); gemmini_extended_compute_preloaded(A_sp_addr+7*bidims, GARBAGE_ADDR, DIM, odims, DIM, odims); kch_bound = kch + 8*DIM; - + } //if kch is not divisible by DIM for (; kch_bound < kchs; kch_bound += DIM) { @@ -1504,16 +1565,16 @@ int bidims = batches*idims; // gemmini_extended_mvin(weights + kch_bound * out_channels + och, // B_sp_addr, // DIM, DIM); - const uint32_t A_sp_addr = A_sp_addr_start + (kch_bound / DIM)*bidims + b*idims; - + const uint32_t A_sp_addr = A_sp_addr_start + (kch_bound / DIM)*bidims + b*idims; + gemmini_extended_preload(B_sp_addr, C_sp_addr, DIM, DIM, DIM, odims); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, odims, DIM, odims); - + } // const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims; // if(output!=NULL) gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels + och, C_sp_addr, DIM, 0); - + } } } @@ -1526,7 +1587,7 @@ int bidims = batches*idims; for(int och = 0; och < ochs; och += DIM){ const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims; gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels + och, C_sp_addr, DIM, 0); - } + } } } @@ -1558,11 +1619,11 @@ static void sp_tiled_conv_dw( const int ocols = pocols * pool_stride + 
pool_size - 1 - plpad - prpad; // Calculate image dimensions const int irows = (orows - 1) * stride + kernel_dim; - const int icols = (ocols - 1) * stride + kernel_dim;//kcols; + const int icols = (ocols - 1) * stride + kernel_dim;//kcols; const int irows_unpadded = irows - upad - dpad; const int icols_unpadded = icols - lpad - rpad; int kchs = 1; - int kdims = kernel_dim * kernel_dim; + int kdims = kernel_dim * kernel_dim; int double_bank = 0;//weight_bank > 1 ? 1 : 0; int triple_bank = 0;//weight_bank > 2 ? 1 : 0; @@ -1579,7 +1640,7 @@ static void sp_tiled_conv_dw( const uint32_t D_sp_addr_start = 1 << (ADDR_LEN - 1); const uint32_t C_sp_addr_start = 3 << (ADDR_LEN - 2); - if (!no_bias && bias != NULL) { + if (!no_bias && bias != NULL) { gemmini_config_ld(0); for (int b = 0; b < batches; b++){ const int J = 1; @@ -1591,7 +1652,7 @@ static void sp_tiled_conv_dw( J, I); } } - } + } if (mvin_weight) { // mvin weights @@ -1599,7 +1660,7 @@ static void sp_tiled_conv_dw( gemmini_config_ld(out_channels * sizeof(elem_t)); for (int krow = 0; krow < kernel_dim; krow++){ const uint32_t B_sp_addr = B_sp_addr_start+ krow*kernel_dim; - + for (int kcol = 0; kcol < kernel_dim; kcol++){ gemmini_extended_mvin(weights + (krow*kernel_dim + kcol) * out_channels, B_sp_addr+kcol, @@ -1607,7 +1668,7 @@ static void sp_tiled_conv_dw( } } } - + // mvin input // printf("mvin inputs\n"); gemmini_config_ld(in_channels * sizeof(elem_t)); @@ -1622,13 +1683,13 @@ static void sp_tiled_conv_dw( int I = icols_unpadded - icol > DIM ? DIM : icols_unpadded - icol; elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + if (icol < 0) { I = -icol > DIM ? DIM : -icol; } else if (icol >= icols_unpadded) { I = icols_unpadded + rpad - icol > DIM ? 
DIM : icols_unpadded + rpad - icol; } - const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; + const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; const int icol_padded = icol + lpad; const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow_padded * icols + icol_padded; if(is_zeros){ @@ -1657,14 +1718,14 @@ static void sp_tiled_conv_dw( } } // gemmini_fence(); - + // gemmini_config_ld(0); for (int b = 0; b < batches; b++){ const int J = 1; //const uint32_t D_sp_addr = D_sp_addr_start + b * odims;// + odim; const uint32_t C_sp_addr_outer = C_sp_addr_start + b * odims;// + odim; - - const uint32_t A_sp_addr = A_sp_addr_start + b*idims; + + const uint32_t A_sp_addr = A_sp_addr_start + b*idims; const int kkdims = kdims; const uint32_t B_sp_addr = B_sp_addr_start; const int K = 1; @@ -1676,11 +1737,11 @@ static void sp_tiled_conv_dw( // J, I); const uint32_t C_sp_addr = C_sp_addr_outer + odim; - for(int kkdim = 0; kkdim < kkdims; kkdim += K){ + for(int kkdim = 0; kkdim < kkdims; kkdim += K){ gemmini_extended_preload(B_sp_addr + kkdim, C_sp_addr, J, K, J, I); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); - + } // gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels, C_sp_addr_outer, J, 0); // } @@ -1700,7 +1761,7 @@ static void sp_tiled_conv_dw( gemmini_extended_mvout(output + (b*out_dim*out_dim + orow*out_dim + ocol) * out_channels, C_sp_addr, 1, I); - } + } } //for first layer @@ -1735,8 +1796,8 @@ static void sp_tiled_conv_first( // Calculate image dimensions const int irows = (orows - 1) * stride + krows; - const int icols = (ocols - 1) * stride + 1;//krows; - int kdims = krows*krows; + const int icols = (ocols - 1) * stride + 1;//krows; + int kdims = krows*krows; const int ichs = kchs*krows; //pack rows (kchs: normal channel number) int double_bank = weight_bank > 1 ? 
1 : 0; @@ -1788,7 +1849,7 @@ static void sp_tiled_conv_first( J, K); } } - } + } } // mvin input // printf("mvin inputs\n"); @@ -1798,7 +1859,7 @@ static void sp_tiled_conv_first( for (int icol = 0; icol < icols;) { int I = icols - icol > DIM ? DIM : icols- icol; elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow * icols + icol; for (int ich = 0; ich < ichs; ich += DIM) { const int K = ichs - ich > DIM ? DIM : ichs - ich; @@ -1819,13 +1880,13 @@ static void sp_tiled_conv_first( const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims;// + odim; for (int kch = 0; kch < ichs; kch += DIM) { //treat as 3x7=21 channels const int K = ichs - kch > DIM ? DIM : ichs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; const int kkdims = K*krows;//kdims; const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kchs * kdims + kch*krows; for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; - for(int kkdim = 0; kkdim < kkdims; kkdim += K){ + for(int kkdim = 0; kkdim < kkdims; kkdim += K){ gemmini_extended_preload(B_sp_addr + kkdim, C_sp_addr+odim, J, K, J, I); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); @@ -1840,7 +1901,7 @@ static void sp_tiled_conv_first( C_sp_addr, J, 0); } -*/ +*/ } } @@ -1892,11 +1953,11 @@ static void sp_tiled_conv_ws_original( // Calculate image dimensions const int irows = (orows - 1) * stride + krows; - const int icols = (ocols - 1) * stride + krows;//kcols; + const int icols = (ocols - 1) * stride + krows;//kcols; const int irows_unpadded = irows - upad - dpad; const int icols_unpadded = icols - lpad - rpad; const int ichs = kchs; - int kdims = krows*krows; + int kdims = krows*krows; int idims = irows*icols; int bidims = batches*irows*icols; @@ -1942,7 +2003,7 @@ static void sp_tiled_conv_ws_original( const int K = kchs - kch > DIM ? DIM : kchs - kch; for (int krow = 0; krow < krows; krow++){ const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims + krow*krows*K; - + for (int kcol = 0; kcol < krows; kcol++){ gemmini_extended_mvin(weights + (krow*kernel_dim*in_channels + kcol*in_channels + kch) * out_channels + och, B_sp_addr+kcol*K, @@ -1954,7 +2015,7 @@ static void sp_tiled_conv_ws_original( // mvin input // printf("mvin inputs\n"); gemmini_config_ld(in_channels * sizeof(elem_t)); - static elem_t zeros[MAX_BYTES / sizeof(elem_t)] = {0}; + static elem_t zeros[MAX_BYTES / sizeof(elem_t)] = {0}; // gemmini_fence(); // TODO fix ROB to get rid of this requirement for (int b = 0; b < batches; b++) { for (int irow = -upad; irow < irows_unpadded + dpad; irow++) { @@ -1963,17 +2024,17 @@ static void sp_tiled_conv_ws_original( for (int icol = -lpad; icol < icols_unpadded + rpad;) { int I = icols_unpadded - icol > DIM ? 
DIM : icols_unpadded - icol; const elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + if (icol < 0) { I = -icol > DIM ? DIM : -icol; } else if (icol >= icols_unpadded) { I = icols_unpadded + rpad - icol > DIM ? DIM : icols_unpadded + rpad - icol; } - const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; + const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; const int icol_padded = icol + lpad; const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow_padded * icols + icol_padded; if(is_zeros){ - gemmini_config_ld(0); + gemmini_config_ld(0); for (int ich = 0; ich < ichs; ich += DIM) { const int K = ichs - ich > DIM ? DIM : ichs - ich; in = &zeros[0]; @@ -2001,11 +2062,11 @@ static void sp_tiled_conv_ws_original( const int J = ochs - och > DIM ? DIM : ochs - och; for (int kch = 0; kch < kchs; kch += DIM) { const int K = kchs - kch > DIM ? DIM : kchs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; - const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims; for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; - const int C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims + odim; + const int C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims + odim; for(int kkdim = 0; kkdim < K*kdims; kkdim += K){ gemmini_extended_preload(B_sp_addr+kkdim, C_sp_addr, J, K, J, I); @@ -2032,7 +2093,7 @@ static void sp_tiled_conv_ws_original( C_sp_addr, J, I); } - + } } else { @@ -2086,11 +2147,11 @@ static void sp_tiled_conv_ws_original_first( // Calculate image dimensions const int irows = (orows - 1) * stride + krows; - const int icols = (ocols - 1) * stride + krows;//kcols; + const int icols = (ocols - 1) * stride + krows;//kcols; const int irows_unpadded = irows - upad - dpad; const int icols_unpadded = icols - lpad - rpad; const int ichs = kchs; - int kdims = krows*krows; + int kdims = krows*krows; int idims = irows*icols; int bidims = batches*irows*icols; @@ -2127,7 +2188,7 @@ int bidims = batches*irows*icols; // mvin input // printf("mvin inputs\n"); gemmini_config_ld(in_channels * sizeof(elem_t)); - static elem_t zeros[MAX_BYTES / sizeof(elem_t)] = {0}; + static elem_t zeros[MAX_BYTES / sizeof(elem_t)] = {0}; for (int b = 0; b < batches; b++) { for (int irow = -upad; irow < irows_unpadded + dpad; irow++) { const int irow_padded = irow + upad; @@ -2135,17 +2196,17 @@ int bidims = batches*irows*icols; for (int icol = -lpad; icol < icols_unpadded + rpad;) { int I = icols_unpadded - icol > DIM ? DIM : icols_unpadded - icol; elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + if (icol < 0) { I = -icol > DIM ? DIM : -icol; } else if (icol >= icols_unpadded) { I = icols_unpadded + rpad - icol > DIM ? 
DIM : icols_unpadded + rpad - icol; } - const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; + const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; const int icol_padded = icol + lpad; const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow_padded * icols + icol_padded; if(is_zeros){ - gemmini_config_ld(0); + gemmini_config_ld(0); in = &zeros[0]; gemmini_extended_mvin(in, A_sp_addr, @@ -2170,14 +2231,14 @@ int bidims = batches*irows*icols; const int K = kchs;//kchs - kch > DIM ? DIM : kchs - kch; for (int krow = 0; krow < krows; krow++){ const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + krow*krows*K; - + for (int kcol = 0; kcol < krows; kcol++){ gemmini_extended_mvin(weights + (krow*kernel_dim*in_channels + kcol*in_channels) * out_channels + och, B_sp_addr+kcol*K, J, K); } } - + } } @@ -2186,10 +2247,10 @@ int bidims = batches*irows*icols; for (int b = 0; b < batches; b++){ for (int och = 0; och < ochs; och += DIM) { const int J = ochs - och > DIM ? DIM : ochs - och; - const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims;// + odim; + const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims;// + odim; const int K = kchs;// - kch > DIM ? DIM : kchs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + b*idims; - const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs;// + kch*kdims; + const uint32_t A_sp_addr = A_sp_addr_start + b*idims; + const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs;// + kch*kdims; for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; for(int kkdim = 0; kkdim < K*kdims; kkdim += K){ @@ -2198,13 +2259,13 @@ int bidims = batches*irows*icols; gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); } } -/* - //attempt to merge matmul and mvout +/* + //attempt to merge matmul and mvout elem_t * pout = output + (b * pool_out_dim * pool_out_dim)*out_channels + och; gemmini_extended_mvout(pout, C_sp_addr, J, 0); -*/ +*/ } } @@ -3057,9 +3118,9 @@ static void tiled_conv_first( //for (int kch = 0; kch < in_channels; kch += kchs) { elem_t * out = output + (b*pool_out_dim*pool_out_dim + porow*pool_out_dim + pocol) * out_channels + poch; - + acc_t * bias_ = bias + poch; - + const int batches_ = batch_size - b > batches ? batches : batch_size - b; const int porows_ = pool_out_dim - porow > porows ? porows : pool_out_dim - porow; const int pocols_ = pool_out_dim - pocol > pocols ? pocols : pool_out_dim - pocol; @@ -3097,7 +3158,7 @@ static void tiled_conv_first( lpad, rpad, upad, dpad, plpad, prpad, pupad, pdpad, - input + (b*in_dim*in_dim + (irow+upad)*in_dim + (icol+lpad)) * in_channels,// + kch, + input + (b*in_dim*in_dim + (irow+upad)*in_dim + (icol+lpad)) * in_channels,// + kch, weights + poch, //weights + (krow*kernel_dim*in_channels + kcol*in_channels + kch) * out_channels + poch, out, @@ -3106,7 +3167,7 @@ static void tiled_conv_first( act, scale, relu6_shift, no_bias, no_pool, mvin_weight, weight_bank); - + } } } @@ -3128,9 +3189,9 @@ static void tiled_conv_first( const int ocol_floored = ocol < 0 ? 0 : ocol; const int icol = ocol_floored * stride - padding; //+ kcol - padding; elem_t * out = output + (b*pool_out_dim*pool_out_dim + porow*pool_out_dim + pocol) * out_channels + poch; - + acc_t * bias_ = bias + poch; - + const int batches_ = batch_size - b > batches ? batches : batch_size - b; const int porows_ = pool_out_dim - porow > porows ? 
porows : pool_out_dim - porow; @@ -3211,11 +3272,11 @@ static void sp_tiled_conv_ws( // Calculate image dimensions const int irows = (orows - 1) * stride + krows; - const int icols = (ocols - 1) * stride + krows;//kcols; + const int icols = (ocols - 1) * stride + krows;//kcols; const int irows_unpadded = irows - upad - dpad; const int icols_unpadded = icols - lpad - rpad; const int ichs = kchs; - int kdims = krows*krows; + int kdims = krows*krows; int double_bank = weight_bank > 1 ? 1 : 0; int triple_bank = weight_bank > 2 ? 1 : 0; @@ -3270,13 +3331,13 @@ static void sp_tiled_conv_ws( for (int icol = -lpad; icol < icols_unpadded + rpad;) { int I = icols_unpadded - icol > DIM ? DIM : icols_unpadded - icol; const elem_t * in = input + (b*in_dim*in_dim + irow*in_dim + icol) * in_channels;// + ich; - + if (icol < 0) { I = -icol > DIM ? DIM : -icol; } else if (icol >= icols_unpadded) { I = icols_unpadded + rpad - icol > DIM ? DIM : icols_unpadded + rpad - icol; } - const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; + const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded; const int icol_padded = icol + lpad; const uint32_t A_sp_addr = A_sp_addr_start + b * idims + irow_padded * icols + icol_padded; if(is_zeros){ @@ -3313,15 +3374,15 @@ static void sp_tiled_conv_ws( for (int kch = 0; kch < kchs; kch += DIM) { const int K = kchs - kch > DIM ? DIM : kchs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; const int kkdims = K*kdims; const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims; - + for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? 
DIM : odims - odim; const uint32_t C_sp_addr = C_sp_addr_outer + odim; - for(int kkdim = 0; kkdim < kkdims; kkdim += K){ + for(int kkdim = 0; kkdim < kkdims; kkdim += K){ gemmini_extended_preload(B_sp_addr + kkdim, C_sp_addr, J, K, J, I); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); @@ -3348,7 +3409,7 @@ static void sp_tiled_conv_ws( C_sp_addr, J, I); } - + } } else { @@ -3378,10 +3439,10 @@ static void sp_tiled_conv_ws( for (int kch = 0; kch < kchs; kch += DIM) { const int K = kchs - kch > DIM ? DIM : kchs - kch; - const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; + const uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM)*batches*idims + b*idims; const int kkdims = K*kdims; const uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * kdims * kchs + kch*kdims; - + for(int odim = 0; odim < odims; odim += DIM){ //both dimension at the same time const int I = odims - odim > DIM ? DIM : odims - odim; if(kch == 0) @@ -3390,7 +3451,7 @@ static void sp_tiled_conv_ws( J, I); const uint32_t C_sp_addr = C_sp_addr_outer + odim; - for(int kkdim = 0; kkdim < kkdims; kkdim += K){ + for(int kkdim = 0; kkdim < kkdims; kkdim += K){ gemmini_extended_preload(B_sp_addr + kkdim, C_sp_addr, J, K, J, I); gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I); @@ -3453,7 +3514,7 @@ static void sp_tiled_conv_ws( const int J = ochs - och > DIM ? 
DIM : ochs - och; const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * odims + b * odims; gemmini_extended_mvout(output + (b * out_dim * out_dim)*out_channels + och, C_sp_addr, J, 0); - } + } //} */ @@ -3622,7 +3683,7 @@ static void tiled_conv_original( act, scale, relu6_shift, no_bias, no_pool, weight_bank); - + else //downsampling layer sp_tiled_conv_ds( batch_size, in_dim, in_channels, @@ -3638,14 +3699,14 @@ static void tiled_conv_original( plpad, prpad, pupad, pdpad, input + (b*in_dim*in_dim + (irow+upad)*in_dim + (icol+lpad)) * in_channels + kch, - 0, + 0, weights + kch * out_channels + poch, //weights + (krow*kernel_dim*in_channels + kcol*in_channels + kch) * out_channels + poch, out, bias_, act, scale, relu6_shift, - no_bias, no_pool, + no_bias, no_pool, weight_bank); } } @@ -3735,7 +3796,7 @@ static void tiled_conv( #endif int kdims = kcols*kcols; const uint32_t B_sp_addr_start = (BANK_NUM - weight_bank) * BANK_ROWS; - + const int pool_out_dim = (out_dim + 2*pool_padding - pool_size) / pool_stride + 1; for (int poch = 0; poch < out_channels; poch += pochs) { @@ -3760,7 +3821,7 @@ static void tiled_conv( } } } - } + } for (int b = 0; b < batch_size; b += batches) { for (int porow = 0; porow < pool_out_dim; porow += porows) { const int orow = porow * pool_stride - pool_padding; @@ -3774,7 +3835,7 @@ static void tiled_conv( const int irow = orow_floored * stride - padding;//+ krow - padding; const int ocol_floored = ocol < 0 ? 
0 : ocol; const int icol = ocol_floored * stride - padding; //+ kcol - padding; - + const int ocols_ = pocols_ * pool_stride + pool_size - 1; const int orows_ = porows_ * pool_stride + pool_size - 1; @@ -4004,10 +4065,10 @@ static void tiled_conv_auto_dw( args[1]--; args[2]--; }else{ - args[0]--; + args[0]--; } - acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } int batches = args[0]; @@ -4042,7 +4103,7 @@ static void tiled_conv_auto_dw( act, scale, relu6_shift, pool_size, no_pool ? 0 : pool_stride, pool_padding, - + tiled_conv_type); } @@ -4060,13 +4121,13 @@ static void tiled_conv_auto_original( int act, acc_scale_t scale, size_t relu6_shift, int pool_size, int pool_stride, int pool_padding, - + enum tiled_matmul_type_t tiled_conv_type) { const int weight_bank = in_channels > 500? 
3 : 2; const bool no_pool = pool_stride == 0 || (pool_stride == 1 && pool_size == 1 && pool_padding == 0); const bool no_1d = no_pool; //Todo: change to 1d - + if (no_pool) { pool_size = 1; pool_stride = 1; @@ -4117,7 +4178,7 @@ static void tiled_conv_auto_original( } else args[0]--; } - acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } // printf("batch: %d, out_dim: %d, out_channel: %d, in_channel: %d \n", args[0], args[1], args[3], args[4]); int spad_rows_input = tiled_conv_total_spad_rows(false, false, @@ -4126,7 +4187,7 @@ static void tiled_conv_auto_original( while(spad_rows_input > BANK_ROWS*(BANK_NUM-weight_bank)){// tile input last - //batch, input dimension, input channel + //batch, input dimension, input channel int max_val = -1; int max_idx = -1; for(int i = 0; i < 5; i++){ @@ -4146,7 +4207,7 @@ static void tiled_conv_auto_original( } else args[max_idx]--; - spad_rows_input = tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + spad_rows_input = tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } @@ -4247,14 +4308,14 @@ static void tiled_conv_auto_largeC( } args[max_idx]--; - acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } // printf("batch: %d, out_dim: %d, out_channel: %d, in_channel: %d \n", args[0], args[1], args[3], args[4]); int spad_rows_input = 
tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); while(spad_rows_input > BANK_ROWS*(BANK_NUM-weight_bank)){// tile input last - //batch, input dimension + //batch, input dimension int max_val = -1; int max_idx = -1; for(int i = 0; i < 3; i++){ @@ -4265,7 +4326,7 @@ static void tiled_conv_auto_largeC( } args[max_idx]--; - spad_rows_input = tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + spad_rows_input = tiled_conv_total_spad_rows(false, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } int batches = args[0]; @@ -4331,7 +4392,7 @@ static void tiled_conv_auto( int args[] = {batch_size, pool_out_dim, pool_out_dim, out_channels, in_channels}; int och_floor = (args[3]/DIM) + 1; - + int spad_rows_weight = tiled_conv_total_spad_rows(false, true, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); while(spad_rows_weight > BANK_ROWS * weight_bank){ //tile weight first (allocate bank3 to weight) @@ -4341,7 +4402,7 @@ static void tiled_conv_auto( spad_rows_weight = tiled_conv_total_spad_rows(false, true, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } - + int acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); @@ -4350,7 +4411,7 @@ static void tiled_conv_auto( args[1]--; args[2]--; - acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); + acc_rows = tiled_conv_total_spad_rows(true, false, stride, args[0], args[1], args[2], args[3], kernel_dim, kernel_dim, args[4], pool_size, pool_stride); } // printf("batch: %d, out_dim: %d, 
out_channel: %d, in_channel: %d \n", args[0], args[1], args[3], args[4]); @@ -4386,7 +4447,7 @@ static void tiled_conv_auto( act, scale, relu6_shift, pool_size, no_pool ? 0 : pool_stride, pool_padding, - + weight_bank, tiled_conv_type); } @@ -4564,4 +4625,3 @@ static void tiled_resadd_auto(const size_t I, const size_t J, #undef abs #endif // SRC_MAIN_C_GEMMINI_H -