diff --git a/bareMetalC/Makefile b/bareMetalC/Makefile index e0ca5e1d..5b72482a 100644 --- a/bareMetalC/Makefile +++ b/bareMetalC/Makefile @@ -19,6 +19,10 @@ tests = \ mvin_scale \ conv \ conv_rect \ + conv_rect_kernel \ + conv_rect_kernel_trans_weight_0132 \ + conv_rect_kernel_trans_weight_1203 \ + conv_rect_kernel_and_input \ conv_with_pool \ conv_with_rot180 \ conv_with_kernel_dilation \ diff --git a/bareMetalC/conv.c b/bareMetalC/conv.c index 4ace514a..fccc78a2 100644 --- a/bareMetalC/conv.c +++ b/bareMetalC/conv.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -38,7 +39,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -46,28 +48,28 @@ #define NO_BIAS false -#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int padding, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != 
(in_row_dim + 2 * padding - kernel_dim) / stride + 1) { + if (out_row_dim != (in_row_dim + 2 * padding - kernel_row_dim) / stride + 1) { printf("conv out_row_dim is not correct\n"); exit(1); } - if (out_col_dim != (in_col_dim + 2 * padding - kernel_dim) / stride + 1) { + if (out_col_dim != (in_col_dim + 2 * padding - kernel_col_dim) / stride + 1) { printf("conv out_col_dim is not correct\n"); exit(1); } @@ -79,8 +81,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -106,18 +108,18 @@ void conv(int batch_size, int in_channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -182,7 +184,7 @@ int main() { printf("Output dimensions (rows by columns): %u by %u\n\n", 
OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -203,7 +205,7 @@ int main() { #ifndef FAST conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, STRIDE, PADDING, input, @@ -218,7 +220,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -228,7 +230,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, false, false, false, false, (elem_t*)input, @@ -271,9 +273,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -287,7 +289,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_dw.c b/bareMetalC/conv_dw.c 
index a0b00217..b0277261 100644 --- a/bareMetalC/conv_dw.c +++ b/bareMetalC/conv_dw.c @@ -8,13 +8,23 @@ #endif #include "include/gemmini_testutils.h" +#define RECT_KERNEL + #ifndef BAREMETAL #define BATCH_SIZE 3 #define IN_ROW_DIM 112 #define IN_COL_DIM 112 #define CHANNELS 17 -#define KERNEL_DIM 3 + +#ifdef RECT_KERNEL +#define KERNEL_ROW_DIM 4 +#define KERNEL_COL_DIM 2 +#else +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 +#endif + #define PADDING 1 #define STRIDE 2 @@ -30,21 +40,31 @@ #define IN_ROW_DIM 17 #define IN_COL_DIM 17 -#define CHANNELS 15 +#define CHANNELS 2 #endif #define BATCH_SIZE 3 -#define KERNEL_DIM 3 + +#ifdef RECT_KERNEL +#define KERNEL_ROW_DIM 4 +#define KERNEL_COL_DIM 2 +#else +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 +#endif + #define PADDING 1 #define STRIDE 2 #endif + + #define NO_BIAS false -#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) bool vec_is_equal(elem_t * a, elem_t * b, int len) { for (int i = 0; i < len; i++) @@ -57,11 +77,11 @@ void init_random(elem_t * buf, int len) { elem_t i = 0; for (elem_t * ptr = buf; ptr < buf + len; ptr++) { // *ptr = (rand() % 32) - 16; -#ifdef FAST +//#ifdef FAST *ptr = 1; -#else - *ptr = (rand() % 5) - 2; -#endif +//#else +// *ptr = (rand() % 5) - 2; +//#endif } } @@ -99,7 +119,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][CHANNELS]; - static elem_t weights[CHANNELS][KERNEL_DIM][KERNEL_DIM]; + static elem_t weights[CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM]; static acc_t bias[CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][CHANNELS]; @@ -120,7 +140,7 @@ int main() { #ifndef FAST tiled_conv_dw_auto(BATCH_SIZE, 
IN_ROW_DIM, IN_COL_DIM, CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, PADDING, KERNEL_DIM, + STRIDE, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, (elem_t*)input, (elem_t*)weights, @@ -140,7 +160,7 @@ int main() { uint64_t start_gemmini = read_cycles(); tiled_conv_dw_auto(BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, PADDING, KERNEL_DIM, + STRIDE, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, (elem_t*)input, (elem_t*)weights, @@ -180,9 +200,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("%d,", weights[och][wrow][wcol]); } printf("\b],\n"); diff --git a/bareMetalC/conv_dw_perf.c b/bareMetalC/conv_dw_perf.c index d516cce5..58243ce3 100644 --- a/bareMetalC/conv_dw_perf.c +++ b/bareMetalC/conv_dw_perf.c @@ -85,7 +85,7 @@ int main (int argc, char * argv[]) { uint64_t start_gemmini = read_cycles(); tiled_conv_dw_auto(BATCH_SIZE, IN_DIM, IN_DIM, CHANNELS, OUT_DIM, OUT_DIM, - STRIDE, PADDING, KERNEL_DIM, + STRIDE, PADDING, KERNEL_DIM, KERNEL_DIM, (elem_t*)input, (elem_t*)weights, diff --git a/bareMetalC/conv_first_layer.c b/bareMetalC/conv_first_layer.c index 47e3c2af..6c884a5d 100644 --- a/bareMetalC/conv_first_layer.c +++ b/bareMetalC/conv_first_layer.c @@ -18,7 +18,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -41,7 +42,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -49,27 +51,27 @@ #define NO_BIAS false -#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_DIM) / 
STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int padding, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != (in_row_dim + 2*padding - kernel_dim) / stride + 1) { + if (out_row_dim != (in_row_dim + 2*padding - kernel_row_dim) / stride + 1) { printf("conv out_row_dim is not correct\n"); exit(1); } - if (out_col_dim != (in_col_dim + 2*padding - kernel_dim) / stride + 1) { + if (out_col_dim != (in_col_dim + 2*padding - kernel_col_dim) / stride + 1) { printf("conv out_col_dim is not correct\n"); exit(1); } @@ -81,8 +83,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -108,18 +110,18 @@ void conv(int batch_size, int in_channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int 
kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -184,7 +186,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -204,7 +206,7 @@ int main() { uint64_t start_cpu = read_cycles(); #ifndef FAST conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, STRIDE, PADDING, input, @@ -219,7 +221,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -229,7 +231,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + 
STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, false, false, false, false, (elem_t*)input, @@ -272,9 +274,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -288,7 +290,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_perf.c b/bareMetalC/conv_perf.c index bacb09c4..a76feefd 100644 --- a/bareMetalC/conv_perf.c +++ b/bareMetalC/conv_perf.c @@ -114,7 +114,7 @@ int main (int argc, char * argv[]) { tiled_conv_auto( BATCH_SIZE, IN_DIM, IN_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_DIM, OUT_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_DIM, KERNEL_DIM, false, false, false, false, false, (elem_t*)input, diff --git a/bareMetalC/conv_rect.c b/bareMetalC/conv_rect.c index 4c5fd898..36d44e9c 100644 --- a/bareMetalC/conv_rect.c +++ b/bareMetalC/conv_rect.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -38,7 +39,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -46,28 +48,28 @@ #define NO_BIAS false -#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - 
KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int padding, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != (in_row_dim + 2 * padding - kernel_dim) / stride + 1) { + if (out_row_dim != (in_row_dim + 2 * padding - kernel_row_dim) / stride + 1) { printf("conv out_row_dim is not correct\n"); exit(1); } - if (out_col_dim != (in_col_dim + 2 * padding - kernel_dim) / stride + 1) { + if (out_col_dim != (in_col_dim + 2 * padding - kernel_col_dim) / stride + 1) { printf("conv out_col_dim is not correct\n"); exit(1); } @@ -79,8 +81,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -106,18 +108,18 @@ void conv(int batch_size, int in_channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int 
out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -182,7 +184,7 @@ int main() { printf("Output dimensions (rows by columns): %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -203,7 +205,7 @@ int main() { #ifndef FAST conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, STRIDE, PADDING, input, @@ -218,7 +220,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -228,7 +230,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, +
STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, false, false, false, false, (elem_t*)input, @@ -271,9 +273,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -287,7 +289,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_rect_kernel.c b/bareMetalC/conv_rect_kernel.c new file mode 100644 index 00000000..953f5303 --- /dev/null +++ b/bareMetalC/conv_rect_kernel.c @@ -0,0 +1,353 @@ +#include <stdint.h> +#include <stddef.h> +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#ifndef BAREMETAL +#include <sys/mman.h> +#endif +#include "include/gemmini_testutils.h" + + +#ifndef BAREMETAL + + #define BATCH_SIZE 4 + #define IN_ROW_DIM 16 + #define IN_COL_DIM 16 + #define IN_CHANNELS 3 + #define OUT_CHANNELS 32 + #define KERNEL_ROW_DIM 4 + #define KERNEL_COL_DIM 3 + #define PADDING 1 + #define STRIDE 2 + +#else + + #ifdef FAST + + #define IN_ROW_DIM 16 + #define IN_COL_DIM 16 + #define IN_CHANNELS 9 + #define OUT_CHANNELS 10 + + #else + + #define IN_ROW_DIM 17 + #define IN_COL_DIM 17 + #define IN_CHANNELS 18 + #define OUT_CHANNELS 19 + + #endif + + #define BATCH_SIZE 1 + #define KERNEL_ROW_DIM 4 + #define KERNEL_COL_DIM 2 + #define PADDING 1 + #define STRIDE 2 + +#endif + +#define NO_BIAS true + +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE +
1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) +#define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) + +void conv(int batch_size, int in_channels, + int in_row_dim, int in_col_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, + int out_row_dim, int out_col_dim, + int stride, int padding, + elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], + acc_t bias[out_channels], + elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { + +#ifdef GEMMINI_ASSERTIONS + if (out_row_dim != (in_row_dim + 2 * padding - kernel_row_dim) / stride + 1) { + printf("conv out_row_dim is not correct\n"); + exit(1); + } + + if (out_col_dim != (in_col_dim + 2 * padding - kernel_col_dim) / stride + 1) { + printf("conv out_col_dim is not correct\n"); + exit(1); + } +#endif + + for (int b = 0; b < batch_size; b++) { + for (int orow = 0; orow < out_row_dim; orow++) { + for (int ocol = 0; ocol < out_col_dim; ocol++) { + for (int och = 0; och < out_channels; och++) { + acc_t result = bias[och]; + + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { + for (int kch = 0; kch < in_channels; kch++) { + int irow = orow * stride + krow - padding; + int icol = ocol * stride + kcol - padding; + + elem_t pixel = irow < 0 || irow >= in_row_dim || + icol < 0 || icol >= in_col_dim ? + 0 : input[b][irow][icol][kch]; + + result += + weights[och][krow][kcol][kch] * + pixel; + } + } + } + + // Clip result + result = result > elem_t_max ? elem_t_max : (result < elem_t_min ? 
elem_t_min : result); + + output[b][orow][ocol][och] = result; + } + } + } + } +} + +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, + int patch_size, + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], + elem_t weights_mat[patch_size][out_channels]) { + + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); + + for (int outc = 0; outc < out_channels; outc++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { + for (int inc = 0; inc < in_channels; inc++) { + int wmatrow = krow * kernel_col_dim * in_channels + + kcol * in_channels + + inc; + + weights_mat[wmatrow][outc] = + weights[outc][krow][kcol][inc]; + } + } + } + } +} + +bool vec_is_equal(elem_t * a, elem_t * b, int len) { + for (int i = 0; i < len; i++) + if (a[i] != b[i]) + return false; + return true; +} + +void init_random(elem_t * buf, int len) { + elem_t i = 0; + for (elem_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_random_acc(acc_t * buf, int len) { + elem_t i = 0; + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_zeros_acc(acc_t * buf, int len) { + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + *ptr = 0; + } +} + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + gemmini_flush(0); + + // assert((in_dim + 2*padding - kernel_dim) % stride == 0); + + printf("Kernel dimensions (rows by columns): %u by %u\n", KERNEL_ROW_DIM, KERNEL_COL_DIM); + printf("Input dimensions (rows by columns): %u by %u\n", IN_ROW_DIM, IN_COL_DIM); + printf("Output dimensions (rows by columns): %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); + + static elem_t 
input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; + static acc_t bias[OUT_CHANNELS]; + static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; + + printf("Randomize inputs...\n"); + init_random(&input[0][0][0][0], sizeof(input) / sizeof(elem_t)); + + printf("Randomize weights...\n"); + init_random(&weights[0][0][0][0], sizeof(weights) / sizeof(elem_t)); + + printf("Randomize bias...\n"); + if (NO_BIAS) + init_zeros_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + else + init_random_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + + printf("CPU conv...\n"); + uint64_t start_cpu = read_cycles(); +#ifndef FAST + conv(BATCH_SIZE, IN_CHANNELS, + IN_ROW_DIM, IN_COL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, + OUT_ROW_DIM, OUT_COL_DIM, + STRIDE, PADDING, + input, + weights, + bias, + output); +#endif + uint64_t end_cpu = read_cycles(); + printf("CPU conv took %llu cycles\n", end_cpu - start_cpu); + + static elem_t weights_mat[PATCH_SIZE][OUT_CHANNELS]; + static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; + + printf("Flatten weights...\n"); + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, + PATCH_SIZE, + weights, + weights_mat); + + printf("Gemmini conv...\n"); + uint64_t start_gemmini = read_cycles(); + tiled_conv_auto( + BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, + OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, + false, false, false, false, false, + + (elem_t*)input, + (elem_t*)weights_mat, + NO_BIAS ?
NULL : (acc_t*)bias, + (elem_t*)output_mat, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + WS); + uint64_t end_gemmini = read_cycles(); + printf("Gemmini conv took %llu cycles\n", end_gemmini - start_gemmini); + + assert(sizeof(output_mat) == sizeof(output)); + +#ifdef FAST + bool success = true; + for (int orow = 0; orow < BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM; orow++) { + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + elem_t v = output_mat[orow][ocol]; + if (v != 21 && v != 31 && v != 46) { + success = false; + break; + } + } + } +#else + bool success = vec_is_equal(&output[0][0][0][0], &output_mat[0][0], sizeof(output) / sizeof(elem_t)); +#endif + + if (!success) { + // return 1; + + printf("bias:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", bias[och]); + } + printf("\b\n\n"); + + printf("weights:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("["); + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { + printf("["); + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", weights[och][wrow][wcol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("weights_mat:\n"); + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { + printf("["); + for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { + printf("%d,", weights_mat[wrow][wcol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + printf("input:\n"); + for (int batch = 0; batch < BATCH_SIZE; batch++) { + printf("["); + for (int irow = 0; irow < IN_ROW_DIM; irow++) { + printf("["); + for (int icol = 0; icol < IN_COL_DIM; icol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", input[batch][irow][icol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output:\n"); + for (int batch = 0; batch < BATCH_SIZE; 
batch++) { + printf("["); + for (int orow = 0; orow < OUT_ROW_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_COL_DIM; ocol++) { + printf("["); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", output[batch][orow][ocol][och]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output_mat:\n"); + for (int orow = 0; orow < BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + printf("%d,", output_mat[orow][ocol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + return 1; + } + + return 0; +} diff --git a/bareMetalC/conv_rect_kernel_and_input.c b/bareMetalC/conv_rect_kernel_and_input.c new file mode 100644 index 00000000..33e05ed5 --- /dev/null +++ b/bareMetalC/conv_rect_kernel_and_input.c @@ -0,0 +1,352 @@ +#include <stdint.h> +#include <stddef.h> +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#ifndef BAREMETAL +#include <sys/mman.h> +#endif +#include "include/gemmini_testutils.h" + +#ifndef BAREMETAL + +#define BATCH_SIZE 4 +#define IN_ROW_DIM 224 +#define IN_COL_DIM 448 +#define IN_CHANNELS 3 +#define OUT_CHANNELS 32 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 4 +#define PADDING 1 +#define STRIDE 2 + +#else + +#ifdef FAST + +#define IN_ROW_DIM 9 +#define IN_COL_DIM 9 +#define IN_CHANNELS 5 +#define OUT_CHANNELS 7 + +#else + +#define IN_ROW_DIM 224 +#define IN_COL_DIM 448 +#define IN_CHANNELS 18 +#define OUT_CHANNELS 19 + +#endif + +#define BATCH_SIZE 2 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 4 +#define PADDING 1 +#define STRIDE 2 + +#endif + +#define NO_BIAS false + +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) +#define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) + +void conv(int batch_size, int in_channels, + int in_row_dim, int in_col_dim, + int out_channels, int
kernel_row_dim, int kernel_col_dim, + int out_row_dim, int out_col_dim, + int stride, int padding, + elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], + acc_t bias[out_channels], + elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { + +#ifdef GEMMINI_ASSERTIONS + if (out_row_dim != (in_row_dim + 2 * padding - kernel_row_dim) / stride + 1) { + printf("conv out_row_dim is not correct\n"); + exit(1); + } + + if (out_col_dim != (in_col_dim + 2 * padding - kernel_col_dim) / stride + 1) { + printf("conv out_col_dim is not correct\n"); + exit(1); + } +#endif + + for (int b = 0; b < batch_size; b++) { + for (int orow = 0; orow < out_row_dim; orow++) { + for (int ocol = 0; ocol < out_col_dim; ocol++) { + for (int och = 0; och < out_channels; och++) { + acc_t result = bias[och]; + + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { + for (int kch = 0; kch < in_channels; kch++) { + int irow = orow * stride + krow - padding; + int icol = ocol * stride + kcol - padding; + + elem_t pixel = irow < 0 || irow >= in_row_dim || + icol < 0 || icol >= in_col_dim ? + 0 : input[b][irow][icol][kch]; + + result += + weights[och][krow][kcol][kch] * + pixel; + } + } + } + + // Clip result + result = result > elem_t_max ? elem_t_max : (result < elem_t_min ? 
elem_t_min : result); + + output[b][orow][ocol][och] = result; + } + } + } + } +} + +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, + int patch_size, + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], + elem_t weights_mat[patch_size][out_channels]) { + + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); + + for (int outc = 0; outc < out_channels; outc++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { + for (int inc = 0; inc < in_channels; inc++) { + int wmatrow = krow * kernel_col_dim * in_channels + + kcol * in_channels + + inc; + + weights_mat[wmatrow][outc] = + weights[outc][krow][kcol][inc]; + } + } + } + } +} + +bool vec_is_equal(elem_t * a, elem_t * b, int len) { + for (int i = 0; i < len; i++) + if (a[i] != b[i]) + return false; + return true; +} + +void init_random(elem_t * buf, int len) { + elem_t i = 0; + for (elem_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_random_acc(acc_t * buf, int len) { + elem_t i = 0; + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_zeros_acc(acc_t * buf, int len) { + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + *ptr = 0; + } +} + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + gemmini_flush(0); + + // assert((in_dim + 2*padding - kernel_dim) % stride == 0); + + printf("Kernel dimensions (rows by columns): %u by %u\n", KERNEL_ROW_DIM, KERNEL_COL_DIM); + printf("Input dimensions (rows by columns): %u by %u\n", IN_ROW_DIM, IN_COL_DIM); + printf("Output dimensions (rows by columns): %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); + + static elem_t 
input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; + static acc_t bias[OUT_CHANNELS]; + static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; + + printf("Randomize inputs...\n"); + init_random(&input[0][0][0][0], sizeof(input) / sizeof(elem_t)); + + printf("Randomize weights...\n"); + init_random(&weights[0][0][0][0], sizeof(weights) / sizeof(elem_t)); + + printf("Randomize bias...\n"); + if (NO_BIAS) + init_zeros_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + else + init_random_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + + printf("CPU conv...\n"); + uint64_t start_cpu = read_cycles(); +#ifndef FAST + conv(BATCH_SIZE, IN_CHANNELS, + IN_ROW_DIM, IN_COL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, + OUT_ROW_DIM, OUT_COL_DIM, + STRIDE, PADDING, + input, + weights, + bias, + output); +#endif + uint64_t end_cpu = read_cycles(); + printf("CPU conv took %llu cycles\n", end_cpu - start_cpu); + + static elem_t weights_mat[PATCH_SIZE][OUT_CHANNELS]; + static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; + + printf("Flatten weights...\n"); + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, + PATCH_SIZE, + weights, + weights_mat); + + printf("Gemmini conv...\n"); + uint64_t start_gemmini = read_cycles(); + tiled_conv_auto( + BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, + OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, + false, false, false, false, false, + + (elem_t*)input, + (elem_t*)weights_mat, + NO_BIAS ?
NULL : (acc_t*)bias, + (elem_t*)output_mat, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + WS); + uint64_t end_gemmini = read_cycles(); + printf("Gemmini conv took %llu cycles\n", end_gemmini - start_gemmini); + + assert(sizeof(output_mat) == sizeof(output)); + +#ifdef FAST + bool success = true; + for (int orow = 0; orow < BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM; orow++) { + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + elem_t v = output_mat[orow][ocol]; + if (v != 21 && v != 31 && v != 46) { + success = false; + break; + } + } + } +#else + bool success = vec_is_equal(&output[0][0][0][0], &output_mat[0][0], sizeof(output) / sizeof(elem_t)); +#endif + + if (!success) { + // return 1; + + printf("bias:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", bias[och]); + } + printf("\b\n\n"); + + printf("weights:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("["); + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { + printf("["); + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", weights[och][wrow][wcol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("weights_mat:\n"); + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { + printf("["); + for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { + printf("%d,", weights_mat[wrow][wcol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + printf("input:\n"); + for (int batch = 0; batch < BATCH_SIZE; batch++) { + printf("["); + for (int irow = 0; irow < IN_ROW_DIM; irow++) { + printf("["); + for (int icol = 0; icol < IN_COL_DIM; icol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", input[batch][irow][icol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output:\n"); + for (int batch = 0; batch < BATCH_SIZE; 
batch++) { + printf("["); + for (int orow = 0; orow < OUT_ROW_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_COL_DIM; ocol++) { + printf("["); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", output[batch][orow][ocol][och]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output_mat:\n"); + for (int orow = 0; orow < BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + printf("%d,", output_mat[orow][ocol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + return 1; + } + + return 0; +} diff --git a/bareMetalC/conv_rect_kernel_trans_weight_0132.c b/bareMetalC/conv_rect_kernel_trans_weight_0132.c new file mode 100644 index 00000000..c79b8291 --- /dev/null +++ b/bareMetalC/conv_rect_kernel_trans_weight_0132.c @@ -0,0 +1,305 @@ +#include +#include +#include +#include +#include +#ifndef BAREMETAL +#include +#endif +#include "include/gemmini_testutils.h" + +#ifndef BAREMETAL + +#define BATCH_SIZE 4 +#define IN_ROW_DIM 224 +#define IN_COL_DIM 224 +#define IN_CHANNELS 17 +#define OUT_CHANNELS 32 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 +#define PADDING 1 +#define STRIDE 2 + +#else + +#ifdef FAST + +#define IN_ROW_DIM 9 +#define IN_COL_DIM 9 +#define IN_CHANNELS 5 +#define OUT_CHANNELS 7 + +#else + +#define IN_ROW_DIM 17 +#define IN_COL_DIM 17 +#define IN_CHANNELS 18 +#define OUT_CHANNELS 19 + +#endif + +#define BATCH_SIZE 2 +#define KERNEL_ROW_DIM 4 +#define KERNEL_COL_DIM 2 +#define PADDING 1 +#define STRIDE 2 + +#endif + +#define NO_BIAS false + +#define TRANS_OUTPUT_1203 false +#define TRANS_WEIGHT_1203 false +#define TRANS_WEIGHT_0132 true + +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) +#define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM 
* OUT_COL_DIM) + +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, + int patch_size, + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], + elem_t weights_mat[patch_size][out_channels]) { + + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); + + for (int outc = 0; outc < out_channels; outc++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { + for (int inc = 0; inc < in_channels; inc++) { + int wmatrow = krow * kernel_col_dim * in_channels + + kcol * in_channels + + inc; + + weights_mat[wmatrow][outc] = + weights[outc][krow][kcol][inc]; + } + } + } + } +} + +bool vec_is_equal(elem_t * a, elem_t * b, int len) { + for (int i = 0; i < len; i++) + if (a[i] != b[i]) + return false; + return true; +} + +void init_random(elem_t * buf, int len) { + elem_t i = 0; + for (elem_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_random_acc(acc_t * buf, int len) { + elem_t i = 0; + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_zeros_acc(acc_t * buf, int len) { + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + *ptr = 0; + } +} + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + gemmini_flush(0); + + // assert((in_dim + 2*padding - kernel_dim) % stride == 0); + + printf("Input dimensions: %u by %u\n", IN_ROW_DIM, IN_COL_DIM); + printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); + + static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; + static acc_t bias[OUT_CHANNELS]; + static elem_t 
output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; + + printf("Randomize inputs...\n"); + init_random(&input[0][0][0][0], sizeof(input) / sizeof(elem_t)); + + printf("Randomize weights...\n"); + init_random(&weights[0][0][0][0], sizeof(weights) / sizeof(elem_t)); + + printf("Randomize bias...\n"); + if (NO_BIAS) + init_zeros_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + else + init_random_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + + static elem_t weights_mat[PATCH_SIZE][OUT_CHANNELS]; + static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; + + printf("Flatten weights...\n"); + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, + PATCH_SIZE, + weights, + weights_mat); + + printf("CPU conv...\n"); + uint64_t start_cpu = read_cycles(); +#ifndef FAST + tiled_conv_auto( + BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, + OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, + false, TRANS_OUTPUT_1203, false, TRANS_WEIGHT_1203, TRANS_WEIGHT_0132, + + (elem_t*)input, + (elem_t*)weights_mat, + NO_BIAS ? NULL : (acc_t*)bias, + (elem_t*)output, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + CPU); +#endif + uint64_t end_cpu = read_cycles(); + printf("CPU conv took %llu cycles\n", end_cpu - start_cpu); + + printf("Gemmini conv...\n"); + uint64_t start_gemmini = read_cycles(); + tiled_conv_auto( + BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, + OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, + false, TRANS_OUTPUT_1203, false, TRANS_WEIGHT_1203, TRANS_WEIGHT_0132, + + (elem_t*)input, + (elem_t*)weights_mat, + NO_BIAS ? 
NULL : (acc_t*)bias, + (elem_t*)output_mat, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + WS); + uint64_t end_gemmini = read_cycles(); + printf("Gemmini conv took %llu cycles\n", end_gemmini - start_gemmini); + + assert(sizeof(output_mat) == sizeof(output)); + +#ifdef FAST + bool success = true; + for (int orow = 0; orow < BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM; orow++) { + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + elem_t v = output_mat[orow][ocol]; + if (v != 21 && v != 31 && v != 46) { + success = false; + break; + } + } + } +#else + bool success = vec_is_equal(&output[0][0][0][0], &output_mat[0][0], sizeof(output) / sizeof(elem_t)); +#endif + + if (!success) { + // return 1; + + printf("bias:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", bias[och]); + } + printf("\b\n\n"); + + printf("weights:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("["); + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { + printf("["); + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", weights[och][wrow][wcol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("weights_mat:\n"); + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { + printf("["); + for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { + printf("%d,", weights_mat[wrow][wcol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + printf("input:\n"); + for (int batch = 0; batch < BATCH_SIZE; batch++) { + printf("["); + for (int irow = 0; irow < IN_ROW_DIM; irow++) { + printf("["); + for (int icol = 0; icol < IN_COL_DIM; icol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", input[batch][irow][icol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output:\n"); + for (int batch = 0; batch < BATCH_SIZE; 
batch++) { + printf("["); + for (int orow = 0; orow < OUT_ROW_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_COL_DIM; ocol++) { + printf("["); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", output[batch][orow][ocol][och]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output_mat:\n"); + for (int orow = 0; orow < BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + printf("%d,", output_mat[orow][ocol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + return 1; + } + + return 0; +} diff --git a/bareMetalC/conv_rect_kernel_trans_weight_1203.c b/bareMetalC/conv_rect_kernel_trans_weight_1203.c new file mode 100644 index 00000000..991c5016 --- /dev/null +++ b/bareMetalC/conv_rect_kernel_trans_weight_1203.c @@ -0,0 +1,304 @@ +#include +#include +#include +#include +#include +#ifndef BAREMETAL +#include +#endif +#include "include/gemmini_testutils.h" + +#ifndef BAREMETAL + +#define BATCH_SIZE 4 +#define IN_ROW_DIM 224 +#define IN_COL_DIM 224 +#define IN_CHANNELS 17 +#define OUT_CHANNELS 32 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 +#define PADDING 1 +#define STRIDE 2 + +#else + +#ifdef FAST + +#define IN_ROW_DIM 9 +#define IN_COL_DIM 9 +#define IN_CHANNELS 5 +#define OUT_CHANNELS 7 + +#else + +#define IN_ROW_DIM 17 +#define IN_COL_DIM 17 +#define IN_CHANNELS 18 +#define OUT_CHANNELS 19 + +#endif + +#define BATCH_SIZE 2 +#define KERNEL_ROW_DIM 4 +#define KERNEL_COL_DIM 2 +#define PADDING 1 +#define STRIDE 2 + +#endif + +#define NO_BIAS false + +#define TRANS_OUTPUT_1203 false +#define TRANS_WEIGHT_1203 true + +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) +#define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) + +void 
flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, + int patch_size, + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], + elem_t weights_mat[patch_size][out_channels]) { + + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); + + for (int outc = 0; outc < out_channels; outc++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { + for (int inc = 0; inc < in_channels; inc++) { + int wmatrow = krow * kernel_col_dim * in_channels + + kcol * in_channels + + inc; + + weights_mat[wmatrow][outc] = + weights[outc][krow][kcol][inc]; + } + } + } + } +} + +bool vec_is_equal(elem_t * a, elem_t * b, int len) { + for (int i = 0; i < len; i++) + if (a[i] != b[i]) + return false; + return true; +} + +void init_random(elem_t * buf, int len) { + elem_t i = 0; + for (elem_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_random_acc(acc_t * buf, int len) { + elem_t i = 0; + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_zeros_acc(acc_t * buf, int len) { + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + *ptr = 0; + } +} + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + gemmini_flush(0); + + // assert((in_dim + 2*padding - kernel_dim) % stride == 0); + + printf("Input dimensions: %u by %u\n", IN_ROW_DIM, IN_COL_DIM); + printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); + + static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; + static acc_t bias[OUT_CHANNELS]; + static elem_t 
output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; + + printf("Randomize inputs...\n"); + init_random(&input[0][0][0][0], sizeof(input) / sizeof(elem_t)); + + printf("Randomize weights...\n"); + init_random(&weights[0][0][0][0], sizeof(weights) / sizeof(elem_t)); + + printf("Randomize bias...\n"); + if (NO_BIAS) + init_zeros_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + else + init_random_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + + static elem_t weights_mat[PATCH_SIZE][OUT_CHANNELS]; + static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; + + printf("Flatten weights...\n"); + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, + PATCH_SIZE, + weights, + weights_mat); + + printf("CPU conv...\n"); + uint64_t start_cpu = read_cycles(); +#ifndef FAST + tiled_conv_auto( + BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, + OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, + false, TRANS_OUTPUT_1203, false, TRANS_WEIGHT_1203, false, + + (elem_t*)input, + (elem_t*)weights_mat, + NO_BIAS ? NULL : (acc_t*)bias, + (elem_t*)output, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + CPU); +#endif + uint64_t end_cpu = read_cycles(); + printf("CPU conv took %llu cycles\n", end_cpu - start_cpu); + + printf("Gemmini conv...\n"); + uint64_t start_gemmini = read_cycles(); + tiled_conv_auto( + BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, + OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, + false, TRANS_OUTPUT_1203, false, TRANS_WEIGHT_1203, false, + + (elem_t*)input, + (elem_t*)weights_mat, + NO_BIAS ? 
NULL : (acc_t*)bias, + (elem_t*)output_mat, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + WS); + uint64_t end_gemmini = read_cycles(); + printf("Gemmini conv took %llu cycles\n", end_gemmini - start_gemmini); + + assert(sizeof(output_mat) == sizeof(output)); + +#ifdef FAST + bool success = true; + for (int orow = 0; orow < BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM; orow++) { + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + elem_t v = output_mat[orow][ocol]; + if (v != 21 && v != 31 && v != 46) { + success = false; + break; + } + } + } +#else + bool success = vec_is_equal(&output[0][0][0][0], &output_mat[0][0], sizeof(output) / sizeof(elem_t)); +#endif + + if (!success) { + // return 1; + + printf("bias:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", bias[och]); + } + printf("\b\n\n"); + + printf("weights:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("["); + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { + printf("["); + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", weights[och][wrow][wcol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("weights_mat:\n"); + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { + printf("["); + for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { + printf("%d,", weights_mat[wrow][wcol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + printf("input:\n"); + for (int batch = 0; batch < BATCH_SIZE; batch++) { + printf("["); + for (int irow = 0; irow < IN_ROW_DIM; irow++) { + printf("["); + for (int icol = 0; icol < IN_COL_DIM; icol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", input[batch][irow][icol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output:\n"); + for (int batch = 0; batch < BATCH_SIZE; 
batch++) { + printf("["); + for (int orow = 0; orow < OUT_ROW_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_COL_DIM; ocol++) { + printf("["); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", output[batch][orow][ocol][och]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output_mat:\n"); + for (int orow = 0; orow < BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + printf("%d,", output_mat[orow][ocol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + return 1; + } + + return 0; +} diff --git a/bareMetalC/conv_trans_input_3120.c b/bareMetalC/conv_trans_input_3120.c index f909682c..22ed8fa1 100644 --- a/bareMetalC/conv_trans_input_3120.c +++ b/bareMetalC/conv_trans_input_3120.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 17 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -38,7 +39,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -51,23 +53,23 @@ #define TRANS_WEIGHT_0132 false #define TRANS_INPUT_3120 true -#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t 
weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -132,7 +134,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS][BATCH_SIZE]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -152,7 +154,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -163,7 +165,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, TRANS_INPUT_3120, TRANS_WEIGHT_1203, TRANS_WEIGHT_0132, (elem_t*)input, @@ -183,7 +185,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, 
PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, TRANS_INPUT_3120, TRANS_WEIGHT_1203, TRANS_WEIGHT_0132, (elem_t*)input, @@ -226,9 +228,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -242,7 +244,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_trans_input_3120_with_kernel_dilation.c b/bareMetalC/conv_trans_input_3120_with_kernel_dilation.c index e6d45f55..3bb40d51 100644 --- a/bareMetalC/conv_trans_input_3120_with_kernel_dilation.c +++ b/bareMetalC/conv_trans_input_3120_with_kernel_dilation.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 17 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 1 #define KERNEL_DILATION 2 @@ -39,7 +40,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 1 #define KERNEL_DILATION 2 @@ -53,23 +55,23 @@ #define TRANS_WEIGHT_0132 false #define TRANS_INPUT_3120 true -#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM 
((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -134,7 +136,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -154,7 +156,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -165,7 +167,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - 
STRIDE, 1, KERNEL_DILATION, PADDING, KERNEL_DIM, + STRIDE, 1, KERNEL_DILATION, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, TRANS_INPUT_3120, TRANS_WEIGHT_1203, TRANS_WEIGHT_0132, (elem_t*)input, @@ -185,7 +187,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, KERNEL_DILATION, PADDING, KERNEL_DIM, + STRIDE, 1, KERNEL_DILATION, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, TRANS_INPUT_3120, TRANS_WEIGHT_1203, TRANS_WEIGHT_0132, (elem_t*)input, @@ -228,9 +230,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -244,7 +246,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_trans_output_1203.c b/bareMetalC/conv_trans_output_1203.c index 652da5e2..36792121 100644 --- a/bareMetalC/conv_trans_output_1203.c +++ b/bareMetalC/conv_trans_output_1203.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 17 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -38,7 +39,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -48,23 +50,23 @@ #define TRANS_OUTPUT_1203 true -#define OUT_ROW_DIM ((IN_ROW_DIM 
+ 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -129,7 +131,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -149,7 +151,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, 
KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -160,7 +162,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, false, false, false, (elem_t*)input, @@ -180,7 +182,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, false, false, false, (elem_t*)input, @@ -223,9 +225,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -239,7 +241,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_trans_weight_0132.c b/bareMetalC/conv_trans_weight_0132.c index b5bb31f6..50b855fd 100644 --- a/bareMetalC/conv_trans_weight_0132.c +++ b/bareMetalC/conv_trans_weight_0132.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 17 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -38,7 +39,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define 
KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -50,23 +52,23 @@ #define TRANS_WEIGHT_1203 false #define TRANS_WEIGHT_0132 true -#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -131,7 +133,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t 
output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -151,7 +153,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -162,7 +164,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, false, TRANS_WEIGHT_1203, TRANS_WEIGHT_0132, (elem_t*)input, @@ -182,7 +184,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, false, TRANS_WEIGHT_1203, TRANS_WEIGHT_0132, (elem_t*)input, @@ -225,9 +227,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -241,7 +243,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_trans_weight_1203.c b/bareMetalC/conv_trans_weight_1203.c index 6d53f944..e0986c6c 100644 --- a/bareMetalC/conv_trans_weight_1203.c +++ b/bareMetalC/conv_trans_weight_1203.c @@ -15,7 +15,8 @@ #define 
IN_COL_DIM 224 #define IN_CHANNELS 17 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -38,7 +39,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -49,23 +51,23 @@ #define TRANS_OUTPUT_1203 false #define TRANS_WEIGHT_1203 true -#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -130,7 +132,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t 
input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -150,7 +152,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -161,7 +163,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, false, TRANS_WEIGHT_1203, false, (elem_t*)input, @@ -181,7 +183,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, TRANS_OUTPUT_1203, false, TRANS_WEIGHT_1203, false, (elem_t*)input, @@ -224,9 +226,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -240,7 +242,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", 
weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_with_input_dilation.c b/bareMetalC/conv_with_input_dilation.c index f667fdff..bd877024 100644 --- a/bareMetalC/conv_with_input_dilation.c +++ b/bareMetalC/conv_with_input_dilation.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 17 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 1 #define INPUT_DILATION 2 @@ -49,7 +50,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 1 #define INPUT_DILATION 2 @@ -60,18 +62,18 @@ #define IN_ROW_DIM_DILATED (IN_ROW_DIM + (INPUT_DILATION - 1)*(IN_ROW_DIM - 1)) #define IN_COL_DIM_DILATED (IN_COL_DIM + (INPUT_DILATION - 1)*(IN_COL_DIM - 1)) -#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int input_dilation, int padding, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { @@ -82,12 +84,12 @@ void conv(int batch_size, int in_channels, static elem_t 
dilated[BATCH_SIZE][IN_ROW_DIM_DILATED][IN_COL_DIM_DILATED][IN_CHANNELS]; #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != (in_row_dim_dilated + 2 * padding - kernel_dim) / stride + 1) { + if (out_row_dim != (in_row_dim_dilated + 2 * padding - kernel_row_dim) / stride + 1) { printf("conv out_row_dim is not correct\n"); printf("out_row_dim\n"); exit(1); } - if (out_col_dim != (in_col_dim_dilated + 2 * padding - kernel_dim) / stride + 1) { + if (out_col_dim != (in_col_dim_dilated + 2 * padding - kernel_col_dim) / stride + 1) { printf("conv out_col_dim is not correct\n"); printf("out_col_dim\n"); exit(1); @@ -115,8 +117,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -142,18 +144,18 @@ void conv(int batch_size, int in_channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * 
kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -218,7 +220,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -238,7 +240,7 @@ int main() { uint64_t start_cpu = read_cycles(); #ifndef FAST conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, STRIDE, INPUT_DILATION, PADDING, input, @@ -253,7 +255,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -263,7 +265,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, INPUT_DILATION, 1, PADDING, KERNEL_DIM, + STRIDE, INPUT_DILATION, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, false, false, false, false, (elem_t*)input, @@ -306,9 +308,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -322,7 +324,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * 
IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_with_input_dilation_and_neg_padding.c b/bareMetalC/conv_with_input_dilation_and_neg_padding.c index 2258618d..c000ccce 100644 --- a/bareMetalC/conv_with_input_dilation_and_neg_padding.c +++ b/bareMetalC/conv_with_input_dilation_and_neg_padding.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 17 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING -1 #define STRIDE 1 #define INPUT_DILATION 2 @@ -39,7 +40,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING -1 #define STRIDE 1 #define INPUT_DILATION 2 @@ -50,18 +52,18 @@ #define IN_ROW_DIM_DILATED (IN_ROW_DIM + (INPUT_DILATION - 1)*(IN_ROW_DIM - 1)) #define IN_COL_DIM_DILATED (IN_COL_DIM + (INPUT_DILATION - 1)*(IN_COL_DIM - 1)) -#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int input_dilation, int padding, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t 
weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { @@ -72,12 +74,12 @@ void conv(int batch_size, int in_channels, static elem_t dilated[BATCH_SIZE][IN_ROW_DIM_DILATED][IN_COL_DIM_DILATED][IN_CHANNELS]; #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != (in_row_dim_dilated + 2*padding - kernel_dim) / stride + 1) { + if (out_row_dim != (in_row_dim_dilated + 2*padding - kernel_row_dim) / stride + 1) { printf("conv out_row_dim is not correct\n"); printf("out_row_dim\n"); exit(1); } - if (out_col_dim != (in_col_dim_dilated + 2*padding - kernel_dim) / stride + 1) { + if (out_col_dim != (in_col_dim_dilated + 2*padding - kernel_col_dim) / stride + 1) { printf("conv out_col_dim is not correct\n"); printf("out_col_dim\n"); exit(1); @@ -105,8 +107,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -132,18 +134,18 @@ void conv(int batch_size, int in_channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { 
- for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -209,7 +211,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -230,7 +232,7 @@ int main() { #ifndef FAST conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, STRIDE, INPUT_DILATION, PADDING, input, @@ -245,7 +247,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -255,7 +257,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, INPUT_DILATION, 1, PADDING, KERNEL_DIM, + STRIDE, INPUT_DILATION, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, false, false, false, false, (elem_t*)input, @@ -299,9 +301,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; 
ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -315,7 +317,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_with_input_dilation_and_rot180.c b/bareMetalC/conv_with_input_dilation_and_rot180.c index c1fbdf3e..a507237b 100644 --- a/bareMetalC/conv_with_input_dilation_and_rot180.c +++ b/bareMetalC/conv_with_input_dilation_and_rot180.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 17 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 1 #define INPUT_DILATION 2 @@ -39,7 +40,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 1 #define INPUT_DILATION 2 @@ -52,18 +54,18 @@ #define IN_ROW_DIM_DILATED (IN_ROW_DIM + (INPUT_DILATION - 1)*(IN_ROW_DIM - 1)) #define IN_COL_DIM_DILATED (IN_COL_DIM + (INPUT_DILATION - 1)*(IN_COL_DIM - 1)) -#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2*PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2*PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2*PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int input_dilation, int 
padding, bool wrot180, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { @@ -73,15 +75,15 @@ void conv(int batch_size, int in_channels, assert(in_col_dim_dilated == IN_COL_DIM_DILATED); static elem_t dilated[BATCH_SIZE][IN_ROW_DIM_DILATED][IN_COL_DIM_DILATED][IN_CHANNELS]; - static elem_t weights_rot180[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights_rot180[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != (in_row_dim_dilated + 2*padding - kernel_dim) / stride + 1) { + if (out_row_dim != (in_row_dim_dilated + 2*padding - kernel_row_dim) / stride + 1) { printf("conv out_row_dim is not correct\n"); printf("out_row_dim\n"); exit(1); } - if (out_col_dim != (in_col_dim_dilated + 2*padding - kernel_dim) / stride + 1) { + if (out_col_dim != (in_col_dim_dilated + 2*padding - kernel_col_dim) / stride + 1) { printf("conv out_col_dim is not correct\n"); printf("out_col_dim\n"); exit(1); @@ -106,11 +108,11 @@ void conv(int batch_size, int in_channels, // Populate weights_rot180 for (int och = 0; och < out_channels; och++) - for (int krow = 0; krow < kernel_dim; krow++) - for (int kcol = 0; kcol < kernel_dim; kcol++) + for (int krow = 0; krow < kernel_row_dim; krow++) + for (int kcol = 0; kcol < kernel_col_dim; kcol++) for (int kch = 0; kch < in_channels; kch++) weights_rot180[och][krow][kcol][kch] = - weights[och][kernel_dim-krow-1][kernel_dim-kcol-1][kch]; + weights[och][kernel_row_dim-krow-1][kernel_col_dim-kcol-1][kch]; for (int b = 0; b < batch_size; b++) { for (int orow = 0; orow < out_row_dim; orow++) { @@ -118,8 +120,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - 
for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -147,18 +149,18 @@ void conv(int batch_size, int in_channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -223,7 +225,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -244,7 +246,7 @@ int main() { #ifndef FAST conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, 
STRIDE, INPUT_DILATION, PADDING, WROT180, input, @@ -259,7 +261,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -269,7 +271,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, INPUT_DILATION, 1, PADDING, KERNEL_DIM, + STRIDE, INPUT_DILATION, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, WROT180, false, false, false, false, (elem_t*)input, @@ -312,9 +314,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -328,7 +330,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_with_kernel_dilation.c b/bareMetalC/conv_with_kernel_dilation.c index 8ffeab75..8761aaf6 100644 --- a/bareMetalC/conv_with_kernel_dilation.c +++ b/bareMetalC/conv_with_kernel_dilation.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 17 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 #define INPUT_DILATION 1 @@ -40,7 +41,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define 
PADDING 1 #define STRIDE 1 #define INPUT_DILATION 1 @@ -52,19 +54,20 @@ #define IN_ROW_DIM_DILATED (IN_ROW_DIM + (INPUT_DILATION - 1)*(IN_ROW_DIM - 1)) #define IN_COL_DIM_DILATED (IN_COL_DIM + (INPUT_DILATION - 1)*(IN_COL_DIM - 1)) -#define KERNEL_DIM_DILATED (KERNEL_DIM + (KERNEL_DILATION - 1)*(KERNEL_DIM - 1)) -#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2*PADDING - KERNEL_DIM_DILATED) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2*PADDING - KERNEL_DIM_DILATED) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define KERNEL_ROW_DIM_DILATED (KERNEL_ROW_DIM + (KERNEL_DILATION - 1)*(KERNEL_ROW_DIM - 1)) +#define KERNEL_COL_DIM_DILATED (KERNEL_COL_DIM + (KERNEL_DILATION - 1)*(KERNEL_COL_DIM - 1)) +#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2*PADDING - KERNEL_ROW_DIM_DILATED) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2*PADDING - KERNEL_COL_DIM_DILATED) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int input_dilation, int kernel_dilation, int padding, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { @@ -74,17 +77,19 @@ void conv(int batch_size, int in_channels, assert(in_col_dim_dilated == IN_COL_DIM_DILATED); static elem_t dilated[BATCH_SIZE][IN_ROW_DIM_DILATED][IN_COL_DIM_DILATED][IN_CHANNELS]; - const size_t kernel_dim_dilated = kernel_dim + (kernel_dilation - 1)*(kernel_dim - 1); - assert(kernel_dim_dilated == KERNEL_DIM_DILATED); - static elem_t 
weights_dilated[OUT_CHANNELS][KERNEL_DIM_DILATED][KERNEL_DIM_DILATED][IN_CHANNELS]; + const size_t kernel_row_dim_dilated = kernel_row_dim + (kernel_dilation - 1)*(kernel_row_dim - 1); + const size_t kernel_col_dim_dilated = kernel_col_dim + (kernel_dilation - 1)*(kernel_col_dim - 1); + assert(kernel_row_dim_dilated == KERNEL_ROW_DIM_DILATED); + assert(kernel_col_dim_dilated == KERNEL_COL_DIM_DILATED); + static elem_t weights_dilated[OUT_CHANNELS][KERNEL_ROW_DIM_DILATED][KERNEL_COL_DIM_DILATED][IN_CHANNELS]; #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != (in_row_dim_dilated + 2 * padding - kernel_dim_dilated) / stride + 1) { + if (out_row_dim != (in_row_dim_dilated + 2 * padding - kernel_row_dim_dilated) / stride + 1) { printf("conv out_row_dim is not correct\n"); printf("out_row_dim\n"); exit(1); } - if (out_col_dim != (in_col_dim_dilated + 2 * padding - kernel_dim_dilated) / stride + 1) { + if (out_col_dim != (in_col_dim_dilated + 2 * padding - kernel_col_dim_dilated) / stride + 1) { printf("conv out_col_dim is not correct\n"); printf("out_col_dim\n"); exit(1); @@ -107,15 +112,15 @@ void conv(int batch_size, int in_channels, } for (int och = 0; och < out_channels; och++) - for (int krow = 0; krow < kernel_dim_dilated; krow++) - for (int kcol = 0; kcol < kernel_dim_dilated; kcol++) + for (int krow = 0; krow < kernel_row_dim_dilated; krow++) + for (int kcol = 0; kcol < kernel_col_dim_dilated; kcol++) for (int kch = 0; kch < in_channels; kch++) weights_dilated[och][krow][kcol][kch] = 0; idx = 0; for (int och = 0; och < out_channels; och++) - for (int krow = 0; krow < kernel_dim_dilated; krow += kernel_dilation) - for (int kcol = 0; kcol < kernel_dim_dilated; kcol += kernel_dilation) + for (int krow = 0; krow < kernel_row_dim_dilated; krow += kernel_dilation) + for (int kcol = 0; kcol < kernel_col_dim_dilated; kcol += kernel_dilation) for (int kch = 0; kch < in_channels; kch++) { weights_dilated[och][krow][kcol][kch] = *((elem_t*)weights + idx); idx++; @@ -127,8 
+132,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - for (int krow = 0; krow < kernel_dim_dilated; krow++) { - for (int kcol = 0; kcol < kernel_dim_dilated; kcol++) { + for (int krow = 0; krow < kernel_row_dim_dilated; krow++) { + for (int kcol = 0; kcol < kernel_col_dim_dilated; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -154,18 +159,18 @@ void conv(int batch_size, int in_channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -230,7 +235,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -251,7 +256,7 @@ int main() { #ifndef FAST 
conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, STRIDE, INPUT_DILATION, KERNEL_DILATION, PADDING, input, @@ -266,7 +271,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -276,7 +281,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, INPUT_DILATION, KERNEL_DILATION, PADDING, KERNEL_DIM, + STRIDE, INPUT_DILATION, KERNEL_DILATION, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, false, false, false, false, (elem_t*)input, @@ -319,9 +324,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -335,7 +340,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_with_pool.c b/bareMetalC/conv_with_pool.c index f896c4a6..60a036aa 100644 --- a/bareMetalC/conv_with_pool.c +++ b/bareMetalC/conv_with_pool.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 32 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -38,7 
+39,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 2 @@ -50,9 +52,9 @@ #define NO_BIAS false -#define OUT_ROW_DIM ((IN_ROW_DIM + 2 * PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM + 2 * PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM + 2 * PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM + 2 * PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) #define POOL_OUT_ROW_DIM ((OUT_ROW_DIM + 2 * POOL_PADDING - POOL_SIZE) / POOL_STRIDE + 1) @@ -66,20 +68,20 @@ void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int padding, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != (in_row_dim + 2 * padding - kernel_dim) / stride + 1) { + if (out_row_dim != (in_row_dim + 2 * padding - kernel_row_dim) / stride + 1) { printf("conv out_row_dim is not correct\n"); exit(1); } - if (out_col_dim != (in_col_dim + 2 * padding - kernel_dim) / stride + 1) { + if (out_col_dim != (in_col_dim + 2 * padding - kernel_col_dim) / stride + 1) { printf("conv out_col_dim is not correct\n"); exit(1); } @@ -91,8 +93,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < 
kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -150,18 +152,18 @@ void pool(int batch_size, int channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -227,7 +229,7 @@ int main() { printf("Pooling output dimensions (rows by columns): %u by %u\n\n", POOL_OUT_ROW_DIM, POOL_OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; static elem_t pool_output[BATCH_SIZE][POOL_OUT_ROW_DIM][POOL_OUT_COL_DIM][OUT_CHANNELS]; @@ -249,7 +251,7 @@ int main() { uint64_t start_cpu = read_cycles(); conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + 
OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, STRIDE, PADDING, input, @@ -277,7 +279,7 @@ int main() { static elem_t pool_output_mat[BATCH_SIZE * POOL_OUT_ROW_DIM * POOL_OUT_COL_DIM][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -288,7 +290,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, 1, 1, PADDING, KERNEL_DIM, + STRIDE, 1, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, false, false, false, false, false, // 1, @@ -335,9 +337,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -351,7 +353,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/conv_with_rot180.c b/bareMetalC/conv_with_rot180.c index ccbec995..e216fa18 100644 --- a/bareMetalC/conv_with_rot180.c +++ b/bareMetalC/conv_with_rot180.c @@ -15,7 +15,8 @@ #define IN_COL_DIM 224 #define IN_CHANNELS 3 #define OUT_CHANNELS 17 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 3 #define PADDING 1 #define STRIDE 1 #define INPUT_DILATION 1 @@ -39,7 +40,8 @@ #endif #define BATCH_SIZE 2 -#define KERNEL_DIM 3 +#define KERNEL_ROW_DIM 3 +#define KERNEL_COL_DIM 
3 #define PADDING 1 #define STRIDE 1 #define INPUT_DILATION 1 @@ -52,18 +54,18 @@ #define IN_ROW_DIM_DILATED (IN_ROW_DIM + (INPUT_DILATION - 1) * (IN_ROW_DIM - 1)) #define IN_COL_DIM_DILATED (IN_COL_DIM + (INPUT_DILATION - 1) * (IN_COL_DIM - 1)) -#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2 * PADDING - KERNEL_DIM) / STRIDE + 1) -#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2 * PADDING - KERNEL_DIM) / STRIDE + 1) -#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define OUT_ROW_DIM ((IN_ROW_DIM_DILATED + 2 * PADDING - KERNEL_ROW_DIM) / STRIDE + 1) +#define OUT_COL_DIM ((IN_COL_DIM_DILATED + 2 * PADDING - KERNEL_COL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS) #define N_PATCHES (BATCH_SIZE * OUT_ROW_DIM * OUT_COL_DIM) void conv(int batch_size, int in_channels, int in_row_dim, int in_col_dim, - int out_channels, int kernel_dim, + int out_channels, int kernel_row_dim, int kernel_col_dim, int out_row_dim, int out_col_dim, int stride, int input_dilation, int padding, bool wrot180, elem_t input[batch_size][in_row_dim][in_col_dim][in_channels], - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], acc_t bias[out_channels], elem_t output[batch_size][out_row_dim][out_col_dim][out_channels]) { @@ -73,15 +75,15 @@ void conv(int batch_size, int in_channels, assert(in_col_dim_dilated == IN_COL_DIM_DILATED); static elem_t dilated[BATCH_SIZE][IN_ROW_DIM_DILATED][IN_COL_DIM_DILATED][IN_CHANNELS]; - static elem_t weights_rot180[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights_rot180[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; #ifdef GEMMINI_ASSERTIONS - if (out_row_dim != (in_row_dim_dilated + 2 * padding - kernel_dim) / stride + 1) { + if (out_row_dim != (in_row_dim_dilated + 2 * padding - kernel_row_dim) / stride + 1) { printf("conv out_row_dim is not correct\n"); printf("out_row_dim\n"); exit(1); 
} - if (out_col_dim != (in_col_dim_dilated + 2 * padding - kernel_dim) / stride + 1) { + if (out_col_dim != (in_col_dim_dilated + 2 * padding - kernel_col_dim) / stride + 1) { printf("conv out_col_dim is not correct\n"); printf("out_col_dim\n"); exit(1); @@ -106,11 +108,11 @@ void conv(int batch_size, int in_channels, // Populate weights_rot180 for (int och = 0; och < out_channels; och++) - for (int krow = 0; krow < kernel_dim; krow++) - for (int kcol = 0; kcol < kernel_dim; kcol++) + for (int krow = 0; krow < kernel_row_dim; krow++) + for (int kcol = 0; kcol < kernel_col_dim; kcol++) for (int kch = 0; kch < in_channels; kch++) weights_rot180[och][krow][kcol][kch] = - weights[och][kernel_dim-krow-1][kernel_dim-kcol-1][kch]; + weights[och][kernel_row_dim-krow-1][kernel_col_dim-kcol-1][kch]; for (int b = 0; b < batch_size; b++) { for (int orow = 0; orow < out_row_dim; orow++) { @@ -118,8 +120,8 @@ void conv(int batch_size, int in_channels, for (int och = 0; och < out_channels; och++) { acc_t result = bias[och]; - for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int kch = 0; kch < in_channels; kch++) { int irow = orow * stride + krow - padding; int icol = ocol * stride + kcol - padding; @@ -147,18 +149,18 @@ void conv(int batch_size, int in_channels, } } -void flatten_weights(int out_channels, int kernel_dim, int in_channels, +void flatten_weights(int out_channels, int kernel_row_dim, int kernel_col_dim, int in_channels, int patch_size, - elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights[out_channels][kernel_row_dim][kernel_col_dim][in_channels], elem_t weights_mat[patch_size][out_channels]) { - assert(patch_size == kernel_dim * kernel_dim * in_channels); + assert(patch_size == kernel_row_dim * kernel_col_dim * in_channels); for (int outc = 0; outc < out_channels; outc++) { - 
for (int krow = 0; krow < kernel_dim; krow++) { - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { for (int inc = 0; inc < in_channels; inc++) { - int wmatrow = krow * kernel_dim * in_channels + + int wmatrow = krow * kernel_col_dim * in_channels + kcol * in_channels + inc; @@ -223,7 +225,7 @@ int main() { printf("Output dimensions: %u by %u\n\n", OUT_ROW_DIM, OUT_COL_DIM); static elem_t input[BATCH_SIZE][IN_ROW_DIM][IN_COL_DIM][IN_CHANNELS]; - static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_ROW_DIM][KERNEL_COL_DIM][IN_CHANNELS]; static acc_t bias[OUT_CHANNELS]; static elem_t output[BATCH_SIZE][OUT_ROW_DIM][OUT_COL_DIM][OUT_CHANNELS]; @@ -244,7 +246,7 @@ int main() { #ifndef FAST conv(BATCH_SIZE, IN_CHANNELS, IN_ROW_DIM, IN_COL_DIM, - OUT_CHANNELS, KERNEL_DIM, + OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, OUT_ROW_DIM, OUT_COL_DIM, STRIDE, INPUT_DILATION, PADDING, WROT180, input, @@ -259,7 +261,7 @@ int main() { static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; printf("Flatten weights...\n"); - flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + flatten_weights(OUT_CHANNELS, KERNEL_ROW_DIM, KERNEL_COL_DIM, IN_CHANNELS, PATCH_SIZE, weights, weights_mat); @@ -269,7 +271,7 @@ int main() { tiled_conv_auto( BATCH_SIZE, IN_ROW_DIM, IN_COL_DIM, IN_CHANNELS, OUT_CHANNELS, OUT_ROW_DIM, OUT_COL_DIM, - STRIDE, INPUT_DILATION, 1, PADDING, KERNEL_DIM, + STRIDE, INPUT_DILATION, 1, PADDING, KERNEL_ROW_DIM, KERNEL_COL_DIM, WROT180, false, false, false, false, (elem_t*)input, @@ -312,9 +314,9 @@ int main() { printf("weights:\n"); for (int och = 0; och < OUT_CHANNELS; och++) { printf("["); - for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM; wrow++) { printf("["); - for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + for (int wcol = 0; wcol < KERNEL_COL_DIM; 
wcol++) { printf("["); for (int ich = 0; ich < IN_CHANNELS; ich++) { printf("%d,", weights[och][wrow][wcol][ich]); @@ -328,7 +330,7 @@ int main() { printf("\b\n\n"); printf("weights_mat:\n"); - for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + for (int wrow = 0; wrow < KERNEL_ROW_DIM * KERNEL_COL_DIM * IN_CHANNELS; wrow++) { printf("["); for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { printf("%d,", weights_mat[wrow][wcol]); diff --git a/bareMetalC/tiled_matmul_scale_ws.c b/bareMetalC/tiled_matmul_scale_ws.c new file mode 100644 index 00000000..1efded63 --- /dev/null +++ b/bareMetalC/tiled_matmul_scale_ws.c @@ -0,0 +1,183 @@ +// See LICENSE for license details. + +#include <stdint.h> +#include <stddef.h> +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#ifndef BAREMETAL +#include <sys/mman.h> +#endif +#include "include/gemmini_testutils.h" + +#define CHECK_RESULT 1 + +#define NO_BIAS 1 +#define FULL_BIAS_WIDTH 1 + +#if FULL_BIAS_WIDTH +typedef acc_t ACC_T; +#else +typedef elem_t ACC_T; +#endif + +#ifndef BAREMETAL +#define MAT_DIM_I 512 +#define MAT_DIM_K 512 +#define MAT_DIM_J 512 +#else +#define MAT_DIM_I 64 +#define MAT_DIM_K 64 +#define MAT_DIM_J 64 +#endif + +float scale = 0.5; + +void print_tile(elem_t* in, int tile_dim) { + for (size_t r = 0; r < tile_dim; r++) { + printf("row starts at: %p\n", in +r*MAT_DIM_J); + for (size_t c = 0; c < tile_dim; c++) { + printf("%d ", *(in +r*MAT_DIM_J + c)); + } + printf("\n"); + } +} + +void full_matmul(elem_t A[MAT_DIM_I][MAT_DIM_K], elem_t B[MAT_DIM_K][MAT_DIM_J], ACC_T D[MAT_DIM_I][MAT_DIM_J], full_t C_full[MAT_DIM_I][MAT_DIM_J]) { + for (size_t r = 0; r < MAT_DIM_I; r++) + for (size_t c = 0; c < MAT_DIM_J; c++) { + C_full[r][c] = D[r][c]; + for (size_t k = 0; k < MAT_DIM_K; k++) + C_full[r][c] += MVIN_SCALE(A[r][k], scale)*MVIN_SCALE(B[k][c], scale); + } +} + +void full_printMatrix(elem_t m[MAT_DIM_I][MAT_DIM_J]) { + for (size_t i = 0; i < MAT_DIM_I; ++i) { + for (size_t j = 0; j < MAT_DIM_J; ++j) + printf("%d ", m[i][j]); + printf("\n"); + } 
+} + +int full_is_equal(elem_t x[MAT_DIM_I][MAT_DIM_J], elem_t y[MAT_DIM_I][MAT_DIM_J]) { + for (size_t i = 0; i < MAT_DIM_I; ++i) + for (size_t j = 0; j < MAT_DIM_J; ++j) + if (x[i][j] != y[i][j]) + return 0; + return 1; +} + +void full_matscale(full_t full[MAT_DIM_I][MAT_DIM_J], elem_t out[MAT_DIM_I][MAT_DIM_J], acc_scale_t scale) { + for (size_t r = 0; r < MAT_DIM_I; r++) + for (size_t c = 0; c < MAT_DIM_J; c++) { + // Scale element + full_t scaled = ACC_SCALE(full[r][c], scale); + + // Saturate and cast element +#ifndef ELEM_T_IS_FLOAT + full_t elem = scaled > elem_t_max ? elem_t_max : (scaled < elem_t_min ? elem_t_min : scaled); + out[r][c] = elem; +#else + out[r][c] = scaled; // TODO should we also saturate when using floats? +#endif + } +} + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + printf("MAT_DIM_I: %d\n", MAT_DIM_I); + printf("MAT_DIM_J: %d\n", MAT_DIM_J); + printf("MAT_DIM_K: %d\n", MAT_DIM_K); + + gemmini_flush(0); + + static elem_t full_A[MAT_DIM_I][MAT_DIM_K] row_align(1); + static elem_t full_B[MAT_DIM_K][MAT_DIM_J] row_align(1); + static elem_t full_C[MAT_DIM_I][MAT_DIM_J] row_align(1); + static ACC_T full_D[MAT_DIM_I][MAT_DIM_J] row_align_acc(1); + + static full_t gold_full[MAT_DIM_I][MAT_DIM_J]; + static elem_t gold[MAT_DIM_I][MAT_DIM_J]; + +#if CHECK_RESULT == 1 +#ifdef FAST +#define RAND 1 +#else +#define RAND rand() +#endif + // printf("Init A\n"); + for (size_t i = 0; i < MAT_DIM_I; ++i) { + for (size_t j = 0; j < MAT_DIM_K; ++j) { + full_A[i][j] = RAND % 2; + } + } + + // printf("Init B\n"); + for (size_t i = 0; i < MAT_DIM_K; ++i) { + for (size_t j = 0; j < MAT_DIM_J; ++j) { + full_B[i][j] = RAND % 2; + } + } + + // printf("Init D\n"); + for (size_t i = 0; i < MAT_DIM_I; ++i) { + for (size_t j = 0; j < MAT_DIM_J; ++j) { + full_D[i][j] = NO_BIAS ? 
0 : RAND % 2; + } + } + + printf("Starting slow CPU matmul\n"); + unsigned long cpu_start = read_cycles(); +#ifdef FAST + for (size_t i = 0; i < MAT_DIM_I; ++i) { + for (size_t j = 0; j < MAT_DIM_J; ++j) { + gold_full[i][j] = MAT_DIM_K + (NO_BIAS ? 0 : (RAND % 2)); + } + } + +#else + full_matmul(full_A, full_B, full_D, gold_full); +#endif + unsigned long cpu_end = read_cycles(); + printf("Cycles taken: %lu\n", cpu_end-cpu_start); + full_matscale(gold_full, gold, ACC_SCALE_IDENTITY); +#endif + + printf("Starting gemmini matmul\n"); + unsigned long start = read_cycles(); + + tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K, + (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)full_C, + MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J, + scale, scale, MVIN_SCALE_IDENTITY, //scale_A, scale_B, scale_D + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, false, + false, false, + false, !FULL_BIAS_WIDTH, + 0, + WS); + + unsigned long end = read_cycles(); + printf("Cycles taken: %lu\n", end-start); + +#if CHECK_RESULT == 1 + if (!full_is_equal(full_C, gold)) { + printf("C:\n"); + full_printMatrix(full_C); + printf("Gold:\n"); + full_printMatrix(gold); + printf("\n"); + + exit(1); + } +#endif + + exit(0); +} + diff --git a/bareMetalC/transposed_conv.c b/bareMetalC/transposed_conv.c new file mode 100644 index 00000000..434fa159 --- /dev/null +++ b/bareMetalC/transposed_conv.c @@ -0,0 +1,122 @@ +#include <stdint.h> +#include <stddef.h> +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#ifndef BAREMETAL +#include <sys/mman.h> +#endif +#include "include/gemmini_testutils.h" + +#define BATCH_SIZE 1 +#define IN_DIM 4 +#define PADDING 0 +#define CHANNELS 1 +#define KERNEL_DIM 2 +#define STRIDE 2 +#define OUTPUT_PADDING 0 + +#define A ((IN_DIM + 2*PADDING - KERNEL_DIM) % STRIDE) +#define ZERO_PAD_TRANS (KERNEL_DIM-PADDING-1) +#define STRETCHED_IN_DIM (IN_DIM + (STRIDE-1)*(IN_DIM-1)) +#define A_PADDED_IN_DIM (STRETCHED_IN_DIM + A) +#define OUT_DIM ((IN_DIM-1)*STRIDE - 2*PADDING + (KERNEL_DIM-1) + 1 + OUTPUT_PADDING) + +#define FAST 
+ +//void transposed_conv_cpu(elem_t In[IN_DIM][IN_DIM][CHANNELS], +// elem_t Wght[CHANNELS][KERNEL_DIM][KERNEL_DIM][CHANNELS], +// elem_t Out[OUT_DIM][OUT_DIM][CHANNELS]) +//{ +// +//} + +void init_random(elem_t * buf, int len, elem_t init) { + elem_t i = 0; + for (elem_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = init; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void dump_matrix(elem_t * buf, int len, const char * filename){ + for (elem_t * ptr = buf; ptr < buf + len; ptr++) { + printf("%d\n", *ptr); + } +} + +int main (int argc, char * argv[]) { + #ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } + #endif + + printf("IN_DIM: %d\n", IN_DIM); + //printf("STRETCH_IN: %d\n", STRETCHED_IN_DIM); + //printf("A: %d\n", A); + //printf("PADDED_IN_DIM: %d\n", A_PADDED_IN_DIM); + printf("KERNEL_DIM: %d\n", KERNEL_DIM); + printf("OUT_DIM: %d\n", OUT_DIM); + + elem_t input[BATCH_SIZE][IN_DIM][IN_DIM][CHANNELS]; + elem_t weights[CHANNELS][KERNEL_DIM][KERNEL_DIM][CHANNELS]; + elem_t output[BATCH_SIZE][OUT_DIM][OUT_DIM][CHANNELS]; + + init_random(&input[0][0][0][0], sizeof(input)/sizeof(elem_t), 1); + init_random(&weights[0][0][0][0], sizeof(weights)/sizeof(elem_t), 1); + init_random(&output[0][0][0][0], sizeof(output)/sizeof(elem_t), -1); + + dump_matrix(&output[0][0][0][0], sizeof(output)/sizeof(elem_t), "test_data.txt"); + dump_matrix(&input[0][0][0][0], sizeof(input)/sizeof(elem_t), "test_data.txt"); + dump_matrix(&weights[0][0][0][0], sizeof(weights)/sizeof(elem_t), "test_data.txt"); + + //for(int bs = 0; bs < BATCH_SIZE; bs++){ + // for(int row = 0; row < IN_DIM; row++){ + // if(row == IN_DIM-1){ + // for(int col = 0; col < IN_DIM; col++){ + // for(int chan = 0; chan < CHANNELS; chan++){ + // input[bs][row][col][chan] = 0; + // } + // } + // } + // else{ + // for(int chan = 0; chan < CHANNELS; chan++){ + // input[bs][row][IN_DIM-1][chan] = 0; + // } + // 
continue; + // } + // } + //} + + gemmini_flush(0); + + printf("Gemmini transposed conv...\n"); + uint64_t start_gemmini = read_cycles(); + + tiled_transposed_conv_auto( + BATCH_SIZE, IN_DIM, CHANNELS, OUT_DIM, + STRIDE, PADDING, KERNEL_DIM, + + (elem_t*)input, + (elem_t*)weights, + (elem_t*)output, + + ACC_SCALE_IDENTITY + ); + + uint64_t end_gemmini = read_cycles(); + printf("Gemmini transposed conv took %llu cycles\n", end_gemmini - start_gemmini); + + dump_matrix(&output[0][0][0][0], sizeof(output)/sizeof(elem_t), "test_data.txt"); + + //printf("Slow CPU transposed conv...\n"); + + return 0; +} + diff --git a/build.sh b/build.sh index 7eaf0169..9f9b3741 100755 --- a/build.sh +++ b/build.sh @@ -14,9 +14,9 @@ fi cd build -if [[ $(which riscv64-unknown-linux-gnu-gcc) ]] ; then - make -j $@ -else +#if [[ $(which riscv64-unknown-linux-gnu-gcc) ]] ; then +# make -j $@ +#else make -j BAREMETAL_ONLY=1 $@ -fi +#fi diff --git a/imagenet/Makefile b/imagenet/Makefile index 798dacd9..3abe9789 100644 --- a/imagenet/Makefile +++ b/imagenet/Makefile @@ -3,6 +3,9 @@ include $(abs_top_srcdir)/Makefrag tests = \ resnet50 \ mobilenet \ + #vgg16 \ + #vgg_ssd \ + #gemmini_fftradnet_dh \ # alexnet \ tests_baremetal = $(tests:=-baremetal) diff --git a/imagenet/mobilenet.c b/imagenet/mobilenet.c index a88292be..dee2d5fc 100644 --- a/imagenet/mobilenet.c +++ b/imagenet/mobilenet.c @@ -97,7 +97,7 @@ int main (int argc, char * argv[]) { conv_1_params.batch_size, conv_1_params.in_row_dim, conv_1_params.in_col_dim, conv_1_params.in_channels, conv_1_params.out_channels, conv_1_params.out_row_dim, conv_1_params.out_col_dim, - conv_1_params.stride, 1, 1, conv_1_params.padding, conv_1_params.kernel_size, + conv_1_params.stride, 1, 1, conv_1_params.padding, conv_1_params.kernel_size, conv_1_params.kernel_size, false, false, false, false, false, (elem_t*)images, (elem_t*)conv_1_w, (acc_t*)conv_1_b, (elem_t*)conv_1_out, @@ -126,7 +126,7 @@ int main (int argc, char * argv[]) { 
conv_dw_2_params.batch_size, conv_dw_2_params.in_row_dim, conv_dw_2_params.in_col_dim, conv_dw_2_params.in_channels, conv_dw_2_params.out_row_dim, conv_dw_2_params.out_col_dim, - conv_dw_2_params.stride, conv_dw_2_params.padding, conv_dw_2_params.kernel_size, + conv_dw_2_params.stride, conv_dw_2_params.padding, conv_dw_2_params.kernel_size, conv_dw_2_params.kernel_size, (elem_t*)conv_1_out, (elem_t*)conv_dw_2_w, (acc_t*)conv_dw_2_b, (elem_t*)conv_dw_2_out, @@ -206,7 +206,7 @@ int main (int argc, char * argv[]) { conv_dw_5_params.batch_size, conv_dw_5_params.in_row_dim, conv_dw_5_params.in_col_dim, conv_dw_5_params.in_channels, conv_dw_5_params.out_row_dim, conv_dw_5_params.out_col_dim, - conv_dw_5_params.stride, conv_dw_5_params.padding, conv_dw_5_params.kernel_size, + conv_dw_5_params.stride, conv_dw_5_params.padding, conv_dw_5_params.kernel_size, conv_dw_5_params.kernel_size, (elem_t*)conv_4_out, (elem_t*)conv_dw_5_w, (acc_t*)conv_dw_5_b, (elem_t*)conv_dw_5_out, @@ -287,7 +287,7 @@ int main (int argc, char * argv[]) { conv_dw_8_params.batch_size, conv_dw_8_params.in_row_dim, conv_dw_8_params.in_col_dim, conv_dw_8_params.in_channels, conv_dw_8_params.out_row_dim, conv_dw_8_params.out_col_dim, - conv_dw_8_params.stride, conv_dw_8_params.padding, conv_dw_8_params.kernel_size, + conv_dw_8_params.stride, conv_dw_8_params.padding, conv_dw_8_params.kernel_size, conv_dw_8_params.kernel_size, (elem_t*)conv_7_out, (elem_t*)conv_dw_8_w, (acc_t*)conv_dw_8_b, (elem_t*)conv_dw_8_out, @@ -383,7 +383,7 @@ int main (int argc, char * argv[]) { conv_dw_11_params.batch_size, conv_dw_11_params.in_row_dim, conv_dw_11_params.in_col_dim, conv_dw_11_params.in_channels, conv_dw_11_params.out_row_dim, conv_dw_11_params.out_col_dim, - conv_dw_11_params.stride, conv_dw_11_params.padding, conv_dw_11_params.kernel_size, + conv_dw_11_params.stride, conv_dw_11_params.padding, conv_dw_11_params.kernel_size, conv_dw_11_params.kernel_size, (elem_t*)conv_10_out, (elem_t*)conv_dw_11_w, 
(acc_t*)conv_dw_11_b, (elem_t*)conv_dw_11_out, @@ -463,7 +463,7 @@ int main (int argc, char * argv[]) { conv_dw_14_params.batch_size, conv_dw_14_params.in_row_dim, conv_dw_14_params.in_col_dim, conv_dw_14_params.in_channels, conv_dw_14_params.out_row_dim, conv_dw_14_params.out_col_dim, - conv_dw_14_params.stride, conv_dw_14_params.padding, conv_dw_14_params.kernel_size, + conv_dw_14_params.stride, conv_dw_14_params.padding, conv_dw_14_params.kernel_size, conv_dw_14_params.kernel_size, (elem_t*)conv_13_out, (elem_t*)conv_dw_14_w, (acc_t*)conv_dw_14_b, (elem_t*)conv_dw_14_out, @@ -559,7 +559,7 @@ int main (int argc, char * argv[]) { conv_dw_17_params.batch_size, conv_dw_17_params.in_row_dim, conv_dw_17_params.in_col_dim, conv_dw_17_params.in_channels, conv_dw_17_params.out_row_dim, conv_dw_17_params.out_col_dim, - conv_dw_17_params.stride, conv_dw_17_params.padding, conv_dw_17_params.kernel_size, + conv_dw_17_params.stride, conv_dw_17_params.padding, conv_dw_17_params.kernel_size, conv_dw_17_params.kernel_size, (elem_t*)conv_16_out, (elem_t*)conv_dw_17_w, (acc_t*)conv_dw_17_b, (elem_t*)conv_dw_17_out, @@ -655,7 +655,7 @@ int main (int argc, char * argv[]) { conv_dw_20_params.batch_size, conv_dw_20_params.in_row_dim, conv_dw_20_params.in_col_dim, conv_dw_20_params.in_channels, conv_dw_20_params.out_row_dim, conv_dw_20_params.out_col_dim, - conv_dw_20_params.stride, conv_dw_20_params.padding, conv_dw_20_params.kernel_size, + conv_dw_20_params.stride, conv_dw_20_params.padding, conv_dw_20_params.kernel_size, conv_dw_20_params.kernel_size, (elem_t*)conv_19_out, (elem_t*)conv_dw_20_w, (acc_t*)conv_dw_20_b, (elem_t*)conv_dw_20_out, @@ -734,7 +734,7 @@ int main (int argc, char * argv[]) { conv_dw_23_params.batch_size, conv_dw_23_params.in_row_dim, conv_dw_23_params.in_col_dim, conv_dw_23_params.in_channels, conv_dw_23_params.out_row_dim, conv_dw_23_params.out_col_dim, - conv_dw_23_params.stride, conv_dw_23_params.padding, conv_dw_23_params.kernel_size, + 
conv_dw_23_params.stride, conv_dw_23_params.padding, conv_dw_23_params.kernel_size, conv_dw_23_params.kernel_size, (elem_t*)conv_22_out, (elem_t*)conv_dw_23_w, (acc_t*)conv_dw_23_b, (elem_t*)conv_dw_23_out, @@ -830,7 +830,7 @@ int main (int argc, char * argv[]) { conv_dw_26_params.batch_size, conv_dw_26_params.in_row_dim, conv_dw_26_params.in_col_dim, conv_dw_26_params.in_channels, conv_dw_26_params.out_row_dim, conv_dw_26_params.out_col_dim, - conv_dw_26_params.stride, conv_dw_26_params.padding, conv_dw_26_params.kernel_size, + conv_dw_26_params.stride, conv_dw_26_params.padding, conv_dw_26_params.kernel_size, conv_dw_26_params.kernel_size, (elem_t*)conv_25_out, (elem_t*)conv_dw_26_w, (acc_t*)conv_dw_26_b, (elem_t*)conv_dw_26_out, @@ -926,7 +926,7 @@ int main (int argc, char * argv[]) { conv_dw_29_params.batch_size, conv_dw_29_params.in_row_dim, conv_dw_29_params.in_col_dim, conv_dw_29_params.in_channels, conv_dw_29_params.out_row_dim, conv_dw_29_params.out_col_dim, - conv_dw_29_params.stride, conv_dw_29_params.padding, conv_dw_29_params.kernel_size, + conv_dw_29_params.stride, conv_dw_29_params.padding, conv_dw_29_params.kernel_size, conv_dw_29_params.kernel_size, (elem_t*)conv_28_out, (elem_t*)conv_dw_29_w, (acc_t*)conv_dw_29_b, (elem_t*)conv_dw_29_out, @@ -1022,7 +1022,7 @@ int main (int argc, char * argv[]) { conv_dw_32_params.batch_size, conv_dw_32_params.in_row_dim, conv_dw_32_params.in_col_dim, conv_dw_32_params.in_channels, conv_dw_32_params.out_row_dim, conv_dw_32_params.out_col_dim, - conv_dw_32_params.stride, conv_dw_32_params.padding, conv_dw_32_params.kernel_size, + conv_dw_32_params.stride, conv_dw_32_params.padding, conv_dw_32_params.kernel_size, conv_dw_32_params.kernel_size, (elem_t*)conv_31_out, (elem_t*)conv_dw_32_w, (acc_t*)conv_dw_32_b, (elem_t*)conv_dw_32_out, @@ -1102,7 +1102,7 @@ int main (int argc, char * argv[]) { conv_dw_35_params.batch_size, conv_dw_35_params.in_row_dim, conv_dw_35_params.in_col_dim, conv_dw_35_params.in_channels, 
conv_dw_35_params.out_row_dim, conv_dw_35_params.out_col_dim, - conv_dw_35_params.stride, conv_dw_35_params.padding, conv_dw_35_params.kernel_size, + conv_dw_35_params.stride, conv_dw_35_params.padding, conv_dw_35_params.kernel_size, conv_dw_35_params.kernel_size, (elem_t*)conv_34_out, (elem_t*)conv_dw_35_w, (acc_t*)conv_dw_35_b, (elem_t*)conv_dw_35_out, @@ -1198,7 +1198,7 @@ int main (int argc, char * argv[]) { conv_dw_38_params.batch_size, conv_dw_38_params.in_row_dim, conv_dw_38_params.in_col_dim, conv_dw_38_params.in_channels, conv_dw_38_params.out_row_dim, conv_dw_38_params.out_col_dim, - conv_dw_38_params.stride, conv_dw_38_params.padding, conv_dw_38_params.kernel_size, + conv_dw_38_params.stride, conv_dw_38_params.padding, conv_dw_38_params.kernel_size, conv_dw_38_params.kernel_size, (elem_t*)conv_37_out, (elem_t*)conv_dw_38_w, (acc_t*)conv_dw_38_b, (elem_t*)conv_dw_38_out, @@ -1294,7 +1294,7 @@ int main (int argc, char * argv[]) { conv_dw_41_params.batch_size, conv_dw_41_params.in_row_dim, conv_dw_41_params.in_col_dim, conv_dw_41_params.in_channels, conv_dw_41_params.out_row_dim, conv_dw_41_params.out_col_dim, - conv_dw_41_params.stride, conv_dw_41_params.padding, conv_dw_41_params.kernel_size, + conv_dw_41_params.stride, conv_dw_41_params.padding, conv_dw_41_params.kernel_size, conv_dw_41_params.kernel_size, (elem_t*)conv_40_out, (elem_t*)conv_dw_41_w, (acc_t*)conv_dw_41_b, (elem_t*)conv_dw_41_out, @@ -1374,7 +1374,7 @@ int main (int argc, char * argv[]) { conv_dw_44_params.batch_size, conv_dw_44_params.in_row_dim, conv_dw_44_params.in_col_dim, conv_dw_44_params.in_channels, conv_dw_44_params.out_row_dim, conv_dw_44_params.out_col_dim, - conv_dw_44_params.stride, conv_dw_44_params.padding, conv_dw_44_params.kernel_size, + conv_dw_44_params.stride, conv_dw_44_params.padding, conv_dw_44_params.kernel_size, conv_dw_44_params.kernel_size, (elem_t*)conv_43_out, (elem_t*)conv_dw_44_w, (acc_t*)conv_dw_44_b, (elem_t*)conv_dw_44_out, @@ -1470,7 +1470,7 @@ int main 
(int argc, char * argv[]) { conv_dw_47_params.batch_size, conv_dw_47_params.in_row_dim, conv_dw_47_params.in_col_dim, conv_dw_47_params.in_channels, conv_dw_47_params.out_row_dim, conv_dw_47_params.out_col_dim, - conv_dw_47_params.stride, conv_dw_47_params.padding, conv_dw_47_params.kernel_size, + conv_dw_47_params.stride, conv_dw_47_params.padding, conv_dw_47_params.kernel_size, conv_dw_47_params.kernel_size, (elem_t*)conv_46_out, (elem_t*)conv_dw_47_w, (acc_t*)conv_dw_47_b, (elem_t*)conv_dw_47_out, @@ -1566,7 +1566,7 @@ int main (int argc, char * argv[]) { conv_dw_50_params.batch_size, conv_dw_50_params.in_row_dim, conv_dw_50_params.in_col_dim, conv_dw_50_params.in_channels, conv_dw_50_params.out_row_dim, conv_dw_50_params.out_col_dim, - conv_dw_50_params.stride, conv_dw_50_params.padding, conv_dw_50_params.kernel_size, + conv_dw_50_params.stride, conv_dw_50_params.padding, conv_dw_50_params.kernel_size, conv_dw_50_params.kernel_size, (elem_t*)conv_49_out, (elem_t*)conv_dw_50_w, (acc_t*)conv_dw_50_b, (elem_t*)conv_dw_50_out, diff --git a/imagenet/resnet50.c b/imagenet/resnet50.c index 4bbd7849..22f0ce35 100644 --- a/imagenet/resnet50.c +++ b/imagenet/resnet50.c @@ -109,7 +109,7 @@ int main (int argc, char * argv[]) { conv_1_params.batch_size, conv_1_params.in_row_dim, conv_1_params.in_col_dim, conv_1_params.in_channels, conv_1_params.out_channels, conv_1_params.out_row_dim, conv_1_params.out_col_dim, - conv_1_params.stride, 1, 1, conv_1_params.padding, conv_1_params.kernel_size, + conv_1_params.stride, 1, 1, conv_1_params.padding, conv_1_params.kernel_size, conv_1_params.kernel_size, false, false, false, false, false, (elem_t*)images, (elem_t*)conv_1_w, (acc_t*)conv_1_b, (elem_t*)conv_1_out_pooled, @@ -187,7 +187,7 @@ int main (int argc, char * argv[]) { conv_3_params.batch_size, conv_3_params.in_row_dim, conv_3_params.in_col_dim, conv_3_params.in_channels, conv_3_params.out_channels, conv_3_params.out_row_dim, conv_3_params.out_col_dim, - conv_3_params.stride, 
1, 1, conv_3_params.padding, conv_3_params.kernel_size, + conv_3_params.stride, 1, 1, conv_3_params.padding, conv_3_params.kernel_size, conv_3_params.kernel_size, false, false, false, false, false, (elem_t*)conv_2_out, (elem_t*)conv_3_w, (acc_t*)conv_3_b, (elem_t*)conv_3_out, @@ -332,7 +332,7 @@ int main (int argc, char * argv[]) { conv_7_params.batch_size, conv_7_params.in_row_dim, conv_7_params.in_col_dim, conv_7_params.in_channels, conv_7_params.out_channels, conv_7_params.out_row_dim, conv_7_params.out_col_dim, - conv_7_params.stride, 1, 1, conv_7_params.padding, conv_7_params.kernel_size, + conv_7_params.stride, 1, 1, conv_7_params.padding, conv_7_params.kernel_size, conv_7_params.kernel_size, false, false, false, false, false, (elem_t*)conv_6_out, (elem_t*)conv_7_w, (acc_t*)conv_7_b, (elem_t*)conv_7_out, @@ -441,7 +441,7 @@ int main (int argc, char * argv[]) { conv_10_params.batch_size, conv_10_params.in_row_dim, conv_10_params.in_col_dim, conv_10_params.in_channels, conv_10_params.out_channels, conv_10_params.out_row_dim, conv_10_params.out_col_dim, - conv_10_params.stride, 1, 1, conv_10_params.padding, conv_10_params.kernel_size, + conv_10_params.stride, 1, 1, conv_10_params.padding, conv_10_params.kernel_size, conv_10_params.kernel_size, false, false, false, false, false, (elem_t*)conv_9_out, (elem_t*)conv_10_w, (acc_t*)conv_10_b, (elem_t*)conv_10_out, @@ -550,7 +550,7 @@ int main (int argc, char * argv[]) { conv_13_params.batch_size, conv_13_params.in_row_dim, conv_13_params.in_col_dim, conv_13_params.in_channels, conv_13_params.out_channels, conv_13_params.out_row_dim, conv_13_params.out_col_dim, - conv_13_params.stride, 1, 1, conv_13_params.padding, conv_13_params.kernel_size, + conv_13_params.stride, 1, 1, conv_13_params.padding, conv_13_params.kernel_size, conv_13_params.kernel_size, false, false, false, false, false, (elem_t*)conv_12_out, (elem_t*)conv_13_w, (acc_t*)conv_13_b, (elem_t*)conv_13_out, @@ -704,7 +704,7 @@ int main (int argc, char * 
argv[]) { conv_17_params.batch_size, conv_17_params.in_row_dim, conv_17_params.in_col_dim, conv_17_params.in_channels, conv_17_params.out_channels, conv_17_params.out_row_dim, conv_17_params.out_col_dim, - conv_17_params.stride, 1, 1, conv_17_params.padding, conv_17_params.kernel_size, + conv_17_params.stride, 1, 1, conv_17_params.padding, conv_17_params.kernel_size, conv_17_params.kernel_size, false, false, false, false, false, (elem_t*)conv_16_out, (elem_t*)conv_17_w, (acc_t*)conv_17_b, (elem_t*)conv_17_out, @@ -813,7 +813,7 @@ int main (int argc, char * argv[]) { conv_20_params.batch_size, conv_20_params.in_row_dim, conv_20_params.in_col_dim, conv_20_params.in_channels, conv_20_params.out_channels, conv_20_params.out_row_dim, conv_20_params.out_col_dim, - conv_20_params.stride, 1, 1, conv_20_params.padding, conv_20_params.kernel_size, + conv_20_params.stride, 1, 1, conv_20_params.padding, conv_20_params.kernel_size, conv_20_params.kernel_size, false, false, false, false, false, (elem_t*)conv_19_out, (elem_t*)conv_20_w, (acc_t*)conv_20_b, (elem_t*)conv_20_out, @@ -922,7 +922,7 @@ int main (int argc, char * argv[]) { conv_23_params.batch_size, conv_23_params.in_row_dim, conv_23_params.in_col_dim, conv_23_params.in_channels, conv_23_params.out_channels, conv_23_params.out_row_dim, conv_23_params.out_col_dim, - conv_23_params.stride, 1, 1, conv_23_params.padding, conv_23_params.kernel_size, + conv_23_params.stride, 1, 1, conv_23_params.padding, conv_23_params.kernel_size, conv_23_params.kernel_size, false, false, false, false, false, (elem_t*)conv_22_out, (elem_t*)conv_23_w, (acc_t*)conv_23_b, (elem_t*)conv_23_out, @@ -1031,7 +1031,7 @@ int main (int argc, char * argv[]) { conv_26_params.batch_size, conv_26_params.in_row_dim, conv_26_params.in_col_dim, conv_26_params.in_channels, conv_26_params.out_channels, conv_26_params.out_row_dim, conv_26_params.out_col_dim, - conv_26_params.stride, 1, 1, conv_26_params.padding, conv_26_params.kernel_size, + 
conv_26_params.stride, 1, 1, conv_26_params.padding, conv_26_params.kernel_size, conv_26_params.kernel_size, false, false, false, false, false, (elem_t*)conv_25_out, (elem_t*)conv_26_w, (acc_t*)conv_26_b, (elem_t*)conv_26_out, @@ -1185,7 +1185,7 @@ int main (int argc, char * argv[]) { conv_30_params.batch_size, conv_30_params.in_row_dim, conv_30_params.in_col_dim, conv_30_params.in_channels, conv_30_params.out_channels, conv_30_params.out_row_dim, conv_30_params.out_col_dim, - conv_30_params.stride, 1, 1, conv_30_params.padding, conv_30_params.kernel_size, + conv_30_params.stride, 1, 1, conv_30_params.padding, conv_30_params.kernel_size, conv_30_params.kernel_size, false, false, false, false, false, (elem_t*)conv_29_out, (elem_t*)conv_30_w, (acc_t*)conv_30_b, (elem_t*)conv_30_out, @@ -1294,7 +1294,7 @@ int main (int argc, char * argv[]) { conv_33_params.batch_size, conv_33_params.in_row_dim, conv_33_params.in_col_dim, conv_33_params.in_channels, conv_33_params.out_channels, conv_33_params.out_row_dim, conv_33_params.out_col_dim, - conv_33_params.stride, 1, 1, conv_33_params.padding, conv_33_params.kernel_size, + conv_33_params.stride, 1, 1, conv_33_params.padding, conv_33_params.kernel_size, conv_33_params.kernel_size, false, false, false, false, false, (elem_t*)conv_32_out, (elem_t*)conv_33_w, (acc_t*)conv_33_b, (elem_t*)conv_33_out, @@ -1403,7 +1403,7 @@ int main (int argc, char * argv[]) { conv_36_params.batch_size, conv_36_params.in_row_dim, conv_36_params.in_col_dim, conv_36_params.in_channels, conv_36_params.out_channels, conv_36_params.out_row_dim, conv_36_params.out_col_dim, - conv_36_params.stride, 1, 1, conv_36_params.padding, conv_36_params.kernel_size, + conv_36_params.stride, 1, 1, conv_36_params.padding, conv_36_params.kernel_size, conv_36_params.kernel_size, false, false, false, false, false, (elem_t*)conv_35_out, (elem_t*)conv_36_w, (acc_t*)conv_36_b, (elem_t*)conv_36_out, @@ -1512,7 +1512,7 @@ int main (int argc, char * argv[]) { 
conv_39_params.batch_size, conv_39_params.in_row_dim, conv_39_params.in_col_dim, conv_39_params.in_channels, conv_39_params.out_channels, conv_39_params.out_row_dim, conv_39_params.out_col_dim, - conv_39_params.stride, 1, 1, conv_39_params.padding, conv_39_params.kernel_size, + conv_39_params.stride, 1, 1, conv_39_params.padding, conv_39_params.kernel_size, conv_39_params.kernel_size, false, false, false, false, false, (elem_t*)conv_38_out, (elem_t*)conv_39_w, (acc_t*)conv_39_b, (elem_t*)conv_39_out, @@ -1621,7 +1621,7 @@ int main (int argc, char * argv[]) { conv_42_params.batch_size, conv_42_params.in_row_dim, conv_42_params.in_col_dim, conv_42_params.in_channels, conv_42_params.out_channels, conv_42_params.out_row_dim, conv_42_params.out_col_dim, - conv_42_params.stride, 1, 1, conv_42_params.padding, conv_42_params.kernel_size, + conv_42_params.stride, 1, 1, conv_42_params.padding, conv_42_params.kernel_size, conv_42_params.kernel_size, false, false, false, false, false, (elem_t*)conv_41_out, (elem_t*)conv_42_w, (acc_t*)conv_42_b, (elem_t*)conv_42_out, @@ -1730,7 +1730,7 @@ int main (int argc, char * argv[]) { conv_45_params.batch_size, conv_45_params.in_row_dim, conv_45_params.in_col_dim, conv_45_params.in_channels, conv_45_params.out_channels, conv_45_params.out_row_dim, conv_45_params.out_col_dim, - conv_45_params.stride, 1, 1, conv_45_params.padding, conv_45_params.kernel_size, + conv_45_params.stride, 1, 1, conv_45_params.padding, conv_45_params.kernel_size, conv_45_params.kernel_size, false, false, false, false, false, (elem_t*)conv_44_out, (elem_t*)conv_45_w, (acc_t*)conv_45_b, (elem_t*)conv_45_out, @@ -1799,7 +1799,7 @@ int main (int argc, char * argv[]) { conv_47_params.batch_size, conv_47_params.in_row_dim, conv_47_params.in_col_dim, conv_47_params.in_channels, conv_47_params.out_channels, conv_47_params.out_row_dim, conv_47_params.out_col_dim, - conv_47_params.stride, 1, 1, conv_47_params.padding, conv_47_params.kernel_size, + conv_47_params.stride, 1, 
1, conv_47_params.padding, conv_47_params.kernel_size, conv_47_params.kernel_size, false, false, false, false, false, (elem_t*)conv_43_out, (elem_t*)conv_47_w, (acc_t*)conv_47_b, (elem_t*)conv_47_out, @@ -1883,7 +1883,7 @@ int main (int argc, char * argv[]) { conv_49_params.batch_size, conv_49_params.in_row_dim, conv_49_params.in_col_dim, conv_49_params.in_channels, conv_49_params.out_channels, conv_49_params.out_row_dim, conv_49_params.out_col_dim, - conv_49_params.stride, 1, 1, conv_49_params.padding, conv_49_params.kernel_size, + conv_49_params.stride, 1, 1, conv_49_params.padding, conv_49_params.kernel_size, conv_49_params.kernel_size, false, false, false, false, false, (elem_t*)conv_48_out, (elem_t*)conv_49_w, (acc_t*)conv_49_b, (elem_t*)conv_49_out, @@ -1992,7 +1992,7 @@ int main (int argc, char * argv[]) { conv_52_params.batch_size, conv_52_params.in_row_dim, conv_52_params.in_col_dim, conv_52_params.in_channels, conv_52_params.out_channels, conv_52_params.out_row_dim, conv_52_params.out_col_dim, - conv_52_params.stride, 1, 1, conv_52_params.padding, conv_52_params.kernel_size, + conv_52_params.stride, 1, 1, conv_52_params.padding, conv_52_params.kernel_size, conv_52_params.kernel_size, false, false, false, false, false, (elem_t*)conv_51_out, (elem_t*)conv_52_w, (acc_t*)conv_52_b, (elem_t*)conv_52_out, diff --git a/imagenet/vgg16.c b/imagenet/vgg16.c new file mode 100644 index 00000000..52c1357f --- /dev/null +++ b/imagenet/vgg16.c @@ -0,0 +1,323 @@ +#include +#include +#include +#ifndef BAREMETAL +#include +#endif +#include "include/gemmini.h" +#include "include/gemmini_nn.h" + +#include "vgg16.h" +// #include "resnet50_params_1batch.h" +#include "images.h" + + +int main (int argc, char * argv[]) { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + gemmini_flush(0); + + enum tiled_matmul_type_t tiled_matmul_type = WS; + + if (argc < 2) { + tiled_matmul_type = WS; + } else if 
(strcmp(argv[1], "cpu") == 0) { + tiled_matmul_type = CPU; + } else if (strcmp(argv[1], "os") == 0) { + tiled_matmul_type = OS; + } else if (strcmp(argv[1], "ws") == 0) { + tiled_matmul_type = WS; + } else if (strcmp(argv[1], "-h") == 0) { + printf("usage: %s [-h] matmul_option [check]\n matmul_option may be 'os', 'ws', or cpu'\n", argv[0]); + exit(0); + } else { + printf("Unknown command-line argument\n"); + printf("usage: %s [-h] matmul_option [check]\n matmul_option may be 'os', 'ws', or cpu'\n", argv[0]); + exit(1); + } + + bool conv = true; + + if (argc < 3) { + conv = true; + } else if (strcmp(argv[2], "conv") == 0) { + conv = true; + } else if (strcmp(argv[2], "matmul") == 0) { + conv = false; + } else { + printf("Unknown command-line argument\n"); + printf("usage: %s [-h] matmul_option [check] [conv]\n matmul_option may be 'os', 'ws', or cpu'\n", argv[0]); + exit(1); + } + + bool check = false; + + if (argc < 4) { + check = false; + } else if (strcmp(argv[3], "check") == 0) { + check = true; + } else { + printf("Unknown command-line argument\n"); + printf("usage: %s [-h] matmul_option [check]\n matmul_option may be 'os', 'ws', or cpu'\n", argv[0]); + exit(1); + } + + uint64_t start, end; + uint64_t im2col_cycles = 0, matmul_cycles = 0, conv_cycles = 0, pool_cycles = 0, conv_dw_cycles = 0, res_add_cycles = 0, other_cycles = 0; + + + + // Conv1 + printf("Starting Conv 1...\n"); + tiled_conv_auto( + conv_1_params.batch_size, + conv_1_params.in_dim, conv_1_params.in_dim, + conv_1_params.in_channels, conv_1_params.out_channels, + conv_1_params.out_dim, conv_1_params.out_dim, + conv_1_params.stride, 1, 1, conv_1_params.padding, + conv_1_params.kernel_size, conv_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_1_in, (elem_t*)conv_1_w, (acc_t*)conv_1_b, (elem_t*)conv_1_out, + + RELU, conv_1_params.output_scale, + conv_1_params.pool_size, conv_1_params.pool_stride, conv_1_params.pool_padding, + + tiled_matmul_type); + + + // Conv2 + 
printf("Starting Conv 2...\n"); + tiled_conv_auto( + conv_2_params.batch_size, + conv_2_params.in_dim, conv_2_params.in_dim, + conv_2_params.in_channels, conv_2_params.out_channels, + conv_2_params.out_dim, conv_2_params.out_dim, + conv_2_params.stride, 1, 1, conv_2_params.padding, + conv_2_params.kernel_size, conv_2_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_1_out, (elem_t*)conv_2_w, (acc_t*)conv_2_b, (elem_t*)conv_2_out, + + RELU, conv_2_params.output_scale, + conv_2_params.pool_size, conv_2_params.pool_stride, conv_2_params.pool_padding, + + tiled_matmul_type); + + // Conv3 + printf("Starting Conv 3...\n"); + tiled_conv_auto( + conv_3_params.batch_size, + conv_3_params.in_dim, conv_3_params.in_dim, + conv_3_params.in_channels, conv_3_params.out_channels, + conv_3_params.out_dim, conv_3_params.out_dim, + conv_3_params.stride, 1, 1, conv_3_params.padding, + conv_3_params.kernel_size, conv_3_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_2_out, (elem_t*)conv_3_w, (acc_t*)conv_3_b, (elem_t*)conv_3_out, + + RELU, conv_3_params.output_scale, + conv_3_params.pool_size, conv_3_params.pool_stride, conv_3_params.pool_padding, + + tiled_matmul_type); + + // Conv4 + printf("Starting Conv 4...\n"); + tiled_conv_auto( + conv_4_params.batch_size, + conv_4_params.in_dim, conv_4_params.in_dim, + conv_4_params.in_channels, conv_4_params.out_channels, + conv_4_params.out_dim, conv_4_params.out_dim, + conv_4_params.stride, 1, 1, conv_4_params.padding, + conv_4_params.kernel_size, conv_4_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_3_out, (elem_t*)conv_4_w, (acc_t*)conv_4_b, (elem_t*)conv_4_out, + + RELU, conv_4_params.output_scale, + conv_4_params.pool_size, conv_4_params.pool_stride, conv_4_params.pool_padding, + + tiled_matmul_type); + + // Conv5 + printf("Starting Conv 5...\n"); + tiled_conv_auto( + conv_5_params.batch_size, + conv_5_params.in_dim, conv_5_params.in_dim, + 
conv_5_params.in_channels, conv_5_params.out_channels, + conv_5_params.out_dim, conv_5_params.out_dim, + conv_5_params.stride, 1, 1, conv_5_params.padding, + conv_5_params.kernel_size, conv_5_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_4_out, (elem_t*)conv_5_w, (acc_t*)conv_5_b, (elem_t*)conv_5_out, + + RELU, conv_5_params.output_scale, + conv_5_params.pool_size, conv_5_params.pool_stride, conv_5_params.pool_padding, + + tiled_matmul_type); + + // Conv6 + printf("Starting Conv 6...\n"); + tiled_conv_auto( + conv_6_params.batch_size, + conv_6_params.in_dim, conv_6_params.in_dim, + conv_6_params.in_channels, conv_6_params.out_channels, + conv_6_params.out_dim, conv_6_params.out_dim, + conv_6_params.stride, 1, 1, conv_6_params.padding, + conv_6_params.kernel_size, conv_6_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_5_out, (elem_t*)conv_6_w, (acc_t*)conv_6_b, (elem_t*)conv_6_out, + + RELU, conv_6_params.output_scale, + conv_6_params.pool_size, conv_6_params.pool_stride, conv_6_params.pool_padding, + + tiled_matmul_type); + + + // Conv7 + printf("Starting Conv 7...\n"); + tiled_conv_auto( + conv_7_params.batch_size, + conv_7_params.in_dim, conv_7_params.in_dim, + conv_7_params.in_channels, conv_7_params.out_channels, + conv_7_params.out_dim, conv_7_params.out_dim, + conv_7_params.stride, 1, 1, conv_7_params.padding, + conv_7_params.kernel_size, conv_7_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_6_out, (elem_t*)conv_7_w, (acc_t*)conv_7_b, (elem_t*)conv_7_out, + + RELU, conv_7_params.output_scale, + conv_7_params.pool_size, conv_7_params.pool_stride, conv_7_params.pool_padding, + + tiled_matmul_type); + + // Conv8 + printf("Starting Conv 8...\n"); + tiled_conv_auto( + conv_8_params.batch_size, + conv_8_params.in_dim, conv_8_params.in_dim, + conv_8_params.in_channels, conv_8_params.out_channels, + conv_8_params.out_dim, conv_8_params.out_dim, + conv_8_params.stride, 1, 1, 
conv_8_params.padding, + conv_8_params.kernel_size, conv_8_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_7_out, (elem_t*)conv_8_w, (acc_t*)conv_8_b, (elem_t*)conv_8_out, + + RELU, conv_8_params.output_scale, + conv_8_params.pool_size, conv_8_params.pool_stride, conv_8_params.pool_padding, + + tiled_matmul_type); + + // Conv9 + printf("Starting Conv 9...\n"); + tiled_conv_auto( + conv_9_params.batch_size, + conv_9_params.in_dim, conv_9_params.in_dim, + conv_9_params.in_channels, conv_9_params.out_channels, + conv_9_params.out_dim, conv_9_params.out_dim, + conv_9_params.stride, 1, 1, conv_9_params.padding, + conv_9_params.kernel_size, conv_9_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_8_out, (elem_t*)conv_9_w, (acc_t*)conv_9_b, (elem_t*)conv_9_out, + + RELU, conv_9_params.output_scale, + conv_9_params.pool_size, conv_9_params.pool_stride, conv_9_params.pool_padding, + + tiled_matmul_type); + + // Conv10 + printf("Starting Conv 10...\n"); + tiled_conv_auto( + conv_10_params.batch_size, + conv_10_params.in_dim, conv_10_params.in_dim, + conv_10_params.in_channels, conv_10_params.out_channels, + conv_10_params.out_dim, conv_10_params.out_dim, + conv_10_params.stride, 1, 1, conv_10_params.padding, + conv_10_params.kernel_size, conv_10_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_9_out, (elem_t*)conv_10_w, (acc_t*)conv_10_b, (elem_t*)conv_10_out, + + RELU, conv_10_params.output_scale, + conv_10_params.pool_size, conv_10_params.pool_stride, conv_10_params.pool_padding, + + tiled_matmul_type); + + //Conv11 + printf("Starting Conv 11...\n"); + tiled_conv_auto( + conv_11_params.batch_size, + conv_11_params.in_dim, conv_11_params.in_dim, + conv_11_params.in_channels, conv_11_params.out_channels, + conv_11_params.out_dim, conv_11_params.out_dim, + conv_11_params.stride, 1, 1, conv_11_params.padding, + conv_11_params.kernel_size, conv_11_params.kernel_size, + false, false, false, false, 
false, + + (elem_t*)conv_10_out, (elem_t*)conv_11_w, (acc_t*)conv_11_b, (elem_t*)conv_11_out, + + RELU, conv_11_params.output_scale, + conv_11_params.pool_size, conv_11_params.pool_stride, conv_11_params.pool_padding, + + tiled_matmul_type); + + //Conv12 + printf("Starting Conv 12...\n"); + tiled_conv_auto( + conv_12_params.batch_size, + conv_12_params.in_dim, conv_12_params.in_dim, + conv_12_params.in_channels, conv_12_params.out_channels, + conv_12_params.out_dim, conv_12_params.out_dim, + conv_12_params.stride, 1, 1, conv_12_params.padding, + conv_12_params.kernel_size, conv_12_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_11_out, (elem_t*)conv_12_w, (acc_t*)conv_12_b, (elem_t*)conv_12_out, + + RELU, conv_12_params.output_scale, + conv_12_params.pool_size, conv_12_params.pool_stride, conv_12_params.pool_padding, + + tiled_matmul_type); + + //Conv13 + printf("Starting Conv 13...\n"); + tiled_conv_auto( + conv_13_params.batch_size, + conv_13_params.in_dim, conv_13_params.in_dim, + conv_13_params.in_channels, conv_13_params.out_channels, + conv_13_params.out_dim, conv_13_params.out_dim, + conv_13_params.stride, 1, 1, conv_13_params.padding, + conv_13_params.kernel_size, conv_13_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_12_out, (elem_t*)conv_13_w, (acc_t*)conv_13_b, (elem_t*)conv_13_out, + + RELU, conv_13_params.output_scale, + conv_13_params.pool_size, conv_13_params.pool_stride, conv_13_params.pool_padding, + + tiled_matmul_type); + + + + start = read_cycles(); + printf("Layer with I=%i, K=%i and J=%i\n", fc_14_params.I, fc_14_params.K, fc_14_params.J); + + tiled_matmul_nn_auto(fc_14_params.I, fc_14_params.J, fc_14_params.K, + conv_13_out, fc_14_w, fc_14_b, fc_14_out, + RELU, 1, true, + tiled_matmul_type, check, "fc14"); + + end = read_cycles(); + matmul_cycles += end - start; +} \ No newline at end of file diff --git a/imagenet/vgg16.h b/imagenet/vgg16.h new file mode 100644 index 00000000..18b575a2 
--- /dev/null +++ b/imagenet/vgg16.h @@ -0,0 +1,104 @@ +#ifndef C95AAE97_875E_4B5B_AEA4_1F2773C54BC2 +#define C95AAE97_875E_4B5B_AEA4_1F2773C54BC2 + +#include +#include + +static const elem_t conv_1_w[64][3][3][3] = {0}; +static elem_t conv_1_in[224][224][3] = {0}; +static elem_t conv_1_out[224][224][64] = {0}; +static elem_t conv_1_b[224][224][64] = {0}; +static const struct ConvParamsSimple conv_1_params = {.batch_size=1, .in_dim=224, .kernel_size=3, .in_channels=3, .out_channels=64, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=224, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=224, .output_scale=1.0}; + +static const elem_t conv_2_w[64][3][3][64] = {0}; +static elem_t conv_2_in[224][224][64] = {0}; +static elem_t conv_2_out[112][112][64] = {0}; +static elem_t conv_2_b[112][112][64] = {0}; +static const struct ConvParamsSimple conv_2_params = {.batch_size=1, .in_dim=224, .kernel_size=3, .in_channels=64, .out_channels=64, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=224, .pool_size=2, .pool_stride=2, .pool_padding=0, .out_dim_pooled=112, .output_scale=1.0}; + +static const elem_t conv_3_w[128][3][3][64] = {0}; +static elem_t conv_3_in[112][112][64] = {0}; +static elem_t conv_3_out[112][112][128] = {0}; +static elem_t conv_3_b[112][112][128] = {0}; +static const struct ConvParamsSimple conv_3_params = {.batch_size=1, .in_dim=112, .kernel_size=3, .in_channels=64, .out_channels=128, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=112, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=112, .output_scale=1.0}; + +static const elem_t conv_4_w[128][3][3][128] = {0}; +static elem_t conv_4_in[112][112][128] = {0}; +static elem_t conv_4_out[56][56][128] = {0}; +static elem_t conv_4_b[56][56][128] = {0}; +static const struct ConvParamsSimple conv_4_params = {.batch_size=1, .in_dim=112, .kernel_size=3, .in_channels=128, .out_channels=128, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=112, .pool_size=2, 
.pool_stride=2, .pool_padding=0, .out_dim_pooled=56, .output_scale=1.0}; + +static const elem_t conv_5_w[256][3][3][128] = {0}; +static elem_t conv_5_in[56][56][128] = {0}; +static elem_t conv_5_out[56][56][256] = {0}; +static elem_t conv_5_b[56][56][256] = {0}; +static const struct ConvParamsSimple conv_5_params = {.batch_size=1, .in_dim=56, .kernel_size=3, .in_channels=128, .out_channels=256, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=56, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=56, .output_scale=1.0}; + +static const elem_t conv_6_w[256][3][3][256] = {0}; +static elem_t conv_6_in[56][56][256] = {0}; +static elem_t conv_6_out[56][56][256] = {0}; +static elem_t conv_6_b[56][56][256] = {0}; +static const struct ConvParamsSimple conv_6_params = {.batch_size=1, .in_dim=56, .kernel_size=3, .in_channels=256, .out_channels=256, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=56, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=56, .output_scale=1.0}; + +static const elem_t conv_7_w[256][3][3][256] = {0}; +static elem_t conv_7_in[56][56][256] = {0}; +static elem_t conv_7_out[28][28][256] = {0}; +static elem_t conv_7_b[28][28][256] = {0}; +static const struct ConvParamsSimple conv_7_params = {.batch_size=1, .in_dim=56, .kernel_size=3, .in_channels=256, .out_channels=256, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=56, .pool_size=2, .pool_stride=2, .pool_padding=0, .out_dim_pooled=28, .output_scale=1.0}; + +static const elem_t conv_8_w[512][3][3][256] = {0}; +static elem_t conv_8_in[28][28][256] = {0}; +static elem_t conv_8_out[28][28][512] = {0}; +static elem_t conv_8_b[28][28][512] = {0}; +static const struct ConvParamsSimple conv_8_params = {.batch_size=1, .in_dim=28, .kernel_size=3, .in_channels=256, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=28, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=28, .output_scale=1.0}; + +static const elem_t 
conv_9_w[512][3][3][512] = {0}; +static elem_t conv_9_in[28][28][512] = {0}; +static elem_t conv_9_out[28][28][512] = {0}; +static elem_t conv_9_b[28][28][512] = {0}; +static const struct ConvParamsSimple conv_9_params = {.batch_size=1, .in_dim=28, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=28, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=28, .output_scale=1.0}; + +static const elem_t conv_10_w[512][3][3][512] = {0}; +static elem_t conv_10_in[28][28][512] = {0}; +static elem_t conv_10_out[14][14][512] = {0}; +static elem_t conv_10_b[14][14][512] = {0}; +static const struct ConvParamsSimple conv_10_params = {.batch_size=1, .in_dim=28, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=28, .pool_size=2, .pool_stride=2, .pool_padding=0, .out_dim_pooled=14, .output_scale=1.0}; + +static const elem_t conv_11_w[512][3][3][512] = {0}; +static elem_t conv_11_in[14][14][512] = {0}; +static elem_t conv_11_out[14][14][512] = {0}; +static elem_t conv_11_b[14][14][512] = {0}; +static const struct ConvParamsSimple conv_11_params = {.batch_size=1, .in_dim=14, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=14, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=14, .output_scale=1.0}; + +static const elem_t conv_12_w[512][3][3][512] = {0}; +static elem_t conv_12_in[14][14][512] = {0}; +static elem_t conv_12_out[14][14][512] = {0}; +static elem_t conv_12_b[14][14][512] = {0}; +static const struct ConvParamsSimple conv_12_params = {.batch_size=1, .in_dim=14, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=14, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=14, .output_scale=1.0}; + +static const elem_t conv_13_w[512][3][3][512] = {0}; +static elem_t conv_13_in[14][14][512] = {0}; +static elem_t 
conv_13_out[7][7][512] = {0}; +static elem_t conv_13_b[7][7][512] = {0}; +static const struct ConvParamsSimple conv_13_params = {.batch_size=1, .in_dim=14, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=14, .pool_size=2, .pool_stride=2, .pool_padding=0, .out_dim_pooled=7, .output_scale=1.0}; + +static const acc_t fc_14_b[4096][1] = {0}; +static const elem_t fc_14_w[25088][4096] = {0}; +static elem_t fc_14_out[4096][1] row_align(1); +static const struct FcParams fc_14_params = {.batch_size=1, .in_features=25088, .out_features=4096, .bias=1, .output_scale=(1.0 / (1 << 10)), .I=4, .J=4096, .K=25088}; + + + + + + + + + + + + + + + +#endif /* C95AAE97_875E_4B5B_AEA4_1F2773C54BC2 */ diff --git a/imagenet/vgg_ssd.c b/imagenet/vgg_ssd.c new file mode 100644 index 00000000..6c748f6b --- /dev/null +++ b/imagenet/vgg_ssd.c @@ -0,0 +1,729 @@ + +#include +#include +#include +#ifndef BAREMETAL +#include +#endif +#include "include/gemmini.h" +#include "include/gemmini_nn.h" + +#include "vgg_ssd_params.h" +// #include "resnet50_params_1batch.h" +#include "images.h" + + +int main (int argc, char * argv[]) { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + gemmini_flush(0); + + enum tiled_matmul_type_t tiled_matmul_type = WS; + + if (argc < 2) { + tiled_matmul_type = WS; + } else if (strcmp(argv[1], "cpu") == 0) { + tiled_matmul_type = CPU; + } else if (strcmp(argv[1], "os") == 0) { + tiled_matmul_type = OS; + } else if (strcmp(argv[1], "ws") == 0) { + tiled_matmul_type = WS; + } else if (strcmp(argv[1], "-h") == 0) { + printf("usage: %s [-h] matmul_option [check]\n matmul_option may be 'os', 'ws', or cpu'\n", argv[0]); + exit(0); + } else { + printf("Unknown command-line argument\n"); + printf("usage: %s [-h] matmul_option [check]\n matmul_option may be 'os', 'ws', or cpu'\n", argv[0]); + exit(1); + } + + bool conv = true; + + if (argc < 3) { + 
conv = true; + } else if (strcmp(argv[2], "conv") == 0) { + conv = true; + } else if (strcmp(argv[2], "matmul") == 0) { + conv = false; + } else { + printf("Unknown command-line argument\n"); + printf("usage: %s [-h] matmul_option [check] [conv]\n matmul_option may be 'os', 'ws', or cpu'\n", argv[0]); + exit(1); + } + + bool check = false; + + if (argc < 4) { + check = false; + } else if (strcmp(argv[3], "check") == 0) { + check = true; + } else { + printf("Unknown command-line argument\n"); + printf("usage: %s [-h] matmul_option [check]\n matmul_option may be 'os', 'ws', or cpu'\n", argv[0]); + exit(1); + } + + uint64_t start, end; + uint64_t im2col_cycles = 0, matmul_cycles = 0, conv_cycles = 0, pool_cycles = 0, conv_dw_cycles = 0, res_add_cycles = 0, other_cycles = 0; + + + + // Conv1 + printf("Starting Conv 1...\n"); + tiled_conv_auto( + conv_1_params.batch_size, + conv_1_params.in_dim, conv_1_params.in_dim, + conv_1_params.in_channels, conv_1_params.out_channels, + conv_1_params.out_dim, conv_1_params.out_dim, + conv_1_params.stride, + 1, 1, conv_1_params.padding, + conv_1_params.kernel_size, conv_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_1_in, (elem_t*)conv_1_w, (acc_t*)conv_1_b, (elem_t*)conv_1_out, + + RELU, conv_1_params.output_scale, + conv_1_params.pool_size, conv_1_params.pool_stride, conv_1_params.pool_padding, + + tiled_matmul_type); + + + // Conv2 + printf("Starting Conv 2...\n"); + tiled_conv_auto( + conv_2_params.batch_size, + conv_2_params.in_dim, conv_2_params.in_dim, + conv_2_params.in_channels, conv_2_params.out_channels, + conv_2_params.out_dim, conv_2_params.out_dim, + conv_2_params.stride, 1, 1, conv_2_params.padding, + conv_2_params.kernel_size, conv_2_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_1_out, (elem_t*)conv_2_w, (acc_t*)conv_2_b, (elem_t*)conv_2_out, + + RELU, conv_2_params.output_scale, + conv_2_params.pool_size, conv_2_params.pool_stride, 
conv_2_params.pool_padding, + + tiled_matmul_type); + + // Conv3 + printf("Starting Conv 3...\n"); + tiled_conv_auto( + conv_3_params.batch_size, + conv_3_params.in_dim, conv_3_params.in_dim, + conv_3_params.in_channels, conv_3_params.out_channels, + conv_3_params.out_dim, conv_3_params.out_dim, + conv_3_params.stride, 1, 1, conv_3_params.padding, + conv_3_params.kernel_size, conv_3_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_2_out, (elem_t*)conv_3_w, (acc_t*)conv_3_b, (elem_t*)conv_3_out, + + RELU, conv_3_params.output_scale, + conv_3_params.pool_size, conv_3_params.pool_stride, conv_3_params.pool_padding, + + tiled_matmul_type); + + // Conv4 + printf("Starting Conv 4...\n"); + tiled_conv_auto( + conv_4_params.batch_size, + conv_4_params.in_dim, conv_4_params.in_dim, + conv_4_params.in_channels, conv_4_params.out_channels, + conv_4_params.out_dim, conv_4_params.out_dim, + conv_4_params.stride, 1, 1, conv_4_params.padding, + conv_4_params.kernel_size, conv_4_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_3_out, (elem_t*)conv_4_w, (acc_t*)conv_4_b, (elem_t*)conv_4_out, + + RELU, conv_4_params.output_scale, + conv_4_params.pool_size, conv_4_params.pool_stride, conv_4_params.pool_padding, + + tiled_matmul_type); + + // Conv5 + printf("Starting Conv 5...\n"); + tiled_conv_auto( + conv_5_params.batch_size, + conv_5_params.in_dim, conv_5_params.in_dim, + conv_5_params.in_channels, conv_5_params.out_channels, + conv_5_params.out_dim, conv_5_params.out_dim, + conv_5_params.stride, 1, 1, conv_5_params.padding, + conv_5_params.kernel_size, conv_5_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_4_out, (elem_t*)conv_5_w, (acc_t*)conv_5_b, (elem_t*)conv_5_out, + + RELU, conv_5_params.output_scale, + conv_5_params.pool_size, conv_5_params.pool_stride, conv_5_params.pool_padding, + + tiled_matmul_type); + + // Conv6 + printf("Starting Conv 6...\n"); + tiled_conv_auto( + conv_6_params.batch_size, 
+ conv_6_params.in_dim, conv_6_params.in_dim, + conv_6_params.in_channels, conv_6_params.out_channels, + conv_6_params.out_dim, conv_6_params.out_dim, + conv_6_params.stride, 1, 1, conv_6_params.padding, + conv_6_params.kernel_size, conv_6_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_5_out, (elem_t*)conv_6_w, (acc_t*)conv_6_b, (elem_t*)conv_6_out, + + RELU, conv_6_params.output_scale, + conv_6_params.pool_size, conv_6_params.pool_stride, conv_6_params.pool_padding, + + tiled_matmul_type); + + + // Conv7 + printf("Starting Conv 7...\n"); + tiled_conv_auto( + conv_7_params.batch_size, + conv_7_params.in_dim, conv_7_params.in_dim, + conv_7_params.in_channels, conv_7_params.out_channels, + conv_7_params.out_dim, conv_7_params.out_dim, + conv_7_params.stride, 1, 1, conv_7_params.padding, + conv_7_params.kernel_size, conv_7_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_6_out, (elem_t*)conv_7_w, (acc_t*)conv_7_b, (elem_t*)conv_7_out, + + RELU, conv_7_params.output_scale, + conv_7_params.pool_size, conv_7_params.pool_stride, conv_7_params.pool_padding, + + tiled_matmul_type); + + // Conv8 + printf("Starting Conv 8...\n"); + tiled_conv_auto( + conv_8_params.batch_size, + conv_8_params.in_dim, conv_8_params.in_dim, + conv_8_params.in_channels, conv_8_params.out_channels, + conv_8_params.out_dim, conv_8_params.out_dim, + conv_8_params.stride, 1, 1, conv_8_params.padding, + conv_8_params.kernel_size, conv_8_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_7_out, (elem_t*)conv_8_w, (acc_t*)conv_8_b, (elem_t*)conv_8_out, + + RELU, conv_8_params.output_scale, + conv_8_params.pool_size, conv_8_params.pool_stride, conv_8_params.pool_padding, + + tiled_matmul_type); + + // Conv9 + printf("Starting Conv 9...\n"); + tiled_conv_auto( + conv_9_params.batch_size, + conv_9_params.in_dim, conv_9_params.in_dim, + conv_9_params.in_channels, conv_9_params.out_channels, + conv_9_params.out_dim,
conv_9_params.out_dim, + conv_9_params.stride, 1, 1, conv_9_params.padding, + conv_9_params.kernel_size, conv_9_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_8_out, (elem_t*)conv_9_w, (acc_t*)conv_9_b, (elem_t*)conv_9_out, + + RELU, conv_9_params.output_scale, + conv_9_params.pool_size, conv_9_params.pool_stride, conv_9_params.pool_padding, + + tiled_matmul_type); + + // Conv10 + printf("Starting Conv 10...\n"); + tiled_conv_auto( + conv_10_params.batch_size, + conv_10_params.in_dim, conv_10_params.in_dim, + conv_10_params.in_channels, conv_10_params.out_channels, + conv_10_params.out_dim, conv_10_params.out_dim, + conv_10_params.stride, 1, 1, conv_10_params.padding, + conv_10_params.kernel_size, conv_10_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_9_out, (elem_t*)conv_10_w, (acc_t*)conv_10_b, (elem_t*)conv_10_out, + + RELU, conv_10_params.output_scale, + conv_10_params.pool_size, conv_10_params.pool_stride, conv_10_params.pool_padding, + + tiled_matmul_type); + + //Conv11 + printf("Starting Conv 11...\n"); + tiled_conv_auto( + conv_11_params.batch_size, + conv_11_params.in_dim, conv_11_params.in_dim, + conv_11_params.in_channels, conv_11_params.out_channels, + conv_11_params.out_dim, conv_11_params.out_dim, + conv_11_params.stride, 1, 1, conv_11_params.padding, + conv_11_params.kernel_size, conv_11_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_10_out, (elem_t*)conv_11_w, (acc_t*)conv_11_b, (elem_t*)conv_11_out, + + RELU, conv_11_params.output_scale, + conv_11_params.pool_size, conv_11_params.pool_stride, conv_11_params.pool_padding, + + tiled_matmul_type); + + //Conv12 + printf("Starting Conv 12...\n"); + tiled_conv_auto( + conv_12_params.batch_size, + conv_12_params.in_dim, conv_12_params.in_dim, + conv_12_params.in_channels, conv_12_params.out_channels, + conv_12_params.out_dim, conv_12_params.out_dim, + conv_12_params.stride, 1, 1, conv_12_params.padding, + 
conv_12_params.kernel_size, conv_12_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_11_out, (elem_t*)conv_12_w, (acc_t*)conv_12_b, (elem_t*)conv_12_out, + + RELU, conv_12_params.output_scale, + conv_12_params.pool_size, conv_12_params.pool_stride, conv_12_params.pool_padding, + + tiled_matmul_type); + + //Conv13 + printf("Starting Conv 13...\n"); + tiled_conv_auto( + conv_13_params.batch_size, + conv_13_params.in_dim, conv_13_params.in_dim, + conv_13_params.in_channels, conv_13_params.out_channels, + conv_13_params.out_dim, conv_13_params.out_dim, + conv_13_params.stride, 1, 1, conv_13_params.padding, + conv_13_params.kernel_size, conv_13_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_12_out, (elem_t*)conv_13_w, (acc_t*)conv_13_b, (elem_t*)conv_13_out, + + RELU, conv_13_params.output_scale, + conv_13_params.pool_size, conv_13_params.pool_stride, conv_13_params.pool_padding, + + tiled_matmul_type); + + + + //Auxilliary Convs for SSD + printf("Starting Conv Aux 6_1...\n"); + tiled_conv_auto( + conv_aux_6_1_params.batch_size, + conv_aux_6_1_params.in_dim, conv_aux_6_1_params.in_dim, + conv_aux_6_1_params.in_channels, conv_aux_6_1_params.out_channels, + conv_aux_6_1_params.out_dim, conv_aux_6_1_params.out_dim, + conv_aux_6_1_params.stride, 1, 1, conv_aux_6_1_params.padding, + conv_aux_6_1_params.kernel_size, conv_aux_6_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_13_out, (elem_t*)conv_aux_6_1_w, (acc_t*)conv_aux_6_1_b, (elem_t*)conv_aux_6_1_out, + + RELU, conv_aux_6_1_params.output_scale, + conv_aux_6_1_params.pool_size, conv_aux_6_1_params.pool_stride, conv_aux_6_1_params.pool_padding, + + tiled_matmul_type); + + //printf("Starting Conv Aux 6_2...\n"); + //tiled_conv_auto( + // conv_aux_6_2_params.batch_size, conv_aux_6_2_params.in_dim, conv_aux_6_2_params.in_channels, + // conv_aux_6_2_params.out_channels, conv_aux_6_2_params.out_dim, + // conv_aux_6_2_params.stride, 1, 1,
conv_aux_6_2_params.padding, conv_aux_6_2_params.kernel_size, + // false, false, false, false, false, + + // (elem_t*)conv_aux_6_1_out, (elem_t*)conv_aux_6_2_w, (acc_t*)conv_aux_6_2_b, (elem_t*)conv_aux_6_2_out, + + // RELU, conv_aux_6_2_params.output_scale, + // conv_aux_6_2_params.pool_size, conv_aux_6_2_params.pool_stride, conv_aux_6_2_params.pool_padding, + + // tiled_matmul_type); + + printf("Starting Conv Aux 7_1...\n"); + tiled_conv_auto( + conv_aux_7_1_params.batch_size, + conv_aux_7_1_params.in_dim, conv_aux_7_1_params.in_dim, + conv_aux_7_1_params.in_channels, conv_aux_7_1_params.out_channels, + conv_aux_7_1_params.out_dim, conv_aux_7_1_params.out_dim, + conv_aux_7_1_params.stride, 1, 1, conv_aux_7_1_params.padding, + conv_aux_7_1_params.kernel_size, conv_aux_7_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_6_1_out, (elem_t*)conv_aux_7_1_w, (acc_t*)conv_aux_7_1_b, (elem_t*)conv_aux_7_1_out, + + RELU, conv_aux_7_1_params.output_scale, + conv_aux_7_1_params.pool_size, conv_aux_7_1_params.pool_stride, conv_aux_7_1_params.pool_padding, + + tiled_matmul_type); + + //printf("Starting Conv Aux 7_2...\n"); + //tiled_conv_auto( + // conv_aux_7_2_params.batch_size, conv_aux_7_2_params.in_dim, conv_aux_7_2_params.in_channels, + // conv_aux_7_2_params.out_channels, conv_aux_7_2_params.out_dim, + // conv_aux_7_2_params.stride, 1, 1, conv_aux_7_2_params.padding, conv_aux_7_2_params.kernel_size, + // false, false, false, false, false, + + // (elem_t*)conv_aux_7_1_out, (elem_t*)conv_aux_7_2_w, (acc_t*)conv_aux_7_2_b, (elem_t*)conv_aux_7_2_out, + + // RELU, conv_aux_7_2_params.output_scale, + // conv_aux_7_2_params.pool_size, conv_aux_7_2_params.pool_stride, conv_aux_7_2_params.pool_padding, + + // tiled_matmul_type); + + + printf("Starting Conv Aux 8_1...\n"); + tiled_conv_auto( + conv_aux_8_1_params.batch_size, + conv_aux_8_1_params.in_dim, conv_aux_8_1_params.in_dim, + conv_aux_8_1_params.in_channels, conv_aux_8_1_params.out_channels, 
+ conv_aux_8_1_params.out_dim, conv_aux_8_1_params.out_dim, + conv_aux_8_1_params.stride, 1, 1, conv_aux_8_1_params.padding, + conv_aux_8_1_params.kernel_size, conv_aux_8_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_7_1_out, (elem_t*)conv_aux_8_1_w, (acc_t*)conv_aux_8_1_b, (elem_t*)conv_aux_8_1_out, + + RELU, conv_aux_8_1_params.output_scale, + conv_aux_8_1_params.pool_size, conv_aux_8_1_params.pool_stride, conv_aux_8_1_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Aux 8_2...\n"); + tiled_conv_auto( + conv_aux_8_2_params.batch_size, + conv_aux_8_2_params.in_dim, conv_aux_8_2_params.in_dim, + conv_aux_8_2_params.in_channels, conv_aux_8_2_params.out_channels, + conv_aux_8_2_params.out_dim, conv_aux_8_2_params.out_dim, + conv_aux_8_2_params.stride, 1, 1, conv_aux_8_2_params.padding, + conv_aux_8_2_params.kernel_size, conv_aux_8_2_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_8_1_out, (elem_t*)conv_aux_8_2_w, (acc_t*)conv_aux_8_2_b, (elem_t*)conv_aux_8_2_out, + + RELU, conv_aux_8_2_params.output_scale, + conv_aux_8_2_params.pool_size, conv_aux_8_2_params.pool_stride, conv_aux_8_2_params.pool_padding, + + tiled_matmul_type); + + + printf("Starting Conv Aux 9_1...\n"); + tiled_conv_auto( + conv_aux_9_1_params.batch_size, + conv_aux_9_1_params.in_dim, conv_aux_9_1_params.in_dim, + conv_aux_9_1_params.in_channels, conv_aux_9_1_params.out_channels, + conv_aux_9_1_params.out_dim, conv_aux_9_1_params.out_dim, + conv_aux_9_1_params.stride, 1, 1, conv_aux_9_1_params.padding, + conv_aux_9_1_params.kernel_size, conv_aux_9_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_8_2_out, (elem_t*)conv_aux_9_1_w, (acc_t*)conv_aux_9_1_b, (elem_t*)conv_aux_9_1_out, + + RELU, conv_aux_9_1_params.output_scale, + conv_aux_9_1_params.pool_size, conv_aux_9_1_params.pool_stride, conv_aux_9_1_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Aux 
9_2...\n"); + tiled_conv_auto( + conv_aux_9_2_params.batch_size, + conv_aux_9_2_params.in_dim, conv_aux_9_2_params.in_dim, + conv_aux_9_2_params.in_channels, conv_aux_9_2_params.out_channels, + conv_aux_9_2_params.out_dim, conv_aux_9_2_params.out_dim, + conv_aux_9_2_params.stride, 1, 1, conv_aux_9_2_params.padding, + conv_aux_9_2_params.kernel_size, conv_aux_9_2_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_9_1_out, (elem_t*)conv_aux_9_2_w, (acc_t*)conv_aux_9_2_b, (elem_t*)conv_aux_9_2_out, + + RELU, conv_aux_9_2_params.output_scale, + conv_aux_9_2_params.pool_size, conv_aux_9_2_params.pool_stride, conv_aux_9_2_params.pool_padding, + + tiled_matmul_type); + + + printf("Starting Conv Aux 10_1...\n"); + tiled_conv_auto( + conv_aux_10_1_params.batch_size, + conv_aux_10_1_params.in_dim, conv_aux_10_1_params.in_dim, + conv_aux_10_1_params.in_channels, conv_aux_10_1_params.out_channels, + conv_aux_10_1_params.out_dim, conv_aux_10_1_params.out_dim, + conv_aux_10_1_params.stride, 1, 1, conv_aux_10_1_params.padding, + conv_aux_10_1_params.kernel_size, conv_aux_10_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_9_2_out, (elem_t*)conv_aux_10_1_w, (acc_t*)conv_aux_10_1_b, (elem_t*)conv_aux_10_1_out, + + RELU, conv_aux_10_1_params.output_scale, + conv_aux_10_1_params.pool_size, conv_aux_10_1_params.pool_stride, conv_aux_10_1_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Aux 10_2...\n"); + tiled_conv_auto( + conv_aux_10_2_params.batch_size, + conv_aux_10_2_params.in_dim, conv_aux_10_2_params.in_dim, + conv_aux_10_2_params.in_channels, conv_aux_10_2_params.out_channels, + conv_aux_10_2_params.out_dim, conv_aux_10_2_params.out_dim, + conv_aux_10_2_params.stride, 1, 1, conv_aux_10_2_params.padding, + conv_aux_10_2_params.kernel_size, conv_aux_10_2_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_10_1_out, (elem_t*)conv_aux_10_2_w, (acc_t*)conv_aux_10_2_b, 
(elem_t*)conv_aux_10_2_out, + + RELU, conv_aux_10_2_params.output_scale, + conv_aux_10_2_params.pool_size, conv_aux_10_2_params.pool_stride, conv_aux_10_2_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Aux 11_1...\n"); + tiled_conv_auto( + conv_aux_11_1_params.batch_size, + conv_aux_11_1_params.in_dim, conv_aux_11_1_params.in_dim, + conv_aux_11_1_params.in_channels, conv_aux_11_1_params.out_channels, + conv_aux_11_1_params.out_dim, conv_aux_11_1_params.out_dim, + conv_aux_11_1_params.stride, 1, 1, conv_aux_11_1_params.padding, + conv_aux_11_1_params.kernel_size, conv_aux_11_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_10_2_out, (elem_t*)conv_aux_11_1_w, (acc_t*)conv_aux_11_1_b, (elem_t*)conv_aux_11_1_out, + + RELU, conv_aux_11_1_params.output_scale, + conv_aux_11_1_params.pool_size, conv_aux_11_1_params.pool_stride, conv_aux_11_1_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Aux 11_2...\n"); + tiled_conv_auto( + conv_aux_11_2_params.batch_size, + conv_aux_11_2_params.in_dim, conv_aux_11_2_params.in_dim, + conv_aux_11_2_params.in_channels, conv_aux_11_2_params.out_channels, + conv_aux_11_2_params.out_dim, conv_aux_11_2_params.out_dim, + conv_aux_11_2_params.stride, 1, 1, conv_aux_11_2_params.padding, + conv_aux_11_2_params.kernel_size, conv_aux_11_2_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_11_1_out, (elem_t*)conv_aux_11_2_w, (acc_t*)conv_aux_11_2_b, (elem_t*)conv_aux_11_2_out, + + RELU, conv_aux_11_2_params.output_scale, + conv_aux_11_2_params.pool_size, conv_aux_11_2_params.pool_stride, conv_aux_11_2_params.pool_padding, + + tiled_matmul_type); + + /* + * Localization Prediction + */ + printf("Starting Conv Loc Pred 1...\n"); + tiled_conv_auto( + conv_loc_pred_1_params.batch_size, + conv_loc_pred_1_params.in_dim, conv_loc_pred_1_params.in_dim, + conv_loc_pred_1_params.in_channels, conv_loc_pred_1_params.out_channels, + 
conv_loc_pred_1_params.out_dim, conv_loc_pred_1_params.out_dim, + conv_loc_pred_1_params.stride, 1, 1, conv_loc_pred_1_params.padding, + conv_loc_pred_1_params.kernel_size, conv_loc_pred_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_10_out, (elem_t*)conv_loc_pred_1_w, (acc_t*)conv_loc_pred_1_b, (elem_t*)conv_loc_pred_1_out, + + RELU, conv_loc_pred_1_params.output_scale, + conv_loc_pred_1_params.pool_size, conv_loc_pred_1_params.pool_stride, conv_loc_pred_1_params.pool_padding, + + tiled_matmul_type); + + + printf("Starting Conv Loc Pred 2...\n"); + tiled_conv_auto( + conv_loc_pred_2_params.batch_size, + conv_loc_pred_2_params.in_dim, conv_loc_pred_2_params.in_dim, + conv_loc_pred_2_params.in_channels, conv_loc_pred_2_params.out_channels, + conv_loc_pred_2_params.out_dim, conv_loc_pred_2_params.out_dim, + conv_loc_pred_2_params.stride, 1, 1, conv_loc_pred_2_params.padding, + conv_loc_pred_2_params.kernel_size, conv_loc_pred_2_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_7_1_out, (elem_t*)conv_loc_pred_2_w, (acc_t*)conv_loc_pred_2_b, (elem_t*)conv_loc_pred_2_out, + + RELU, conv_loc_pred_2_params.output_scale, + conv_loc_pred_2_params.pool_size, conv_loc_pred_2_params.pool_stride, conv_loc_pred_2_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Loc Pred 3...\n"); + tiled_conv_auto( + conv_loc_pred_3_params.batch_size, + conv_loc_pred_3_params.in_dim, conv_loc_pred_3_params.in_dim, + conv_loc_pred_3_params.in_channels, conv_loc_pred_3_params.out_channels, + conv_loc_pred_3_params.out_dim, conv_loc_pred_3_params.out_dim, + conv_loc_pred_3_params.stride, 1, 1, conv_loc_pred_3_params.padding, + conv_loc_pred_3_params.kernel_size, conv_loc_pred_3_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_8_2_out, (elem_t*)conv_loc_pred_3_w, (acc_t*)conv_loc_pred_3_b, (elem_t*)conv_loc_pred_3_out, + + RELU, conv_loc_pred_3_params.output_scale, + 
conv_loc_pred_3_params.pool_size, conv_loc_pred_3_params.pool_stride, conv_loc_pred_3_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Loc Pred 4...\n"); + tiled_conv_auto( + conv_loc_pred_4_params.batch_size, + conv_loc_pred_4_params.in_dim, conv_loc_pred_4_params.in_dim, + conv_loc_pred_4_params.in_channels, conv_loc_pred_4_params.out_channels, + conv_loc_pred_4_params.out_dim, conv_loc_pred_4_params.out_dim, + conv_loc_pred_4_params.stride, 1, 1, conv_loc_pred_4_params.padding, + conv_loc_pred_4_params.kernel_size, conv_loc_pred_4_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_9_2_out, (elem_t*)conv_loc_pred_4_w, (acc_t*)conv_loc_pred_4_b, (elem_t*)conv_loc_pred_4_out, + + RELU, conv_loc_pred_4_params.output_scale, + conv_loc_pred_4_params.pool_size, conv_loc_pred_4_params.pool_stride, conv_loc_pred_4_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Loc Pred 5...\n"); + tiled_conv_auto( + conv_loc_pred_5_params.batch_size, + conv_loc_pred_5_params.in_dim, conv_loc_pred_5_params.in_dim, + conv_loc_pred_5_params.in_channels, conv_loc_pred_5_params.out_channels, + conv_loc_pred_5_params.out_dim, conv_loc_pred_5_params.out_dim, + conv_loc_pred_5_params.stride, 1, 1, conv_loc_pred_5_params.padding, + conv_loc_pred_5_params.kernel_size, conv_loc_pred_5_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_10_2_out, (elem_t*)conv_loc_pred_5_w, (acc_t*)conv_loc_pred_5_b, (elem_t*)conv_loc_pred_5_out, + + RELU, conv_loc_pred_5_params.output_scale, + conv_loc_pred_5_params.pool_size, conv_loc_pred_5_params.pool_stride, conv_loc_pred_5_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Loc Pred 6...\n"); + tiled_conv_auto( + conv_loc_pred_6_params.batch_size, + conv_loc_pred_6_params.in_dim, conv_loc_pred_6_params.in_dim, + conv_loc_pred_6_params.in_channels, conv_loc_pred_6_params.out_channels, + conv_loc_pred_6_params.out_dim, 
conv_loc_pred_6_params.out_dim, + conv_loc_pred_6_params.stride, 1, 1, conv_loc_pred_6_params.padding, + conv_loc_pred_6_params.kernel_size, conv_loc_pred_6_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_11_2_out, (elem_t*)conv_loc_pred_6_w, (acc_t*)conv_loc_pred_6_b, (elem_t*)conv_loc_pred_6_out, + + RELU, conv_loc_pred_6_params.output_scale, + conv_loc_pred_6_params.pool_size, conv_loc_pred_6_params.pool_stride, conv_loc_pred_6_params.pool_padding, + + tiled_matmul_type); + + /* + * Class Prediction + */ + printf("Starting Conv Class Pred 1...\n"); + tiled_conv_auto( + conv_class_pred_1_params.batch_size, + conv_class_pred_1_params.in_dim, conv_class_pred_1_params.in_dim, + conv_class_pred_1_params.in_channels, conv_class_pred_1_params.out_channels, + conv_class_pred_1_params.out_dim, conv_class_pred_1_params.out_dim, + conv_class_pred_1_params.stride, 1, 1, conv_class_pred_1_params.padding, + conv_class_pred_1_params.kernel_size, conv_class_pred_1_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_10_out, (elem_t*)conv_class_pred_1_w, (acc_t*)conv_class_pred_1_b, (elem_t*)conv_class_pred_1_out, + + RELU, conv_class_pred_1_params.output_scale, + conv_class_pred_1_params.pool_size, conv_class_pred_1_params.pool_stride, conv_class_pred_1_params.pool_padding, + + tiled_matmul_type); + + + printf("Starting Conv Class Pred 2...\n"); + tiled_conv_auto( + conv_class_pred_2_params.batch_size, + conv_class_pred_2_params.in_dim, conv_class_pred_2_params.in_dim, + conv_class_pred_2_params.in_channels, conv_class_pred_2_params.out_channels, + conv_class_pred_2_params.out_dim, conv_class_pred_2_params.out_dim, + conv_class_pred_2_params.stride, 1, 1, conv_class_pred_2_params.padding, + conv_class_pred_2_params.kernel_size, conv_class_pred_2_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_7_1_out, (elem_t*)conv_class_pred_2_w, (acc_t*)conv_class_pred_2_b, (elem_t*)conv_class_pred_2_out, + + 
RELU, conv_class_pred_2_params.output_scale, + conv_class_pred_2_params.pool_size, conv_class_pred_2_params.pool_stride, conv_class_pred_2_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Class Pred 3...\n"); + tiled_conv_auto( + conv_class_pred_3_params.batch_size, + conv_class_pred_3_params.in_dim, conv_class_pred_3_params.in_dim, + conv_class_pred_3_params.in_channels, conv_class_pred_3_params.out_channels, + conv_class_pred_3_params.out_dim, conv_class_pred_3_params.out_dim, + conv_class_pred_3_params.stride, 1, 1, conv_class_pred_3_params.padding, + conv_class_pred_3_params.kernel_size, conv_class_pred_3_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_8_2_out, (elem_t*)conv_class_pred_3_w, (acc_t*)conv_class_pred_3_b, (elem_t*)conv_class_pred_3_out, + + RELU, conv_class_pred_3_params.output_scale, + conv_class_pred_3_params.pool_size, conv_class_pred_3_params.pool_stride, conv_class_pred_3_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Class Pred 4...\n"); + tiled_conv_auto( + conv_class_pred_4_params.batch_size, + conv_class_pred_4_params.in_dim, conv_class_pred_4_params.in_dim, + conv_class_pred_4_params.in_channels, conv_class_pred_4_params.out_channels, + conv_class_pred_4_params.out_dim, conv_class_pred_4_params.out_dim, + conv_class_pred_4_params.stride, 1, 1, conv_class_pred_4_params.padding, + conv_class_pred_4_params.kernel_size, conv_class_pred_4_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_9_2_out, (elem_t*)conv_class_pred_4_w, (acc_t*)conv_class_pred_4_b, (elem_t*)conv_class_pred_4_out, + + RELU, conv_class_pred_4_params.output_scale, + conv_class_pred_4_params.pool_size, conv_class_pred_4_params.pool_stride, conv_class_pred_4_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Class Pred 5...\n"); + tiled_conv_auto( + conv_class_pred_5_params.batch_size, + conv_class_pred_5_params.in_dim, 
conv_class_pred_5_params.in_dim, + conv_class_pred_5_params.in_channels, conv_class_pred_5_params.out_channels, + conv_class_pred_5_params.out_dim, conv_class_pred_5_params.out_dim, + conv_class_pred_5_params.stride, 1, 1, conv_class_pred_5_params.padding, + conv_class_pred_5_params.kernel_size, conv_class_pred_5_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_10_2_out, (elem_t*)conv_class_pred_5_w, (acc_t*)conv_class_pred_5_b, (elem_t*)conv_class_pred_5_out, + + RELU, conv_class_pred_5_params.output_scale, + conv_class_pred_5_params.pool_size, conv_class_pred_5_params.pool_stride, conv_class_pred_5_params.pool_padding, + + tiled_matmul_type); + + printf("Starting Conv Class Pred 6...\n"); + tiled_conv_auto( + conv_class_pred_6_params.batch_size, + conv_class_pred_6_params.in_dim, conv_class_pred_6_params.in_dim, + conv_class_pred_6_params.in_channels, conv_class_pred_6_params.out_channels, + conv_class_pred_6_params.out_dim, conv_class_pred_6_params.out_dim, + conv_class_pred_6_params.stride, 1, 1, conv_class_pred_6_params.padding, + conv_class_pred_6_params.kernel_size, conv_class_pred_6_params.kernel_size, + false, false, false, false, false, + + (elem_t*)conv_aux_11_2_out, (elem_t*)conv_class_pred_6_w, (acc_t*)conv_class_pred_6_b, (elem_t*)conv_class_pred_6_out, + + RELU, conv_class_pred_6_params.output_scale, + conv_class_pred_6_params.pool_size, conv_class_pred_6_params.pool_stride, conv_class_pred_6_params.pool_padding, + + tiled_matmul_type); + +} \ No newline at end of file diff --git a/imagenet/vgg_ssd_params.h b/imagenet/vgg_ssd_params.h new file mode 100644 index 00000000..7a5f30ac --- /dev/null +++ b/imagenet/vgg_ssd_params.h @@ -0,0 +1,199 @@ +#ifndef F2F15EE8_05C8_43F3_96E4_FAF8C462ABFB +#define F2F15EE8_05C8_43F3_96E4_FAF8C462ABFB + +#include +#include + +static const elem_t conv_1_w[64][3][3][3] = {0}; +static elem_t conv_1_in[300][300][3] = {0}; +static elem_t conv_1_out[300][300][64] = {0}; +static elem_t 
conv_1_b[300][300][64] = {0};//1_1 +static const struct ConvParamsSimple conv_1_params = {.batch_size=1, .in_dim=300, .kernel_size=3, .in_channels=3, .out_channels=64, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=300, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=224, .output_scale=1.0}; + +static const elem_t conv_2_w[64][3][3][64] = {0}; +static elem_t conv_2_out[150][150][64] = {0}; +static elem_t conv_2_b[150][150][64] = {0};//1_2 +static const struct ConvParamsSimple conv_2_params = {.batch_size=1, .in_dim=300, .kernel_size=3, .in_channels=64, .out_channels=64, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=300, .pool_size=2, .pool_stride=2, .pool_padding=0, .out_dim_pooled=150, .output_scale=1.0}; + +static const elem_t conv_3_w[128][3][3][64] = {0}; +static elem_t conv_3_out[150][150][128] = {0}; +static elem_t conv_3_b[150][150][128] = {0};//2_1 +static const struct ConvParamsSimple conv_3_params = {.batch_size=1, .in_dim=150, .kernel_size=3, .in_channels=64, .out_channels=128, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=150, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=112, .output_scale=1.0}; + +static const elem_t conv_4_w[128][3][3][128] = {0}; +static elem_t conv_4_out[75][75][128] = {0}; +static elem_t conv_4_b[75][75][128] = {0};//2_2 +static const struct ConvParamsSimple conv_4_params = {.batch_size=1, .in_dim=150, .kernel_size=3, .in_channels=128, .out_channels=128, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=150, .pool_size=2, .pool_stride=2, .pool_padding=0, .out_dim_pooled=75, .output_scale=1.0}; + +static const elem_t conv_5_w[256][3][3][128] = {0}; +static elem_t conv_5_out[75][75][256] = {0}; +static elem_t conv_5_b[75][75][256] = {0};//3_1 +static const struct ConvParamsSimple conv_5_params = {.batch_size=1, .in_dim=75, .kernel_size=3, .in_channels=128, .out_channels=256, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=75, .pool_size=1, .pool_stride=1, 
.pool_padding=0, .out_dim_pooled=75, .output_scale=1.0}; + +static const elem_t conv_6_w[256][3][3][256] = {0}; +static elem_t conv_6_out[75][75][256] = {0}; +static elem_t conv_6_b[75][75][256] = {0};//3_2 +static const struct ConvParamsSimple conv_6_params = {.batch_size=1, .in_dim=75, .kernel_size=3, .in_channels=256, .out_channels=256, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=75, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=75, .output_scale=1.0}; + +static const elem_t conv_7_w[256][3][3][256] = {0}; +static elem_t conv_7_out[38][38][256] = {0}; +static elem_t conv_7_b[38][38][256] = {0};//3_3 +static const struct ConvParamsSimple conv_7_params = {.batch_size=1, .in_dim=75, .kernel_size=3, .in_channels=256, .out_channels=256, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=75, .pool_size=2, .pool_stride=2, .pool_padding=1, .out_dim_pooled=38, .output_scale=1.0}; + +static const elem_t conv_8_w[512][3][3][256] = {0}; +static elem_t conv_8_out[38][38][512] = {0}; +static elem_t conv_8_b[38][38][512] = {0};//4_1 +static const struct ConvParamsSimple conv_8_params = {.batch_size=1, .in_dim=38, .kernel_size=3, .in_channels=256, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=38, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=28, .output_scale=1.0}; + +static const elem_t conv_9_w[512][3][3][512] = {0}; +static elem_t conv_9_out[38][38][512] = {0}; +static elem_t conv_9_b[38][38][512] = {0};//4_2 +static const struct ConvParamsSimple conv_9_params = {.batch_size=1, .in_dim=38, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=38, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=28, .output_scale=1.0}; + +static const elem_t conv_10_w[512][3][3][512] = {0}; +static elem_t conv_10_out[19][19][512] = {0}; +static elem_t conv_10_b[19][19][512] = {0};//4_3 +static const struct ConvParamsSimple conv_10_params = 
{.batch_size=1, .in_dim=38, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=38, .pool_size=2, .pool_stride=2, .pool_padding=0, .out_dim_pooled=19, .output_scale=1.0}; + +static const elem_t conv_11_w[512][3][3][512] = {0}; +static elem_t conv_11_out[19][19][512] = {0}; +static elem_t conv_11_b[19][19][512] = {0};//5_1 +static const struct ConvParamsSimple conv_11_params = {.batch_size=1, .in_dim=19, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=19, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=19, .output_scale=1.0}; + +static const elem_t conv_12_w[512][3][3][512] = {0}; +static elem_t conv_12_out[19][19][512] = {0}; +static elem_t conv_12_b[19][19][512] = {0};//5_2 +static const struct ConvParamsSimple conv_12_params = {.batch_size=1, .in_dim=19, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=19, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=19, .output_scale=1.0}; + +static const elem_t conv_13_w[512][3][3][512] = {0}; +static elem_t conv_13_out[19][19][512] = {0}; +static elem_t conv_13_b[19][19][512] = {0};//5_3 +static const struct ConvParamsSimple conv_13_params = {.batch_size=1, .in_dim=19, .kernel_size=3, .in_channels=512, .out_channels=512, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=19, .pool_size=3, .pool_stride=1, .pool_padding=1, .out_dim_pooled=19, .output_scale=1.0}; + + +static const elem_t conv_aux_6_1_w[1024][1][1][512] = {0}; +static elem_t conv_aux_6_1_out[19][19][1024] = {0}; +static elem_t conv_aux_6_1_b[19][19][1024] = {0};//6_1 +static const struct ConvParamsSimple conv_aux_6_1_params = {.batch_size=1, .in_dim=19, .kernel_size=1, .in_channels=512, .out_channels=1024, .stride=1, .padding=0, .bias=0, .depthwise=0, .out_dim=19, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=19, .output_scale=1.0}; + + 
+static const elem_t conv_aux_7_1_w[1024][3][3][1024] = {0}; +static elem_t conv_aux_7_1_out[19][19][1024] = {0}; +static elem_t conv_aux_7_1_b[19][19][1024] = {0};//7_1 +static const struct ConvParamsSimple conv_aux_7_1_params = {.batch_size=1, .in_dim=19, .kernel_size=3, .in_channels=1024, .out_channels=1024, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=19, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=19, .output_scale=1.0}; + + + +// EXTRA AUXILLIARY CONV FUNCTIONS + +static const elem_t conv_aux_8_1_w[256][1][1][1024] = {0}; +static elem_t conv_aux_8_1_out[19][19][256] = {0}; +static elem_t conv_aux_8_1_b[19][19][256] = {0};// +static const struct ConvParamsSimple conv_aux_8_1_params = {.batch_size=1, .in_dim=19, .kernel_size=1, .in_channels=1024, .out_channels=256, .stride=1, .padding=0, .bias=0, .depthwise=0, .out_dim=19, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=19, .output_scale=1.0}; + +static const elem_t conv_aux_8_2_w[512][3][3][256] = {0}; +static elem_t conv_aux_8_2_out[10][10][512] = {0}; +static elem_t conv_aux_8_2_b[10][10][512] = {0};// +static const struct ConvParamsSimple conv_aux_8_2_params = {.batch_size=1, .in_dim=19, .kernel_size=3, .in_channels=256, .out_channels=512, .stride=2, .padding=1, .bias=0, .depthwise=0, .out_dim=10, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=10, .output_scale=1.0}; + + +static const elem_t conv_aux_9_1_w[128][1][1][512] = {0}; +static elem_t conv_aux_9_1_out[10][10][128] = {0}; +static elem_t conv_aux_9_1_b[10][10][128] = {0};// +static const struct ConvParamsSimple conv_aux_9_1_params = {.batch_size=1, .in_dim=10, .kernel_size=1, .in_channels=512, .out_channels=128, .stride=1, .padding=0, .bias=0, .depthwise=0, .out_dim=10, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=10, .output_scale=1.0}; + +static const elem_t conv_aux_9_2_w[256][3][3][128] = {0}; +static elem_t conv_aux_9_2_out[5][5][256] = {0}; +static elem_t
conv_aux_9_2_b[5][5][256] = {0};// +static const struct ConvParamsSimple conv_aux_9_2_params = {.batch_size=1, .in_dim=10, .kernel_size=3, .in_channels=128, .out_channels=256, .stride=2, .padding=1, .bias=0, .depthwise=0, .out_dim=5, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=5, .output_scale=1.0}; + +static const elem_t conv_aux_10_1_w[128][1][1][256] = {0}; +static elem_t conv_aux_10_1_out[5][5][128] = {0}; +static elem_t conv_aux_10_1_b[5][5][128] = {0};// +static const struct ConvParamsSimple conv_aux_10_1_params = {.batch_size=1, .in_dim=5, .kernel_size=1, .in_channels=256, .out_channels=128, .stride=1, .padding=0, .bias=0, .depthwise=0, .out_dim=5, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=5, .output_scale=1.0}; + +static const elem_t conv_aux_10_2_w[256][3][3][128] = {0}; +static elem_t conv_aux_10_2_out[3][3][256] = {0}; +static elem_t conv_aux_10_2_b[3][3][256] = {0};// +static const struct ConvParamsSimple conv_aux_10_2_params = {.batch_size=1, .in_dim=5, .kernel_size=3, .in_channels=128, .out_channels=256, .stride=1, .padding=0, .bias=0, .depthwise=0, .out_dim=3, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=3, .output_scale=1.0}; + + +static const elem_t conv_aux_11_1_w[128][1][1][256] = {0}; +static elem_t conv_aux_11_1_out[3][3][128] = {0}; +static elem_t conv_aux_11_1_b[3][3][128] = {0}; +static const struct ConvParamsSimple conv_aux_11_1_params = {.batch_size=1, .in_dim=3, .kernel_size=1, .in_channels=256, .out_channels=128, .stride=1, .padding=0, .bias=0, .depthwise=0, .out_dim=3, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=3, .output_scale=1.0}; + +static const elem_t conv_aux_11_2_w[256][3][3][128] = {0}; +static elem_t conv_aux_11_2_out[1][1][256] = {0}; +static elem_t conv_aux_11_2_b[1][1][256] = {0}; +static const struct ConvParamsSimple conv_aux_11_2_params = {.batch_size=1, .in_dim=3, .kernel_size=3, .in_channels=128, .out_channels=256, .stride=1, .padding=0,
.bias=0, .depthwise=0, .out_dim=1, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=1, .output_scale=1.0}; + +/* +* LOCALIZATION PREDICTION CONVOLUTIONS +*/ + +static const elem_t conv_loc_pred_1_w[16][3][3][512] = {0}; +static elem_t conv_loc_pred_1_out[38][38][16] = {0}; +static elem_t conv_loc_pred_1_b[38][38][16] = {0}; +static const struct ConvParamsSimple conv_loc_pred_1_params = {.batch_size=1, .in_dim=38, .kernel_size=3, .in_channels=512, .out_channels=16, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=38, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=38, .output_scale=1.0}; + +static const elem_t conv_loc_pred_2_w[24][3][3][1024] = {0}; +static elem_t conv_loc_pred_2_out[19][19][24] = {0}; +static elem_t conv_loc_pred_2_b[19][19][24] = {0}; +static const struct ConvParamsSimple conv_loc_pred_2_params = {.batch_size=1, .in_dim=19, .kernel_size=3, .in_channels=1024, .out_channels=24, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=19, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=19, .output_scale=1.0}; + +static const elem_t conv_loc_pred_3_w[24][3][3][512] = {0}; +static elem_t conv_loc_pred_3_out[10][10][24] = {0}; +static elem_t conv_loc_pred_3_b[10][10][24] = {0}; +static const struct ConvParamsSimple conv_loc_pred_3_params = {.batch_size=1, .in_dim=10, .kernel_size=3, .in_channels=512, .out_channels=24, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=10, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=10, .output_scale=1.0}; + +static const elem_t conv_loc_pred_4_w[24][3][3][256] = {0}; +static elem_t conv_loc_pred_4_out[5][5][24] = {0}; +static elem_t conv_loc_pred_4_b[5][5][24] = {0}; +static const struct ConvParamsSimple conv_loc_pred_4_params = {.batch_size=1, .in_dim=5, .kernel_size=3, .in_channels=256, .out_channels=24, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=5, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=5, .output_scale=1.0}; 
+ +static const elem_t conv_loc_pred_5_w[16][3][3][256] = {0}; +static elem_t conv_loc_pred_5_out[3][3][16] = {0}; +static elem_t conv_loc_pred_5_b[3][3][16] = {0}; +static const struct ConvParamsSimple conv_loc_pred_5_params = {.batch_size=1, .in_dim=3, .kernel_size=3, .in_channels=256, .out_channels=16, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=3, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=3, .output_scale=1.0}; + +static const elem_t conv_loc_pred_6_w[16][3][3][256] = {0}; +static elem_t conv_loc_pred_6_out[1][1][16] = {0}; +static elem_t conv_loc_pred_6_b[1][1][16] = {0}; +static const struct ConvParamsSimple conv_loc_pred_6_params = {.batch_size=1, .in_dim=1, .kernel_size=3, .in_channels=256, .out_channels=16, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=1, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=1, .output_scale=1.0}; + +/* +* CLASS PREDICTION CONVOLUTIONS +*/ +static const elem_t conv_class_pred_1_w[64][3][3][512] = {0}; +static elem_t conv_class_pred_1_out[38][38][64] = {0}; +static elem_t conv_class_pred_1_b[38][38][64] = {0}; +static const struct ConvParamsSimple conv_class_pred_1_params = {.batch_size=1, .in_dim=38, .kernel_size=3, .in_channels=512, .out_channels=64, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=38, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=38, .output_scale=1.0}; + +static const elem_t conv_class_pred_2_w[96][3][3][1024] = {0}; +static elem_t conv_class_pred_2_out[19][19][96] = {0}; +static elem_t conv_class_pred_2_b[19][19][96] = {0}; +static const struct ConvParamsSimple conv_class_pred_2_params = {.batch_size=1, .in_dim=19, .kernel_size=3, .in_channels=1024, .out_channels=96, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=19, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=19, .output_scale=1.0}; + +static const elem_t conv_class_pred_3_w[96][3][3][512] = {0}; +static elem_t conv_class_pred_3_out[10][10][96] = {0}; 
+static elem_t conv_class_pred_3_b[10][10][96] = {0}; +static const struct ConvParamsSimple conv_class_pred_3_params = {.batch_size=1, .in_dim=10, .kernel_size=3, .in_channels=512, .out_channels=96, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=10, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=10, .output_scale=1.0}; + +static const elem_t conv_class_pred_4_w[96][3][3][256] = {0}; +static elem_t conv_class_pred_4_out[5][5][96] = {0}; +static elem_t conv_class_pred_4_b[5][5][96] = {0}; +static const struct ConvParamsSimple conv_class_pred_4_params = {.batch_size=1, .in_dim=5, .kernel_size=3, .in_channels=256, .out_channels=96, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=5, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=5, .output_scale=1.0}; + +static const elem_t conv_class_pred_5_w[64][3][3][256] = {0}; +static elem_t conv_class_pred_5_out[3][3][64] = {0}; +static elem_t conv_class_pred_5_b[3][3][64] = {0}; +static const struct ConvParamsSimple conv_class_pred_5_params = {.batch_size=1, .in_dim=3, .kernel_size=3, .in_channels=256, .out_channels=64, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=3, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=3, .output_scale=1.0}; + +static const elem_t conv_class_pred_6_w[64][3][3][256] = {0}; +static elem_t conv_class_pred_6_out[1][1][64] = {0}; +static elem_t conv_class_pred_6_b[1][1][64] = {0}; +static const struct ConvParamsSimple conv_class_pred_6_params = {.batch_size=1, .in_dim=1, .kernel_size=3, .in_channels=256, .out_channels=64, .stride=1, .padding=1, .bias=0, .depthwise=0, .out_dim=1, .pool_size=1, .pool_stride=1, .pool_padding=0, .out_dim_pooled=1, .output_scale=1.0}; + + +#endif /* F2F15EE8_05C8_43F3_96E4_FAF8C462ABFB */ diff --git a/include/gemmini.h b/include/gemmini.h index 76b50a48..bdd9343c 100644 --- a/include/gemmini.h +++ b/include/gemmini.h @@ -15,6 +15,7 @@ #include "include/gemmini_params.h" #define GEMMINI_ASSERTIONS 
+//#define PRINT_TILE // Accelerator interface #include "rocc-software/src/xcustom.h" @@ -54,6 +55,7 @@ #define CONFIG_LD 1 #define CONFIG_ST 2 #define CONFIG_BERT 3 +#define CONFIG_ACT 4 #define GARBAGE_ADDR ((uint32_t)(-1)) #define OUTPUT_STATIONARY 0 @@ -284,6 +286,9 @@ static acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) { #define gemmini_config_norm(q_const, q_const_type, set_stats_id_only, act_msb, stat_id, igelu_qb, igelu_qc) \ ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, (((uint64_t) ((uint32_t) q_const)) << 32) | ((q_const_type & 1) << 18) | ((set_stats_id_only & 1) << 17) | ((act_msb & 1) << 16) | ((uint64_t)stat_id << 8) | CONFIG_BERT, ((uint64_t)((uint32_t)(igelu_qc)) << 32) | ((uint64_t)((uint32_t)(igelu_qb))), k_CONFIG) +#define gemmini_config_activation(offset, shift, activation) \ + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)offset << 48) | ((uint64_t)shift << 32) | ((uint64_t) activation << 2) | ((uint64_t) CONFIG_ACT), 0, k_CONFIG) + // flush #define gemmini_flush(skip) \ ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, skip, 0, k_FLUSH) @@ -1328,7 +1333,7 @@ static void sp_tiled_conv( int out_channels, int out_row_dim, int out_col_dim, int pool_out_row_dim, int pool_out_col_dim, - int stride, int padding, int kernel_dim, int kernel_dilation, + int stride, int padding, int kernel_row_dim, int kernel_col_dim, int kernel_dilation, int pool_size, int pool_stride, int pool_padding, @@ -1386,6 +1391,7 @@ static void sp_tiled_conv( input_dilated || kernel_dilation > 1 || ichs > DIM ? 
1 : DIM/ichs; if (max_pixels_per_row > kcols) max_pixels_per_row = kcols; + if (kernel_col_dim != kernel_row_dim) max_pixels_per_row = 1; //TODO currently not supported #else const int max_pixels_per_row = 1; #endif @@ -1413,8 +1419,8 @@ static void sp_tiled_conv( C_sp_addr_row = (C_sp_addr_row + ACC_ROWS / 2) % ACC_ROWS; } - if (in_row_dim == in_col_dim && out_row_dim == out_col_dim && pool_out_row_dim == pool_out_col_dim) { - gemmini_loop_conv_ws(batch_size, in_row_dim, in_channels, out_channels, out_row_dim, pool_out_row_dim, stride, padding, kernel_dim, kernel_dilation, pool_size, pool_stride, pool_padding, batches, porows, pocols, pochs, krows, kcols, kchs, lpad, rpad, upad, dpad, plpad, prpad, pupad, pdpad, orows, ocols, weights, output, bias, input, no_bias, no_pool, downsample, wrot180, input_dilated, act, trans_output_1203, trans_weight_1203, trans_weight_0132, trans_input_3120, max_pixels_per_row, dw); + if (in_row_dim == in_col_dim && out_row_dim == out_col_dim && pool_out_row_dim == pool_out_col_dim && kernel_row_dim == kernel_col_dim) { + gemmini_loop_conv_ws(batch_size, in_row_dim, in_channels, out_channels, out_row_dim, pool_out_row_dim, stride, padding, kernel_row_dim, kernel_dilation, pool_size, pool_stride, pool_padding, batches, porows, pocols, pochs, krows, kcols, kchs, lpad, rpad, upad, dpad, plpad, prpad, pupad, pdpad, orows, ocols, weights, output, bias, input, no_bias, no_pool, downsample, wrot180, input_dilated, act, trans_output_1203, trans_weight_1203, trans_weight_0132, trans_input_3120, max_pixels_per_row, dw); return; } @@ -1540,7 +1546,7 @@ static void sp_tiled_conv( if (dw) { dram_stride = sizeof(elem_t); } else if (trans_weight_1203) { - dram_stride = kernel_dim * kernel_dim * out_channels * sizeof(elem_t); + dram_stride = kernel_row_dim * kernel_col_dim * out_channels * sizeof(elem_t); } else if (trans_weight_0132) { dram_stride = in_channels * sizeof(elem_t); } @@ -1569,13 +1575,13 @@ static void sp_tiled_conv( B_sp_addr = 
B_sp_addr_start + (kch / DIM) * krows * kcols * ochs + krow * kcols * ochs + kcol * ochs + och; } - const elem_t * w = weights + (krow*kernel_dim*in_channels + kcol*in_channels + kch) * out_channels + och; + const elem_t * w = weights + (krow*kernel_col_dim*in_channels + kcol*in_channels + kch) * out_channels + och; if (dw) { - w = weights + krow * kernel_dim + kcol; + w = weights + krow * kernel_col_dim + kcol; } else if (trans_weight_1203) { - w = weights + (kch * kernel_dim * kernel_dim + krow * kernel_dim + kcol) * out_channels + och; + w = weights + (kch * kernel_row_dim * kernel_col_dim + krow * kernel_col_dim + kcol) * out_channels + och; } else if (trans_weight_0132) { - w = weights + (krow * kernel_dim * out_channels + kcol * out_channels + och) * in_channels + kch; + w = weights + (krow * kernel_col_dim * out_channels + kcol * out_channels + och) * in_channels + kch; } gemmini_extended_mvin2(w, B_sp_addr, J, K); @@ -1810,7 +1816,7 @@ static int tiled_conv_total_spad_rows(bool acc, static void conv_cpu_without_pool( int batch_size, int in_row_dim, int in_col_dim, int in_channels, int out_channels, int out_row_dim, int out_col_dim, - int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim, + int stride, int input_dilation, int kernel_dilation, int padding, int kernel_row_dim, int kernel_col_dim, bool wrot180, bool trans_output_1203, bool trans_input_3120, bool trans_weight_1203, bool trans_weight_0132, @@ -1830,13 +1836,13 @@ static void conv_cpu_without_pool( acc_t opixel = no_bias ? 
0 : bias[och]; - for (int krow = 0; krow < kernel_dim; krow++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { if ((orow * stride + krow * kernel_dilation - padding) % input_dilation != 0) continue; const int irow = (orow * stride + krow * kernel_dilation - padding) / input_dilation; - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { if ((ocol * stride + kcol * kernel_dilation - padding) % input_dilation != 0) continue; @@ -1852,16 +1858,16 @@ static void conv_cpu_without_pool( elem_t ipixel = irow < 0 || irow >= in_row_dim || icol < 0 || icol >= in_col_dim ? 0 : *in; - const int krow_ = wrot180 ? kernel_dim - krow - 1 : krow; - const int kcol_ = wrot180 ? kernel_dim - kcol - 1 : kcol; + const int krow_ = wrot180 ? kernel_row_dim - krow - 1 : krow; + const int kcol_ = wrot180 ? kernel_col_dim - kcol - 1 : kcol; - elem_t weight = *(weights + (krow_ * kernel_dim * in_channels + kcol_ * in_channels + kch) * out_channels + och); + elem_t weight = *(weights + (krow_ * kernel_col_dim * in_channels + kcol_ * in_channels + kch) * out_channels + och); //TODO Check if (trans_weight_1203) { // HWIO to WIHO - weight = *(weights + (kch * kernel_dim * kernel_dim + krow_ * kernel_dim + kcol_) * out_channels + och); + weight = *(weights + (kch * kernel_row_dim * kernel_col_dim + krow_ * kernel_col_dim + kcol_) * out_channels + och); //TODO Check } else if (trans_weight_0132) { // HWIO to HWOI - weight = *(weights + (krow_ * kernel_dim * out_channels + kcol_ * out_channels + och) * in_channels + kch); + weight = *(weights + (krow_ * kernel_col_dim * out_channels + kcol_ * out_channels + och) * in_channels + kch); } opixel += weight * ipixel; @@ -1886,7 +1892,7 @@ static void conv_cpu_without_pool( static void conv_dw_cpu_without_pool( int batch_size, int in_row_dim, int in_col_dim, int channels, int out_row_dim, int out_col_dim, - int stride, int padding, int kernel_dim, + int stride, int padding, int kernel_row_dim, int 
kernel_col_dim, const elem_t * input, const elem_t * weights, @@ -1903,10 +1909,10 @@ static void conv_dw_cpu_without_pool( for (int ch = 0; ch < channels; ch++) { acc_t opixel = no_bias ? 0 : bias[ch]; - for (int krow = 0; krow < kernel_dim; krow++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { const int irow = orow * stride + krow - padding; - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { const int icol = ocol * stride + kcol - padding; const elem_t * in = input + (b * in_row_dim * in_col_dim + irow * in_col_dim + icol) * channels + ch; @@ -1914,7 +1920,7 @@ static void conv_dw_cpu_without_pool( const elem_t ipixel = irow < 0 || irow >= in_row_dim || icol < 0 || icol >= in_col_dim ? 0 : *in; - const elem_t weight = *(weights + (ch * kernel_dim + krow) * kernel_dim + kcol); + const elem_t weight = *(weights + (ch * kernel_row_dim + krow) * kernel_col_dim + kcol); opixel += weight * ipixel; } @@ -1933,7 +1939,7 @@ static void conv_dw_cpu_without_pool( static void conv_cpu( int batch_size, int in_row_dim, int in_col_dim, int in_channels, int out_channels, int out_row_dim, int out_col_dim, - int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim, + int stride, int input_dilation, int kernel_dilation, int padding, int kernel_row_dim, int kernel_col_dim, bool wrot180, bool trans_output_1203, bool trans_input_3120, bool trans_weight_1203, bool trans_weight_0132, @@ -1950,7 +1956,7 @@ static void conv_cpu( conv_cpu_without_pool( batch_size, in_row_dim, in_col_dim, in_channels, out_channels, out_row_dim, out_col_dim, - stride, input_dilation, kernel_dilation, padding, kernel_dim, + stride, input_dilation, kernel_dilation, padding, kernel_row_dim, kernel_col_dim, wrot180, trans_output_1203, trans_input_3120, trans_weight_1203, trans_weight_0132, input, weights, bias, output, @@ -1985,13 +1991,13 @@ static void conv_cpu( acc_t opixel = no_bias ? 
0 : bias[poch]; - for (int krow = 0; krow < kernel_dim; krow++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { if ((orow * stride + krow * kernel_dilation - padding) % input_dilation != 0) continue; const int irow = (orow * stride + krow * kernel_dilation - padding) / input_dilation; - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { if ((ocol * stride + kcol * kernel_dilation - padding) % input_dilation != 0) continue; @@ -2007,16 +2013,16 @@ static void conv_cpu( elem_t ipixel = irow < 0 || irow >= in_row_dim || icol < 0 || icol >= in_col_dim ? 0 : *in; - const int krow_ = wrot180 ? kernel_dim - krow - 1 : krow; - const int kcol_ = wrot180 ? kernel_dim - kcol - 1 : kcol; + const int krow_ = wrot180 ? kernel_row_dim - krow - 1 : krow; + const int kcol_ = wrot180 ? kernel_col_dim - kcol - 1 : kcol; - elem_t weight = *(weights + (krow_ * kernel_dim * in_channels + kcol_ * in_channels + kch) * out_channels + poch); + elem_t weight = *(weights + (krow_ * kernel_col_dim * in_channels + kcol_ * in_channels + kch) * out_channels + poch); if (trans_weight_1203) { // HWIO to WIHO - weight = *(weights + (kch * kernel_dim * kernel_dim + krow_ * kernel_dim + kcol_) * out_channels + poch); + weight = *(weights + (kch * kernel_row_dim * kernel_col_dim + krow_ * kernel_col_dim + kcol_) * out_channels + poch); } else if (trans_weight_0132) { // HWIO to HWOI - weight = *(weights + (krow_ * kernel_dim * out_channels + kcol_ * out_channels + poch) * in_channels + kch); + weight = *(weights + (krow_ * kernel_col_dim * out_channels + kcol_ * out_channels + poch) * in_channels + kch); } opixel += weight * ipixel; @@ -2052,7 +2058,7 @@ static void conv_cpu( static void conv_dw_cpu( int batch_size, int in_row_dim, int in_col_dim, int channels, int out_row_dim, int out_col_dim, - int stride, int padding, int kernel_dim, + int stride, int padding, int kernel_row_dim, int kernel_col_dim, const elem_t * input, const elem_t * 
weights, @@ -2067,7 +2073,7 @@ static void conv_dw_cpu( conv_dw_cpu_without_pool( batch_size, in_row_dim, in_col_dim, channels, out_row_dim, out_col_dim, - stride, padding, kernel_dim, + stride, padding, kernel_row_dim, kernel_col_dim, input, weights, bias, output, act, scale); return; @@ -2100,10 +2106,10 @@ static void conv_dw_cpu( acc_t opixel = no_bias ? 0 : bias[ch]; - for (int krow = 0; krow < kernel_dim; krow++) { + for (int krow = 0; krow < kernel_row_dim; krow++) { const int irow = orow * stride + krow - padding; - for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int kcol = 0; kcol < kernel_col_dim; kcol++) { const int icol = ocol * stride + kcol - padding; const elem_t * in = input + (b * in_row_dim * in_col_dim + irow * in_col_dim + icol) * channels + ch; @@ -2111,7 +2117,7 @@ static void conv_dw_cpu( elem_t ipixel = irow < 0 || irow >= in_row_dim || icol < 0 || icol >= in_col_dim ? 0 : *in; - const elem_t weight = *(weights + (ch * kernel_dim + krow) * kernel_dim + kcol); + const elem_t weight = *(weights + (ch * kernel_row_dim + krow) * kernel_col_dim + kcol); opixel += weight * ipixel; } @@ -2142,7 +2148,8 @@ static void conv_dw_cpu( static void tiled_conv( int batch_size, int in_row_dim, int in_col_dim, int in_channels, int out_channels, int out_row_dim, int out_col_dim, - int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim, + int stride, int input_dilation, int kernel_dilation, int padding, + int kernel_row_dim, int kernel_col_dim, bool wrot180, bool trans_output_1203, bool trans_input_3120, bool trans_weight_1203, bool trans_weight_0132, @@ -2177,7 +2184,7 @@ static void tiled_conv( conv_cpu( batch_size, in_row_dim, in_col_dim, in_channels, out_channels, out_row_dim, out_col_dim, - stride, input_dilation, kernel_dilation, padding, kernel_dim, + stride, input_dilation, kernel_dilation, padding, kernel_row_dim, kernel_col_dim, wrot180, trans_output_1203, trans_input_3120, trans_weight_1203, trans_weight_0132, input, weights, bias, output, 
@@ -2204,7 +2211,7 @@ static void tiled_conv( pool_padding = 0; } - const bool downsample = stride == 2 && kernel_dim == 1 && in_row_dim % 2 == 0 && in_col_dim % 2 == 0 + const bool downsample = stride == 2 && kernel_row_dim == 1 && kernel_col_dim == 1 && in_row_dim % 2 == 0 && in_col_dim % 2 == 0 && padding == 0 && no_pool && input_dilation == 1 && !trans_input_3120; const int input_dilated = input_dilation == 2; @@ -2230,7 +2237,7 @@ static void tiled_conv( printf("not enough accumulator space to store outputs\n"); exit(1); } - if (kernel_dim <= padding) { + if (kernel_row_dim <= padding || kernel_col_dim <= padding) { printf("kernel_dim must be larger than padding\n"); exit(1); } @@ -2273,11 +2280,11 @@ static void tiled_conv( const int ocol = pocol * pool_stride - pool_padding; for (int poch = 0; poch < out_channels; poch += pochs) { - for (int krow = 0; krow < kernel_dim; krow += krows) { + for (int krow = 0; krow < kernel_row_dim; krow += krows) { const int orow_floored = orow < 0 ? 0 : orow; int irow = orow_floored * stride + krow * kernel_dilation - padding; - for (int kcol = 0; kcol < kernel_dim; kcol += kcols) { + for (int kcol = 0; kcol < kernel_col_dim; kcol += kcols) { const int ocol_floored = ocol < 0 ? 0 : ocol; int icol = ocol_floored * stride + kcol * kernel_dilation - padding; @@ -2287,8 +2294,8 @@ static void tiled_conv( out = output + (porow * pool_out_col_dim * batch_size + pocol * batch_size + b) * out_channels + poch; } - if (krow + krows < kernel_dim || - kcol + kcols < kernel_dim || + if (krow + krows < kernel_row_dim || + kcol + kcols < kernel_col_dim || kch + kchs < in_channels) { out = NULL; } @@ -2304,8 +2311,8 @@ static void tiled_conv( const int porows_ = pool_out_row_dim - porow > porows ? porows : pool_out_row_dim - porow; const int pocols_ = pool_out_col_dim - pocol > pocols ? pocols : pool_out_col_dim - pocol; const int pochs_ = out_channels - poch > pochs ? 
pochs : out_channels - poch; - const int krows_ = kernel_dim - krow > krows ? krows : kernel_dim - krow; - const int kcols_ = kernel_dim - kcol > kcols ? kcols : kernel_dim - kcol; + const int krows_ = kernel_row_dim - krow > krows ? krows : kernel_row_dim - krow; + const int kcols_ = kernel_col_dim - kcol > kcols ? kcols : kernel_col_dim - kcol; const int kchs_ = in_channels - kch > kchs ? kchs : in_channels - kch; const int ocols_ = pocols_ * pool_stride + pool_size - 1; @@ -2337,15 +2344,17 @@ static void tiled_conv( int krow_ = krow; int kcol_ = kcol; if (wrot180) { - krow_ = kernel_dim - krow - krows_; - kcol_ = kernel_dim - kcol - kcols_; + krow_ = kernel_row_dim - krow - krows_; + kcol_ = kernel_col_dim - kcol - kcols_; } - const elem_t * weights_slice = weights + (krow_*kernel_dim*in_channels + kcol_*in_channels + kch) * out_channels + poch; + const elem_t * weights_slice = weights + (krow_*kernel_col_dim*in_channels + kcol_*in_channels + kch) * out_channels + poch; if (trans_weight_1203) { - weights_slice = weights + (kch*kernel_dim*kernel_dim + krow_*kernel_dim+kcol_) * out_channels + poch; + // HWIO to WIHO + weights_slice = weights + (kch*kernel_row_dim*kernel_col_dim + krow_*kernel_col_dim+kcol_) * out_channels + poch; //TODO check } else if (trans_weight_0132) { - weights_slice = weights + (krow_*kernel_dim*out_channels + kcol_*out_channels + poch) * in_channels + kch; + // HWIO to HWOI + weights_slice = weights + (krow_*kernel_col_dim*out_channels + kcol_*out_channels + poch) * in_channels + kch; //TODO check } const elem_t * in = input + (b *in_row_dim * in_col_dim + ((irow+upad)>>input_dilated) * in_col_dim + ((icol+lpad)>>input_dilated)) * in_channels + kch; @@ -2358,7 +2367,7 @@ static void tiled_conv( out_channels, out_row_dim, out_col_dim, pool_out_row_dim, pool_out_col_dim, - stride, padding, kernel_dim, kernel_dilation, + stride, padding, kernel_row_dim, kernel_col_dim, kernel_dilation, pool_size, pool_stride, pool_padding, @@ -2395,7 +2404,7 
@@ static void tiled_conv( static void tiled_conv_dw( int batch_size, int in_row_dim, int in_col_dim, int channels, int out_row_dim, int out_col_dim, - int stride, int padding, int kernel_dim, + int stride, int padding, int kernel_row_dim, int kernel_col_dim, int batches, int porows, int pocols, @@ -2419,7 +2428,7 @@ static void tiled_conv_dw( conv_dw_cpu( batch_size, in_row_dim, in_col_dim, channels, out_row_dim, out_col_dim, - stride, padding, kernel_dim, + stride, padding, kernel_row_dim, kernel_col_dim, input, weights, bias, output, act, scale, pool_size, pool_stride, pool_padding); @@ -2465,8 +2474,12 @@ static void tiled_conv_dw( printf("not enough accumulator space to store outputs\n"); exit(1); } - if (kernel_dim <= padding) { - printf("kernel_dim must be larger than padding\n"); + if (kernel_col_dim <= padding ) { + printf("kernel_col_dim must be larger than padding\n"); + exit(1); + } + if (kernel_row_dim <= padding){ + printf("kernel_row_dim must be larger than padding\n"); exit(1); } } @@ -2488,18 +2501,18 @@ static void tiled_conv_dw( const int ocol = pocol * pool_stride - pool_padding; for (int ch = 0; ch < channels; ch++) { - for (int krow = 0; krow < kernel_dim; krow += krows) { + for (int krow = 0; krow < kernel_row_dim; krow += krows) { const int orow_floored = orow < 0 ? 0 : orow; int irow = orow_floored * stride + krow - padding; - for (int kcol = 0; kcol < kernel_dim; kcol += kcols) { + for (int kcol = 0; kcol < kernel_col_dim; kcol += kcols) { const int ocol_floored = ocol < 0 ? 0 : ocol; int icol = ocol_floored * stride + kcol - padding; elem_t * out = output + (b * pool_out_row_dim * pool_out_col_dim + porow * pool_out_col_dim + pocol) * channels + ch; - if (krow + krows < kernel_dim || - kcol + kcols < kernel_dim) { + if (krow + krows < kernel_row_dim || + kcol + kcols < kernel_col_dim) { out = NULL; } @@ -2512,8 +2525,8 @@ static void tiled_conv_dw( const int batches_ = batch_size - b > batches ? 
batches : batch_size - b; const int porows_ = pool_out_row_dim - porow > porows ? porows : pool_out_row_dim - porow; const int pocols_ = pool_out_col_dim - pocol > pocols ? pocols : pool_out_col_dim - pocol; - const int krows_ = kernel_dim - krow > krows ? krows : kernel_dim - krow; - const int kcols_ = kernel_dim - kcol > kcols ? kcols : kernel_dim - kcol; + const int krows_ = kernel_row_dim - krow > krows ? krows : kernel_row_dim - krow; + const int kcols_ = kernel_col_dim - kcol > kcols ? kcols : kernel_col_dim - kcol; const int ocols_ = pocols_ * pool_stride + pool_size - 1; const int orows_ = porows_ * pool_stride + pool_size - 1; @@ -2531,7 +2544,7 @@ static void tiled_conv_dw( int upad = irow < 0 ? -irow : 0; int dpad = irow + irows_ > in_row_dim ? irow + irows_ - in_row_dim : 0; - const elem_t * weights_slice = weights + (ch*kernel_dim + krow) * kernel_dim + kcol; + const elem_t * weights_slice = weights + (ch*kernel_row_dim + krow) * kernel_col_dim + kcol; const elem_t *in = input + (b * in_row_dim * in_col_dim + (irow+upad) * in_col_dim + (icol+lpad)) * channels + ch; @@ -2540,7 +2553,7 @@ static void tiled_conv_dw( channels, out_row_dim, out_col_dim, pool_out_row_dim, pool_out_col_dim, - stride, padding, kernel_dim, 1, + stride, padding, kernel_row_dim, kernel_col_dim, 1, pool_size, pool_stride, pool_padding, @@ -2576,7 +2589,7 @@ static void tiled_conv_dw( static void tiled_conv_auto( int batch_size, int in_row_dim, int in_col_dim, int in_channels, int out_channels, int out_row_dim, int out_col_dim, - int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim, + int stride, int input_dilation, int kernel_dilation, int padding, int kernel_row_dim, int kernel_col_dim, bool wrot180, bool trans_output_1203, bool trans_input_3120, bool trans_weight_1203, bool trans_weight_0132, @@ -2600,13 +2613,13 @@ static void tiled_conv_auto( const int pool_out_row_dim = (out_row_dim + 2 * pool_padding - pool_size) / pool_stride + 1; const int 
pool_out_col_dim = (out_col_dim + 2 * pool_padding - pool_size) / pool_stride + 1; - const bool downsample = stride == 2 && kernel_dim == 1 && padding == 0 && no_pool && in_row_dim % 2 == 0 && in_col_dim % 2 == 0; + const bool downsample = stride == 2 && kernel_row_dim == 1 && kernel_col_dim == 1 && padding == 0 && no_pool && in_row_dim % 2 == 0 && in_col_dim % 2 == 0; // Tile convolution params // int args[] = {batch_size, porows, pocols, pochs, krows, kcols, kchs}; - int args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, out_channels, kernel_dim, kernel_dim, in_channels}; - const int max_args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, out_channels, kernel_dim, kernel_dim, in_channels}; + int args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, out_channels, kernel_row_dim, kernel_col_dim, in_channels}; //kernel_row_dim, kernel_col_dim + const int max_args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, out_channels, kernel_row_dim, kernel_col_dim, in_channels}; //kernel_row_dim, kernel_col_dim const int orows_idx = 1; const int ocols_idx = 2; @@ -2747,7 +2760,7 @@ static void tiled_conv_auto( tiled_conv( batch_size, in_row_dim, in_col_dim, in_channels, out_channels, out_row_dim, out_col_dim, - stride, input_dilation, kernel_dilation, padding, kernel_dim, + stride, input_dilation, kernel_dilation, padding, kernel_row_dim, kernel_col_dim, wrot180, trans_output_1203, trans_input_3120, trans_weight_1203, trans_weight_0132, @@ -2823,7 +2836,7 @@ static void tiled_conv_downsample( static void tiled_conv_dw_auto( int batch_size, int in_row_dim, int in_col_dim, int channels, int out_row_dim, int out_col_dim, - int stride, int padding, int kernel_dim, + int stride, int padding, int kernel_row_dim, int kernel_col_dim, elem_t * input, elem_t * weights, @@ -2848,8 +2861,8 @@ static void tiled_conv_dw_auto( // Tile convolution params // int args[] = {batch_size, porows, pocols, pochs, krows, kcols, kchs}; - int args[] = {batch_size, 
pool_out_row_dim, pool_out_col_dim, 1, kernel_dim, kernel_dim, 1}; - const int max_args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, 1, kernel_dim, kernel_dim, 1}; + int args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, 1, kernel_row_dim, kernel_col_dim, 1}; + const int max_args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, 1, kernel_row_dim, kernel_col_dim, 1}; const int orows_idx = 1; const int ocols_idx = 2; @@ -2985,7 +2998,7 @@ static void tiled_conv_dw_auto( tiled_conv_dw( batch_size, in_row_dim, in_col_dim, channels, out_row_dim, out_col_dim, - stride, padding, kernel_dim, + stride, padding, kernel_row_dim, kernel_col_dim, batches, orows, ocols, @@ -3288,5 +3301,74 @@ static void tiled_global_average_auto(const elem_t * input, elem_t * output, #undef abs +// Given input and output shape automatically calculate parameters to transform transposed conv +// into the equivalent convolution +// For doubling input size: stride=2, kernel_dim=2 +static void tiled_transposed_conv_auto( + int batch_size, int in_dim, int channels, int out_dim, + int stride, int padding, int kernel_dim, + + const elem_t * input, + const elem_t * weights, + elem_t * output, + + acc_scale_t scale + ) { + + //TODO + // Fixed: batch_size, in_dim, channels, out_dim, kernel_dim + // Free: dilation, padding -> to calculate + + // For every convolution there exists a corresponding transposed convolution with: + // k_tick = k + // s_tick = 1 + // p_tick = k - p - 1 + // input_dilation = s - 1 (for gemmini: s-1+1) + // a = (i + 2p - k ) % s (currently must be zero) + // o_tick = s(i_tick - 1) + a + k - 2p + // + // Special case where a = 0: + // o_tick = s(i_tick - 1) + k - 2p + // + // Gemmini supports maximum of input_dilation = 2 (corresponds to one zero between input elements) + + int input_dilation = stride; + + #ifdef GEMMINI_ASSERTIONS + if(input_dilation > 2){ + printf("stride can't be greater than two.\n"); + exit(1); + } + #endif + + int a = (in_dim + 2*padding - 
kernel_dim) % stride; //add a num of zeros to bottom and right of input + // TODO find a way to pad these zeros + if (a != 0){ + printf("Configurations where in_dim+2p-k=%d is not a multiple of s=%d are not yet supported.\n", in_dim+2*padding-kernel_dim, stride); + exit(1); + } + + int transposed_padding = kernel_dim - padding - 1; + if(transposed_padding < 0){ + printf("Selected padding results in shrinking image: padding must be <= %d", kernel_dim-1); + } + + tiled_conv_auto( + batch_size, in_dim, in_dim, channels, + channels, out_dim, out_dim, + 1, input_dilation, 1, transposed_padding, kernel_dim, kernel_dim, + false, false, false, false, false, + + (elem_t*)input, + (elem_t*)weights, + NULL, + (elem_t*)output, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + WS); + +} + #endif // SRC_MAIN_C_GEMMINI_H diff --git a/include/gemmini_nn.h b/include/gemmini_nn.h index 2a5315fa..4dad63b4 100644 --- a/include/gemmini_nn.h +++ b/include/gemmini_nn.h @@ -17,6 +17,8 @@ struct ConvParams { int out_row_dim; int out_col_dim; int kernel_size; + int kernel_row_dim; + int kernel_col_dim; int in_channels; int out_channels; int stride; @@ -32,6 +34,20 @@ struct ConvParams { int I, J, K; }; +struct ConvParamsSimple{ + int batch_size; + int in_dim, out_dim; + int kernel_size; + int in_channels; + int out_channels; + int stride; + int padding; + bool bias; + bool depthwise; + acc_scale_t output_scale; + int pool_size, pool_stride, pool_padding, out_dim_pooled; +}; + struct FcParams { int batch_size; int in_features; diff --git a/include/gemmini_params.h b/include/gemmini_params.h index 768ebf21..02d9e7ae 100644 --- a/include/gemmini_params.h +++ b/include/gemmini_params.h @@ -78,6 +78,7 @@ typedef uint32_t acc_scale_t_bits; #define ACC_SCALE_SIG_BITS 24 #define ACC_READ_SMALL_WIDTH +#define ACC_READ_FULL_WIDTH #define HAS_FIRST_LAYER_OPTIMIZATIONS diff --git a/riscv-tests b/riscv-tests index c84daca8..ec6537fc 160000 --- a/riscv-tests +++ b/riscv-tests @@ -1 +1 @@ -Subproject 
commit c84daca8824635b7d896003c78f9c6245997cf7a +Subproject commit ec6537fc4a527ca88be2f045e01c460e640ab9c5