diff --git a/bareMetalC/Makefile b/bareMetalC/Makefile index 8b537489..b9f82d31 100644 --- a/bareMetalC/Makefile +++ b/bareMetalC/Makefile @@ -49,10 +49,8 @@ tests = \ matrix_add \ resadd \ global_average \ + gemmini_counter \ template \ - # gemmini_counter \ - # reset_counters \ - # get_counters tests_baremetal = $(tests:=-baremetal) @@ -72,8 +70,6 @@ else tests_pk = $(tests:=-pk) endif -tests_pk = - BENCH_COMMON = $(abs_top_srcdir)/riscv-tests/benchmarks/common GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h @@ -94,6 +90,7 @@ CFLAGS := $(CFLAGS) \ -I$(abs_top_srcdir) \ -I$(BENCH_COMMON) \ -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ CFLAGS_PK := \ $(CFLAGS) \ diff --git a/bareMetalC/tiled_matmul_ws.c b/bareMetalC/tiled_matmul_ws.c index 550d9b1d..5bd3bb3f 100644 --- a/bareMetalC/tiled_matmul_ws.c +++ b/bareMetalC/tiled_matmul_ws.c @@ -22,9 +22,9 @@ typedef elem_t ACC_T; #endif #ifndef BAREMETAL -#define MAT_DIM_I 12544 // 512 -#define MAT_DIM_K 256 // 512 -#define MAT_DIM_J 64 // 512 +#define MAT_DIM_I 512 +#define MAT_DIM_K 512 +#define MAT_DIM_J 512 #else #define MAT_DIM_I 64 #define MAT_DIM_K 64 diff --git a/bareMetalC/tiled_matmul_ws_perf.c b/bareMetalC/tiled_matmul_ws_perf.c index 206bc17e..e25528ff 100644 --- a/bareMetalC/tiled_matmul_ws_perf.c +++ b/bareMetalC/tiled_matmul_ws_perf.c @@ -10,10 +10,7 @@ #endif #include "include/gemmini_testutils.h" -#define HEADS 1 - #define ACTIVATION NO_ACTIVATION -// #define ACTIVATION SOFTMAX #define NO_BIAS 0 #define REPEATING_BIAS 1 @@ -23,23 +20,15 @@ #ifndef BAREMETAL -#define MAT_DIM_I 128 // 128 // 256 -#define MAT_DIM_K 512 // 64 // 256 -#define MAT_DIM_J 512 // 128 // 256 +#define MAT_DIM_I 128 +#define MAT_DIM_K 512 +#define MAT_DIM_J 256 #else -// #define MAT_DIM_I 128 -// #define MAT_DIM_K 128 -// #define MAT_DIM_J 128 - -#define MAT_DIM_I 512 // 256 -#define MAT_DIM_K 512 // 256 -#define MAT_DIM_J 32 // 256 - -// 
#define MAT_DIM_I 256 -// #define MAT_DIM_K 512 -// #define MAT_DIM_J 512 +#define MAT_DIM_I 128 +#define MAT_DIM_K 256 +#define MAT_DIM_J 256 #endif @@ -63,31 +52,25 @@ int main() { } #endif - printf("HEADS: %d\n", HEADS); - printf("MAT_DIM_I: %d\n", MAT_DIM_I); - printf("MAT_DIM_J: %d\n", MAT_DIM_J); - printf("MAT_DIM_K: %d\n", MAT_DIM_K); - printf("ACTIVATION: %d\n", ACTIVATION); - gemmini_flush(0); #if A_TRANSPOSE==0 - static elem_t full_A[HEADS][MAT_DIM_I][MAT_DIM_K] row_align(1); + static elem_t full_A[MAT_DIM_I][MAT_DIM_K] row_align(1); #else - static elem_t full_A[HEADS][MAT_DIM_K][MAT_DIM_I] row_align(1); + static elem_t full_A[MAT_DIM_K][MAT_DIM_I] row_align(1); #endif #if B_TRANSPOSE==0 - static elem_t full_B[HEADS][MAT_DIM_K][MAT_DIM_J] row_align(1); + static elem_t full_B[MAT_DIM_K][MAT_DIM_J] row_align(1); #else - static elem_t full_B[HEADS][MAT_DIM_J][MAT_DIM_K] row_align(1); + static elem_t full_B[MAT_DIM_J][MAT_DIM_K] row_align(1); #endif - static elem_t full_C[HEADS][MAT_DIM_I][MAT_DIM_J] row_align(1); - static acc_t full_D[HEADS][MAT_DIM_I][MAT_DIM_J] row_align_acc(1); + static elem_t full_C[MAT_DIM_I][MAT_DIM_J] row_align(1); + static acc_t full_D[MAT_DIM_I][MAT_DIM_J] row_align_acc(1); - static full_t gold_full[HEADS][MAT_DIM_I][MAT_DIM_J]; - static elem_t gold[HEADS][MAT_DIM_I][MAT_DIM_J]; + static full_t gold_full[MAT_DIM_I][MAT_DIM_J]; + static elem_t gold[MAT_DIM_I][MAT_DIM_J]; counter_configure(0, RDMA_BYTES_REC); counter_configure(1, WDMA_BYTES_SENT); @@ -99,11 +82,8 @@ int main() { printf("A_TRANSPOSE: %d, B_TRANSPOSE: %d\n", A_TRANSPOSE, B_TRANSPOSE); uint64_t start = read_cycles(); - for (int head = 0; head < HEADS; head++) tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K, - // (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)full_C, - (elem_t*)full_A[head], (elem_t*)full_B[head], NO_BIAS ? NULL : &full_D[head][0][0], (elem_t*)full_C[head], - // (elem_t*)full_A[0], (elem_t*)full_B[0], NO_BIAS ? 
NULL : &full_D[0][0][0], (elem_t*)full_C[0], + (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)full_C, A_STRIDE, B_STRIDE, MAT_DIM_J, MAT_DIM_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, ACTIVATION, ACC_SCALE_IDENTITY, 0, REPEATING_BIAS, @@ -117,7 +97,7 @@ int main() { uint64_t end = read_cycles(); printf("Cycles taken: %llu\n", end-start); - const uint64_t total_macs = HEADS * MAT_DIM_I * MAT_DIM_J * MAT_DIM_K; + const uint64_t total_macs = MAT_DIM_I * MAT_DIM_J * MAT_DIM_K; const uint64_t ideal_cycles = total_macs / (DIM * DIM); const uint64_t utilization = 100 * ideal_cycles / (end-start); printf("Total macs: %llu\n", total_macs); diff --git a/gemmini-data-collection/.gitignore b/gemmini-data-collection/.gitignore new file mode 100644 index 00000000..b27c3107 --- /dev/null +++ b/gemmini-data-collection/.gitignore @@ -0,0 +1 @@ +clean.sh diff --git a/gemmini-data-collection/README.md b/gemmini-data-collection/README.md new file mode 100644 index 00000000..bc0b4a8b --- /dev/null +++ b/gemmini-data-collection/README.md @@ -0,0 +1,123 @@ +# Gemmini Data Collection + +This directory contains scripts that enable large-scale, seamless data collection for various test benches applied on various configurations of Gemmini. +These scripts generally work by automatically generating _new_ C source files in the `bareMetalC/` directory, and running them in parallel for rapid data collection. + +Most of the following documentation assumes the current working directory is `gemmini/software/gemmini-rocc-tests/gemmini-data-collection`. + +## How to Use + +### Step 1: Create Test Bench Template + +* Make a generalized version of the test bench you want to simulate on Gemmini in the `templates` folder. + +* For all values of the test bench you want to make variable, replace each usage of each value with its respective `%KEYWORD%` placeholder (e.g. `%IN_DIM%`). 
+ +* If you just want to run matrix multiplications or convolutions, then you can skip this step as we already provide templates for these operations. + +### Step 2: Specify Tests to Run + +* Open `tests.py` in your preferred text editor, and create one `GemminiTest` object for each test you want to simulate (e.g. one test for a 512-by-512-by-512 matmul, or another test for a small convolution). You may refer to the existing examples in `tests.py`. + +* The `GemminiTest` objects should have the following arguments: + * **Parameter 1**: an array of keywords used in the template (without the surrounding `%` characters) + * **Parameter 2**: an array of string values to replace the keywords specified in Parameter 1 (in respective order) + * **Parameter 3**: name of the template you are using, without the `.c` extension (must be a `C` file located in the `templates` folder) + * **Parameter 4**: name of the output file (`C` file), without the `.c` extension, that you want generated in the `bareMetalC` folder + +### Step 3: Construct Gemmini Configurations + +* This step is optional and is only necessary if you wish to collect simulation data for hardware configurations different from the Gemmini defaults (e.g. with different spatial array dimensions, or scratchpad sizes). + +* Go to `gemmini/src/main/scala/gemmini/CustomConfigs.scala` and create your desired Gemmini configs. + +* Keep note of their names – you will need them for Step 4. + +### Step 4: Run Tests in Parallel + +* Refer to the descriptions of `gen_data.sh` and `config_gen_data.sh` in the section below. + +* Run the `config_gen_data.sh` script from the `gemmini-data-collection` directory, as our paths assume that this is your current working directory. + - Note that all the tests you specified in Step 2 will run **in parallel** on separate processes. Make sure you are running on a server with sufficient RAM for all those tests! + +## Detailed Script Descriptions +### `gemmini_data_collection.py` + +This script generates several other scripts that facilitate the data collection process, described below. 
+ +Each call to the `main` function of this script generates an instance of a template test bench (e.g. GEMM, 2D-Conv) for Gemmini simulation; it also updates the scripts below to account for the instance: + +* `gemmini/data-collection-vcs.sh` + + * Uses the `gemmini/scripts/run-vcs.sh` script to run a faster cycle-accurate Gemmini simulation for each instance generated by the `gemmini_data_collection.py` script + + * Output containing the cycle count is directed to `gemmini/data-collection-output`. + +* `gemmini/data-collection-midas.sh midas_dram_model` + + * Uses the `gemmini/scripts/run-midas.sh` script to run a cycle-accurate Gemmini simulation with a more precise DRAM model for each instance generated by the `gemmini_data_collection.py` script + + * Output containing the cycle count is directed to `gemmini/data-collection-output`. + +* `gemmini/data-collection-spike.sh` + + * Uses the `gemmini/scripts/run-spike.sh` script to run through the "functionality" of each instance generated by the `gemmini_data_collection.py` script + + * When Spike is called, a switch causes the tiling factors for the job to be printed instead of the cycle count (this does not happen when running `gemmini/data-collection-vcs.sh` or `gemmini/data-collection-midas.sh`) + + * Output containing the tiling factors is directed to `gemmini/data-collection-output`. + +* `clean.sh` + + * Cleans the output of the `gemmini/software/gemmini-rocc-tests/build.sh` script + + * Deletes the `gemmini/data-collection-output-configs` folder + + * Deletes the `gemmini/data-collection-output` folder + + * Deletes the `gemmini/data-collection-vcs.sh` script + + * Deletes the `gemmini/data-collection-midas.sh` script + + * Deletes the `gemmini/data-collection-spike.sh` script + + * Deletes all test bench instance `C` files generated by the `gemmini_data_collection.py` script + + * Resets the `Makefile` to remove the instructions to build test bench instances + +### `gen_data.sh tile|cycle vcs|verilator|midas [midas_dram_model]` + +* If called 
with the `tile` parameter, the script generates output with tiling factors; if called with the `cycle` parameter, the script generates output with simulation cycles + +* Cleans all prior output via `clean.sh` + +* Runs `gemmini_data_collection.py` + +* If called with the `tile` parameter, sets the `-DPRINT_TILE` compile flag in `bareMetalC/Makefile` to `1` so that tiling factors are printed; if called with the `cycle` parameter, sets it to `0` so that the cycle count is printed instead (the flag affects `gemmini/software/gemmini-rocc-tests/include/gemmini.h`). + +* Builds the `bareMetalC` tests + +* If called with the `tile` parameter, calls `gemmini/data-collection-spike.sh` + +* If called with the `cycle` parameter, then a second input parameter is expected (`vcs`, `verilator`, or `midas`, to specify which simulator to use) + + * If called with the `vcs` parameter, calls `gemmini/data-collection-vcs.sh` + + * If called with the `midas` parameter, calls `gemmini/data-collection-midas.sh` + +* **Note:** The `midas_dram_model` parameter is only required if the script is called with the `midas` argument + +### `config_gen_data.sh config_name vcs|verilator|midas [midas_dram_model]` + +* Sets the active Gemmini configuration in `gemmini/src/main/scala/gemmini/CustomConfigs.scala` to the Scala variable named by the `config_name` parameter + + * If you want to use the default configuration, you should pass in `baselineInferenceConfig` as the first parameter + +* Builds Spike and _either_ VCS, _or_ Verilator, _or_ Midas based on the chosen configuration using `gemmini/scripts/build-spike.sh` and _either_ `gemmini/scripts/build-vcs.sh`, _or_ `gemmini/scripts/build-verilator.sh`, _or_ `gemmini/scripts/build-midas.sh` respectively. + +* Runs `gen_data.sh tile` and `gen_data.sh cycle [vcs|verilator|midas]` to collect both tiling factor data and simulation cycle count data. 
+ +* Places output in subfolders within the `data-collection-output-configs` folder; the subfolders follow the naming formats `data-collection-output-tiling-factors-<config_name>` and `data-collection-output-cycles-<simulator>-<config_name>`, where `<>` marks a placeholder that varies per run + +* **Note:** The `midas_dram_model` parameter is only required if the script is called with the `midas` argument + diff --git a/gemmini-data-collection/config_gen_data.sh b/gemmini-data-collection/config_gen_data.sh new file mode 100755 index 00000000..580c92a5 --- /dev/null +++ b/gemmini-data-collection/config_gen_data.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +cd ../../.. +sed -i "/val customConfig =/c\ val customConfig = $1" src/main/scala/gemmini/CustomConfigs.scala +sed -i "/val customConfig =/c\ val customConfig = $1" configs/GemminiCustomConfigs.scala +if [ "$2" = "vcs" ]; then + ./scripts/build-vcs.sh +elif [ "$2" = "verilator" ]; then + ./scripts/build-verilator.sh +elif [ "$2" = "midas" ]; then + ./scripts/build-midas.sh $3 +else + echo "Invalid second paramter passed into gen-data.sh: should be 'vcs', 'verilator' or 'midas'" + exit 1 +fi + +./scripts/build-spike.sh +cd software/gemmini-rocc-tests/gemmini-data-collection + +result_dir=../../../data-collection-output +tiling_dir=../../../data-collection-output-configs/data-collection-output-tiling-factors-$1 +cycle_dir=../../../data-collection-output-configs/data-collection-output-cycles-$2-$1 + +mkdir -p $tiling_dir +mkdir -p $cycle_dir + +bash gen_data.sh tile +mv $result_dir/* $tiling_dir/ && rmdir $result_dir +bash gen_data.sh cycle $2 $3 +mv $result_dir/* $cycle_dir/ && rmdir $result_dir + diff --git a/gemmini-data-collection/gemmini_data_collection.py b/gemmini-data-collection/gemmini_data_collection.py new file mode 100644 index 00000000..c9ab1d9a --- /dev/null +++ b/gemmini-data-collection/gemmini_data_collection.py @@ -0,0 +1,92 @@ +import sys +import tests + +def main(keywords, replacement, template_file, new_file): + """ + Search TEMPLATE_FILE for KEYWORDS and replace respective keywords with REPLACEMENT. 
Write changes to NEW_FILE. + Update Makefile with new filename for target. + """ + assert (len(keywords) == len(replacement)), "Number of keywords needs to be the same as number of replacement words" + + with open('templates/'+template_file+'.c', 'r') as file: + filedata = file.read() + + for i in range(len(keywords)): + filedata = filedata.replace('%'+keywords[i]+'%', replacement[i]) + + with open('../bareMetalC/'+new_file+'.c', 'w') as file: + file.write(filedata) + + print("Created " + new_file + " from " + template_file) + + with open('../bareMetalC/Makefile', 'r') as file: + filedata = file.read() + + filedata = filedata.replace("tests = \\", "tests = \\\n\t"+new_file+"\\") + + with open('../bareMetalC/Makefile', 'w') as file: + filedata = file.write(filedata) + + print("Updated Makefile") + + with open('../../../data-collection-vcs.sh', 'r') as file: + filedata = file.read() + + filedata = filedata + "./scripts/run-vcs.sh " + new_file + " > data-collection-output/" + new_file + "-vcs.txt &\n" + + with open('../../../data-collection-vcs.sh', 'w') as file: + filedata = file.write(filedata) + + print("Updated data-collection-vcs.sh script") + + with open('../../../data-collection-verilator.sh', 'r') as file: + filedata = file.read() + + filedata = filedata + "./scripts/run-verilator.sh " + new_file + " > data-collection-output/" + new_file + "-verilator.txt &\n" + + with open('../../../data-collection-verilator.sh', 'w') as file: + filedata = file.write(filedata) + + print("Updated data-collection-verilator.sh script") + + with open('../../../data-collection-midas.sh', 'r') as file: + filedata = file.read() + + filedata = filedata + "./scripts/run-midas.sh $1 " + new_file + " > data-collection-output/" + new_file + "-midas.txt &\n" + + with open('../../../data-collection-midas.sh', 'w') as file: + filedata = file.write(filedata) + + print("Updated data-collection-midas.sh script") + + with open('../../../data-collection-spike.sh', 'r') as file: + filedata = 
file.read() + + filedata = filedata + "./scripts/run-spike.sh " + new_file + " > data-collection-output/" + new_file + "-spike.txt &\n" + + with open('../../../data-collection-spike.sh', 'w') as file: + filedata = file.write(filedata) + + print("Updated data-collection-spike.sh script") + + with open('clean.sh', 'a') as file: + file.write('rm ../bareMetalC/' + new_file + '.c\n') + + print("Updated clean.sh script") + + +if __name__ == "__main__": + for fname in 'vcs', 'verilator', 'midas', 'spike': + with open('../../../data-collection-' + fname + '.sh', 'w') as file: + file.write("#!/bin/bash\n\nmkdir -p data-collection-output\n") + + with open('clean.sh', 'w') as file: + file.write('#!/bin/bash\n\nrm -rf ../../../data-collection-output\nrm ../../../data-collection-vcs.sh\nrm ../../../data-collection-verilator.sh\nrm ../../../data-collection-midas.sh\nrm ../../../data-collection-spike.sh\ncp og_baremetal_Makefile ../bareMetalC/Makefile\ncd ..\n./build.sh clean\ncd gemmini-data-collection\n') + + for test in tests.tests: + main(*test) + + for fname in 'vcs', 'verilator', 'midas', 'spike': + with open('../../../data-collection-' + fname + '.sh', 'a') as file: + file.write("wait\n") + diff --git a/gemmini-data-collection/gen_data.sh b/gemmini-data-collection/gen_data.sh new file mode 100755 index 00000000..4f0d3198 --- /dev/null +++ b/gemmini-data-collection/gen_data.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +bash clean.sh +python gemmini_data_collection.py +cd .. + + +if [ "$1" = "tile" ]; then + sed -i '/-DPRINT_TILE=/c\\t-DPRINT_TILE=1 \\' bareMetalC/Makefile + echo "Set DPRINT_TILE=1" +elif [ "$1" = "cycle" ]; then + sed -i '/-DPRINT_TILE=/c\\t-DPRINT_TILE=0 \\' bareMetalC/Makefile + echo "Set DPRINT_TILE=0" +else + echo "Invalid first parameter passed into gen-data.sh: should be 'tile' or 'cycle'" + exit 1 +fi + + +./build.sh bareMetalC +cd ../.. 
+ +if [ "$1" = "tile" ]; then + echo "Running Spike" + bash data-collection-spike.sh +elif [ "$1" = "cycle" ]; then + if [ "$2" = "vcs" ]; then + echo "Running VCS" + bash data-collection-vcs.sh + elif [ "$2" = "verilator" ]; then + echo "Running Verilator" + bash data-collection-verilator.sh + elif [ "$2" = "midas" ]; then + echo "Running Midas" + bash data-collection-midas.sh $3 + else + echo "Invalid second parameter passed into gen-data.sh: should be 'vcs', 'verilator' or 'midas'" + exit 1 + fi +fi + diff --git a/gemmini-data-collection/layers/extract_data.py b/gemmini-data-collection/layers/extract_data.py new file mode 100644 index 00000000..389f9850 --- /dev/null +++ b/gemmini-data-collection/layers/extract_data.py @@ -0,0 +1,67 @@ +import pickle, yaml, os, re, argparse + +def extract_prob(layer_dict, prob): + assert prob["Wstride"] == prob["Hstride"] + assert prob["Wdilation"] == prob["Hdilation"] + assert prob["P"] == prob["Q"] + assert prob["R"] == prob["S"] + + layer_dict["BATCH_SIZE"] = prob["N"] + layer_dict["IN_CHANNELS"] = prob["C"] + layer_dict["OUT_CHANNELS"] = prob["K"] + layer_dict["KERNEL_DIM"] = prob["R"] + layer_dict["STRIDE"] = prob["Wstride"] + layer_dict["KERNEL_DILATION"] = prob["Wdilation"] + layer_dict["IN_DIM"] = (prob["P"] - 1) * layer_dict["STRIDE"] + 2 * layer_dict["KERNEL_DILATION"] + layer_dict["KERNEL_DIM"] + + +def extract_mapping(layer_dict, prob, loc): + map_dir = "{R}_{S}_{P}_{Q}_{C}_{K}_{N}_{Wstride}_{Hstride}_{Wdilation}_{Hdilation}".format(R=prob["R"], S=prob["S"], P=prob["P"], Q=prob["Q"], C=prob["C"], K=prob["K"], N=prob["N"], Wstride=prob["Wstride"], Hstride=prob["Hstride"], Wdilation=prob["Wdilation"], Hdilation=prob["Hdilation"]) + map_filename = os.path.join(loc, map_dir, "timeloop-mapper.map.yaml") + layer_dict["prob_name"] = map_dir + + with open(map_filename) as f: + f_text = f.read() + + def tiling_factor(letter): + p = re.compile("\s{l}(\d+)\s".format(l=letter)) + factors = p.findall(f_text) + product = 1 + for 
num in factors: + product *= int(num) + product /= int(factors[-1]) #remove DRAM factor + product /= int(factors[-2]) #remove spad temporal + product /= int(factors[-3]) #remove spad spatial + return product + + layer_dict["TILE_BATCHES"] = tiling_factor("N") + layer_dict["TILE_OCOLS"] = tiling_factor("Q") + layer_dict["TILE_OROWS"] = tiling_factor("P") + layer_dict["TILE_OCHS"] = tiling_factor("K") + layer_dict["TILE_KCOLS"] = tiling_factor("S") + layer_dict["TILE_KROWS"] = tiling_factor("R") + layer_dict["TILE_KCHS"] = tiling_factor("C") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--layer_file", type=str, required=True) + parser.add_argument("--map_loc", type=str, required=True) + parser.add_argument("--prob_loc", type=str, required=True) + args = parser.parse_args() + + layer_dicts = [] + with open(args.layer_file) as f: + layers = yaml.safe_load(f) + for layer in layers: + layer_filename = os.path.join(args.prob_loc, layer + ".yaml") + with open(layer_filename) as f: + prob = yaml.safe_load(f)["problem"] + layer_dict = {} + extract_prob(layer_dict, prob) + extract_mapping(layer_dict, prob, args.map_loc) + layer_dicts.append(layer_dict) + + with open('layers.pickle', 'wb') as p: + pickle.dump(layer_dicts, p, protocol=pickle.HIGHEST_PROTOCOL) + + #sample: python extract_data.py --layer_file resnet50/unique_layers.yaml --prob_loc resnet50/ --map_loc resnet50_map_v2/logs/gemmini_16_256.0_64.0/ diff --git a/gemmini-data-collection/og_baremetal_Makefile b/gemmini-data-collection/og_baremetal_Makefile new file mode 100644 index 00000000..b9f82d31 --- /dev/null +++ b/gemmini-data-collection/og_baremetal_Makefile @@ -0,0 +1,128 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + mvin_mvout \ + mvin_mvout_zeros \ + mvin_mvout_stride \ + mvin_mvout_block_stride \ + mvin_mvout_acc \ + mvin_mvout_acc_zero_stride \ + mvin_mvout_acc_stride \ + mvin_mvout_acc_full \ + mvin_mvout_acc_full_stride \ + matmul_os \ + matmul_ws \ + 
matmul \ + raw_hazard \ + aligned \ + padded \ + mvin_scale \ + conv \ + conv_with_pool \ + conv_with_rot180 \ + conv_with_kernel_dilation \ + conv_with_input_dilation \ + conv_with_input_dilation_and_rot180 \ + conv_with_input_dilation_and_neg_padding \ + conv_trans_output_1203 \ + conv_trans_weight_1203 \ + conv_trans_weight_0132 \ + conv_trans_input_3120 \ + conv_trans_input_3120_with_kernel_dilation \ + conv_first_layer \ + conv_dw \ + conv_perf \ + conv_dw_perf \ + tiled_matmul_os \ + tiled_matmul_ws \ + tiled_matmul_ws_At \ + tiled_matmul_ws_Bt \ + tiled_matmul_ws_full_C \ + tiled_matmul_ws_low_D \ + tiled_matmul_ws_igelu \ + tiled_matmul_ws_layernorm \ + tiled_matmul_ws_softmax \ + tiled_matmul_ws_perf \ + tiled_matmul_cpu \ + tiled_matmul_option \ + transpose \ + matrix_add \ + resadd \ + global_average \ + gemmini_counter \ + template \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +ifdef BAREMETAL_ONLY + tests_linux = + tests_pk = +else + tests_linux = $(tests:=-linux) + tests_pk = $(tests:=-pk) +endif + +BENCH_COMMON = $(abs_top_srcdir)/riscv-tests/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I$(abs_top_srcdir)/riscv-tests \ + -I$(abs_top_srcdir)/riscv-tests/env \ + -I$(abs_top_srcdir) \ + -I$(BENCH_COMMON) \ + 
-DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_PK := \ + $(CFLAGS) \ + -static \ + -DBAREMETAL=1 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) $(tests_linux) $(tests_pk) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) + +%-linux: %.c $(GEMMINI_HEADERS) + $(CC_LINUX) $(CFLAGS) $< $(LFLAGS) -o $@ + +%-pk: %.c $(GEMMINI_HEADERS) + $(CC_LINUX) $(CFLAGS_PK) $< $(LFLAGS) -o $@ + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/bareMetalC/$^ + +junk += $(tests_baremetal) $(tests_linux) $(tests_pk) + diff --git a/gemmini-data-collection/templates/conv_template.c b/gemmini-data-collection/templates/conv_template.c new file mode 100644 index 00000000..49df0aa1 --- /dev/null +++ b/gemmini-data-collection/templates/conv_template.c @@ -0,0 +1,324 @@ +#include +#include +#include +#include +#include +#ifndef BAREMETAL +#include +#endif +#include "include/gemmini_testutils.h" + +#define BATCH_SIZE 2 +#define IN_DIM %IN_DIM% +#define IN_CHANNELS %IN_CHANNELS% +#define OUT_CHANNELS %OUT_CHANNELS% +#define KERNEL_DIM %KERNEL_DIM% +#define PADDING %PADDING% +#define STRIDE %STRIDE% + +#define NO_BIAS false + +#define OUT_DIM ((IN_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define N_PATCHES (BATCH_SIZE * OUT_DIM * OUT_DIM) + +void conv(int batch_size, int in_channels, int in_dim, + int out_channels, int kernel_dim, + int out_dim, + int stride, int padding, + elem_t input[batch_size][in_dim][in_dim][in_channels], + elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + acc_t bias[out_channels], + elem_t output[batch_size][out_dim][out_dim][out_channels]) { + +#ifdef GEMMINI_ASSERTIONS + if 
(out_dim != (in_dim + 2*padding - kernel_dim) / stride + 1) { + printf("conv out_dim is not correct\n"); + exit(1); + } +#endif + + for (int b = 0; b < batch_size; b++) { + for (int orow = 0; orow < out_dim; orow++) { + for (int ocol = 0; ocol < out_dim; ocol++) { + for (int och = 0; och < out_channels; och++) { + acc_t result = bias[och]; + + for (int krow = 0; krow < kernel_dim; krow++) { + for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int kch = 0; kch < in_channels; kch++) { + int irow = orow * stride + krow - padding; + int icol = ocol * stride + kcol - padding; + + elem_t pixel = irow < 0 || irow >= in_dim || + icol < 0 || icol >= in_dim ? + 0 : input[b][irow][icol][kch]; + + result += + weights[och][krow][kcol][kch] * + pixel; + } + } + } + + // Clip result + result = result > elem_t_max ? elem_t_max : (result < elem_t_min ? elem_t_min : result); + + output[b][orow][ocol][och] = result; + } + } + } + } +} + +void flatten_weights(int out_channels, int kernel_dim, int in_channels, + int patch_size, + elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights_mat[patch_size][out_channels]) { + + assert(patch_size == kernel_dim * kernel_dim * in_channels); + + for (int outc = 0; outc < out_channels; outc++) { + for (int krow = 0; krow < kernel_dim; krow++) { + for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int inc = 0; inc < in_channels; inc++) { + int wmatrow = krow * kernel_dim * in_channels + + kcol * in_channels + + inc; + + weights_mat[wmatrow][outc] = + weights[outc][krow][kcol][inc]; + } + } + } + } +} + +bool vec_is_equal(elem_t * a, elem_t * b, int len) { + for (int i = 0; i < len; i++) + if (a[i] != b[i]) + return false; + return true; +} + +void init_random(elem_t * buf, int len) { + elem_t i = 0; + for (elem_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_random_acc(acc_t * buf, int len) { + elem_t i 
= 0; + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_zeros_acc(acc_t * buf, int len) { + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + *ptr = 0; + } +} + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + printf("Input dimension: %u\n", IN_DIM); + printf("Input channels: %u\n", IN_CHANNELS); + printf("Output channels: %u\n", OUT_CHANNELS); + printf("Kernel dimensions: %u\n", KERNEL_DIM); + printf("Stride: %u\n", STRIDE); + printf("Padding: %u\n", PADDING); + printf("Batch size: %u\n\n", BATCH_SIZE); + + gemmini_flush(0); + + // assert((in_dim + 2*padding - kernel_dim) % stride == 0); + + printf("Output dimension: %u\n\n", OUT_DIM); + + static elem_t input[BATCH_SIZE][IN_DIM][IN_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static acc_t bias[OUT_CHANNELS]; + static elem_t output[BATCH_SIZE][OUT_DIM][OUT_DIM][OUT_CHANNELS]; + + printf("Randomize inputs...\n"); + //init_random(&input[0][0][0][0], sizeof(input) / sizeof(elem_t)); + + printf("Randomize weights...\n"); + //init_random(&weights[0][0][0][0], sizeof(weights) / sizeof(elem_t)); + + /* + printf("Randomize bias...\n"); + if (NO_BIAS) + init_zeros_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + else + init_random_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + */ + + printf("CPU conv...\n"); + uint64_t start_cpu = read_cycles(); +#ifndef FAST + /*conv(BATCH_SIZE, IN_CHANNELS, IN_DIM, + OUT_CHANNELS, KERNEL_DIM, + OUT_DIM, + STRIDE, PADDING, + input, + weights, + bias, + output); + */ +#endif + uint64_t end_cpu = read_cycles(); + printf("CPU conv took %llu cycles\n", end_cpu - start_cpu); + + static elem_t weights_mat[PATCH_SIZE][OUT_CHANNELS]; + static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; + + printf("Flatten weights...\n"); + 
/*flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + PATCH_SIZE, + weights, + weights_mat); + */ + printf("Gemmini conv...\n"); + uint64_t start_gemmini = read_cycles(); + tiled_conv_auto( + BATCH_SIZE, IN_DIM, IN_CHANNELS, + OUT_CHANNELS, OUT_DIM, + STRIDE, 1, 1, PADDING, KERNEL_DIM, + false, false, false, false, false, + + (elem_t*)input, + (elem_t*)weights_mat, + NO_BIAS ? NULL : (acc_t*)bias, + (elem_t*)output_mat, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + WS); + + gemmini_fence(); + uint64_t end_gemmini = read_cycles(); + printf("Gemmini conv took %llu cycles\n", end_gemmini - start_gemmini); + + assert(sizeof(output_mat) == sizeof(output)); + +#ifdef FAST + bool success = true; + for (int orow = 0; orow < BATCH_SIZE * OUT_DIM * OUT_DIM; orow++) { + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + elem_t v = output_mat[orow][ocol]; + if (v != 21 && v != 31 && v != 46) { + success = false; + break; + } + } + } +#else + bool success = vec_is_equal(&output[0][0][0][0], &output_mat[0][0], sizeof(output) / sizeof(elem_t)); +#endif + + if (!success) { + // return 1; + + printf("bias:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", bias[och]); + } + printf("\b\n\n"); + + printf("weights:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("["); + for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + printf("["); + for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", weights[och][wrow][wcol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("weights_mat:\n"); + for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + printf("["); + for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { + printf("%d,", weights_mat[wrow][wcol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + printf("input:\n"); + for (int batch = 0; batch < BATCH_SIZE; batch++) { + printf("["); + for (int 
irow = 0; irow < IN_DIM; irow++) { + printf("["); + for (int icol = 0; icol < IN_DIM; icol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", input[batch][irow][icol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output:\n"); + for (int batch = 0; batch < BATCH_SIZE; batch++) { + printf("["); + for (int orow = 0; orow < OUT_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_DIM; ocol++) { + printf("["); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", output[batch][orow][ocol][och]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output_mat:\n"); + for (int orow = 0; orow < BATCH_SIZE * OUT_DIM * OUT_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + printf("%d,", output_mat[orow][ocol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + return 1; + } + + return 0; +} diff --git a/gemmini-data-collection/templates/conv_template_map.c b/gemmini-data-collection/templates/conv_template_map.c new file mode 100644 index 00000000..40d0b6fa --- /dev/null +++ b/gemmini-data-collection/templates/conv_template_map.c @@ -0,0 +1,338 @@ +#include +#include +#include +#include +#include +#ifndef BAREMETAL +#include +#endif +#include "include/gemmini_testutils.h" + +#define BATCH_SIZE %BATCH_SIZE% +#define IN_DIM %IN_DIM% +#define IN_CHANNELS %IN_CHANNELS% +#define OUT_CHANNELS %OUT_CHANNELS% +#define KERNEL_DIM %KERNEL_DIM% +#define PADDING 0 +#define KERNEL_DILATION %KERNEL_DILATION% +#define STRIDE %STRIDE% + +#define BATCHES %TILE_BATCHES% +#define OCOLS %TILE_OCOLS% +#define OROWS %TILE_OROWS% +#define OCHS %TILE_OCHS% +#define KCOLS %TILE_KCOLS% +#define KROWS %TILE_KROWS% +#define KCHS %TILE_KCHS% + +#define NO_BIAS false + +#define OUT_DIM ((IN_DIM + 2*PADDING - KERNEL_DIM) / STRIDE + 1) +#define PATCH_SIZE (KERNEL_DIM * KERNEL_DIM * IN_CHANNELS) +#define 
N_PATCHES (BATCH_SIZE * OUT_DIM * OUT_DIM) + +void conv(int batch_size, int in_channels, int in_dim, + int out_channels, int kernel_dim, + int out_dim, + int stride, int padding, + elem_t input[batch_size][in_dim][in_dim][in_channels], + elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + acc_t bias[out_channels], + elem_t output[batch_size][out_dim][out_dim][out_channels]) { + +#ifdef GEMMINI_ASSERTIONS + if (out_dim != (in_dim + 2*padding - kernel_dim) / stride + 1) { + printf("conv out_dim is not correct\n"); + exit(1); + } +#endif + + for (int b = 0; b < batch_size; b++) { + for (int orow = 0; orow < out_dim; orow++) { + for (int ocol = 0; ocol < out_dim; ocol++) { + for (int och = 0; och < out_channels; och++) { + acc_t result = bias[och]; + + for (int krow = 0; krow < kernel_dim; krow++) { + for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int kch = 0; kch < in_channels; kch++) { + int irow = orow * stride + krow - padding; + int icol = ocol * stride + kcol - padding; + + elem_t pixel = irow < 0 || irow >= in_dim || + icol < 0 || icol >= in_dim ? + 0 : input[b][irow][icol][kch]; + + result += + weights[och][krow][kcol][kch] * + pixel; + } + } + } + + // Clip result + result = result > elem_t_max ? elem_t_max : (result < elem_t_min ? 
elem_t_min : result); + + output[b][orow][ocol][och] = result; + } + } + } + } +} + +void flatten_weights(int out_channels, int kernel_dim, int in_channels, + int patch_size, + elem_t weights[out_channels][kernel_dim][kernel_dim][in_channels], + elem_t weights_mat[patch_size][out_channels]) { + + assert(patch_size == kernel_dim * kernel_dim * in_channels); + + for (int outc = 0; outc < out_channels; outc++) { + for (int krow = 0; krow < kernel_dim; krow++) { + for (int kcol = 0; kcol < kernel_dim; kcol++) { + for (int inc = 0; inc < in_channels; inc++) { + int wmatrow = krow * kernel_dim * in_channels + + kcol * in_channels + + inc; + + weights_mat[wmatrow][outc] = + weights[outc][krow][kcol][inc]; + } + } + } + } +} + +bool vec_is_equal(elem_t * a, elem_t * b, int len) { + for (int i = 0; i < len; i++) + if (a[i] != b[i]) + return false; + return true; +} + +void init_random(elem_t * buf, int len) { + elem_t i = 0; + for (elem_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_random_acc(acc_t * buf, int len) { + elem_t i = 0; + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + // *ptr = (rand() % 32) - 16; +#ifdef FAST + *ptr = 1; +#else + *ptr = (rand() % 5) - 2; +#endif + } +} + +void init_zeros_acc(acc_t * buf, int len) { + for (acc_t * ptr = buf; ptr < buf + len; ptr++) { + *ptr = 0; + } +} + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + printf("Input dimension: %u\n", IN_DIM); + printf("Input channels: %u\n", IN_CHANNELS); + printf("Output channels: %u\n", OUT_CHANNELS); + printf("Kernel dimensions: %u\n", KERNEL_DIM); + printf("Kernel dilation: %u\n", KERNEL_DILATION); + printf("Stride: %u\n", STRIDE); + printf("Padding: %u\n", PADDING); + printf("Batch size: %u\n\n", BATCH_SIZE); + + gemmini_flush(0); + + // assert((in_dim + 2*padding - kernel_dim) % stride 
== 0); + + printf("Output dimension: %u\n\n", OUT_DIM); + + static elem_t input[BATCH_SIZE][IN_DIM][IN_DIM][IN_CHANNELS]; + static elem_t weights[OUT_CHANNELS][KERNEL_DIM][KERNEL_DIM][IN_CHANNELS]; + static acc_t bias[OUT_CHANNELS]; + static elem_t output[BATCH_SIZE][OUT_DIM][OUT_DIM][OUT_CHANNELS]; + + printf("Randomize inputs...\n"); + //init_random(&input[0][0][0][0], sizeof(input) / sizeof(elem_t)); + + printf("Randomize weights...\n"); + //init_random(&weights[0][0][0][0], sizeof(weights) / sizeof(elem_t)); + + /* + printf("Randomize bias...\n"); + if (NO_BIAS) + init_zeros_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + else + init_random_acc(&bias[0], sizeof(bias) / sizeof(acc_t)); + */ + + printf("CPU conv...\n"); + uint64_t start_cpu = read_cycles(); +#ifndef FAST + /*conv(BATCH_SIZE, IN_CHANNELS, IN_DIM, + OUT_CHANNELS, KERNEL_DIM, + OUT_DIM, + STRIDE, PADDING, + input, + weights, + bias, + output); + */ +#endif + uint64_t end_cpu = read_cycles(); + printf("CPU conv took %llu cycles\n", end_cpu - start_cpu); + + static elem_t weights_mat[PATCH_SIZE][OUT_CHANNELS]; + static elem_t output_mat[N_PATCHES][OUT_CHANNELS]; + + printf("Flatten weights...\n"); + /*flatten_weights(OUT_CHANNELS, KERNEL_DIM, IN_CHANNELS, + PATCH_SIZE, + weights, + weights_mat); + */ + printf("Gemmini conv...\n"); + uint64_t start_gemmini = read_cycles(); + tiled_conv( + BATCH_SIZE, IN_DIM, IN_CHANNELS, + OUT_CHANNELS, OUT_DIM, + STRIDE, 1, KERNEL_DILATION, PADDING, KERNEL_DIM, + false, false, false, false, false, + + BATCHES, + OROWS, OCOLS, OCHS, + KROWS, KCOLS, KCHS, + + (elem_t*)input, + (elem_t*)weights_mat, + NO_BIAS ? 
NULL : (acc_t*)bias, + (elem_t*)output_mat, + + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, + + WS); + + gemmini_fence(); + uint64_t end_gemmini = read_cycles(); + printf("Gemmini conv took %llu cycles\n", end_gemmini - start_gemmini); + + assert(sizeof(output_mat) == sizeof(output)); + +#ifdef FAST + bool success = true; + for (int orow = 0; orow < BATCH_SIZE * OUT_DIM * OUT_DIM; orow++) { + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + elem_t v = output_mat[orow][ocol]; + if (v != 21 && v != 31 && v != 46) { + success = false; + break; + } + } + } +#else + bool success = vec_is_equal(&output[0][0][0][0], &output_mat[0][0], sizeof(output) / sizeof(elem_t)); +#endif + + if (!success) { + // return 1; + + printf("bias:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", bias[och]); + } + printf("\b\n\n"); + + printf("weights:\n"); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("["); + for (int wrow = 0; wrow < KERNEL_DIM; wrow++) { + printf("["); + for (int wcol = 0; wcol < KERNEL_DIM; wcol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", weights[och][wrow][wcol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("weights_mat:\n"); + for (int wrow = 0; wrow < KERNEL_DIM * KERNEL_DIM * IN_CHANNELS; wrow++) { + printf("["); + for (int wcol = 0; wcol < OUT_CHANNELS; wcol++) { + printf("%d,", weights_mat[wrow][wcol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + printf("input:\n"); + for (int batch = 0; batch < BATCH_SIZE; batch++) { + printf("["); + for (int irow = 0; irow < IN_DIM; irow++) { + printf("["); + for (int icol = 0; icol < IN_DIM; icol++) { + printf("["); + for (int ich = 0; ich < IN_CHANNELS; ich++) { + printf("%d,", input[batch][irow][icol][ich]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output:\n"); + for (int batch = 0; batch < BATCH_SIZE; batch++) { + 
printf("["); + for (int orow = 0; orow < OUT_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_DIM; ocol++) { + printf("["); + for (int och = 0; och < OUT_CHANNELS; och++) { + printf("%d,", output[batch][orow][ocol][och]); + } + printf("\b],"); + } + printf("\b],\n"); + } + printf("\b],"); + } + printf("\b\n\n"); + + printf("output_mat:\n"); + for (int orow = 0; orow < BATCH_SIZE * OUT_DIM * OUT_DIM; orow++) { + printf("["); + for (int ocol = 0; ocol < OUT_CHANNELS; ocol++) { + printf("%d,", output_mat[orow][ocol]); + } + printf("\b],\n"); + } + printf("\b\n\n"); + + return 1; + } + + return 0; +} diff --git a/gemmini-data-collection/templates/matmul_template.c b/gemmini-data-collection/templates/matmul_template.c new file mode 100644 index 00000000..60b4c8b3 --- /dev/null +++ b/gemmini-data-collection/templates/matmul_template.c @@ -0,0 +1,91 @@ +// See LICENSE for license details. + +#include +#include +#include +#include +#include +#ifndef BAREMETAL +#include +#endif +#include "include/gemmini_testutils.h" + +#define NO_BIAS 0 +#define REPEATING_BIAS 1 + +#define A_TRANSPOSE 0 +#define B_TRANSPOSE 0 + +#define MAT_DIM_I %DIM_I% +#define MAT_DIM_K %DIM_J% +#define MAT_DIM_J %DIM_K% + +#if A_TRANSPOSE==0 +#define A_STRIDE MAT_DIM_K +#else +#define A_STRIDE MAT_DIM_I +#endif + +#if B_TRANSPOSE==0 +#define B_STRIDE MAT_DIM_J +#else +#define B_STRIDE MAT_DIM_K +#endif + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + gemmini_flush(0); + +#if A_TRANSPOSE==0 + static elem_t full_A[MAT_DIM_I][MAT_DIM_K] row_align(1); +#else + static elem_t full_A[MAT_DIM_K][MAT_DIM_I] row_align(1); +#endif + +#if B_TRANSPOSE==0 + static elem_t full_B[MAT_DIM_K][MAT_DIM_J] row_align(1); +#else + static elem_t full_B[MAT_DIM_J][MAT_DIM_K] row_align(1); +#endif + + static elem_t full_C[MAT_DIM_I][MAT_DIM_J] row_align(1); + static acc_t full_D[MAT_DIM_I][MAT_DIM_J] row_align_acc(1); + + 
static full_t gold_full[MAT_DIM_I][MAT_DIM_J]; + static elem_t gold[MAT_DIM_I][MAT_DIM_J]; + + printf("Starting gemmini matmul\n"); + printf("I: %d, J: %d, K: %d\n", MAT_DIM_I, MAT_DIM_J, MAT_DIM_K); + printf("NO_BIAS: %d, REPEATING_BIAS: %d\n", NO_BIAS, REPEATING_BIAS); + printf("A_TRANSPOSE: %d, B_TRANSPOSE: %d\n", A_TRANSPOSE, B_TRANSPOSE); + unsigned long start = read_cycles(); + + tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K, + (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)full_C, + A_STRIDE, B_STRIDE, MAT_DIM_J, MAT_DIM_J, + MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, REPEATING_BIAS, + A_TRANSPOSE, B_TRANSPOSE, + false, false, + 0, + WS); + + gemmini_fence(); + + unsigned long end = read_cycles(); + printf("Cycles taken: %u\n", end-start); + + const int total_macs = MAT_DIM_I * MAT_DIM_J * MAT_DIM_K; + const int ideal_cycles = total_macs / (DIM * DIM); + const int utilization = 100 * ideal_cycles / (end-start); + printf("Utilization: %d%%\n", utilization); + + exit(0); +} + diff --git a/gemmini-data-collection/templates/matmul_template_map.c b/gemmini-data-collection/templates/matmul_template_map.c new file mode 100644 index 00000000..7e805f0c --- /dev/null +++ b/gemmini-data-collection/templates/matmul_template_map.c @@ -0,0 +1,97 @@ +// See LICENSE for license details. 
+ +#include +#include +#include +#include +#include +#ifndef BAREMETAL +#include +#endif +#include "include/gemmini_testutils.h" + +#define NO_BIAS 0 +#define REPEATING_BIAS 1 + +#define A_TRANSPOSE 0 +#define B_TRANSPOSE 0 + +//in Gemmini, K is shared dimension +//user should consider J shared dimension +#define MAT_DIM_I %DIM_I% +#define MAT_DIM_K %DIM_J% +#define MAT_DIM_J %DIM_K% +#define TILE_I %TILE_DIM_I% +#define TILE_K %TILE_DIM_J% +#define TILE_J %TILE_DIM_K% + +#if A_TRANSPOSE==0 +#define A_STRIDE MAT_DIM_K +#else +#define A_STRIDE MAT_DIM_I +#endif + +#if B_TRANSPOSE==0 +#define B_STRIDE MAT_DIM_J +#else +#define B_STRIDE MAT_DIM_K +#endif + +int main() { +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + gemmini_flush(0); + +#if A_TRANSPOSE==0 + static elem_t full_A[MAT_DIM_I][MAT_DIM_K] row_align(1); +#else + static elem_t full_A[MAT_DIM_K][MAT_DIM_I] row_align(1); +#endif + +#if B_TRANSPOSE==0 + static elem_t full_B[MAT_DIM_K][MAT_DIM_J] row_align(1); +#else + static elem_t full_B[MAT_DIM_J][MAT_DIM_K] row_align(1); +#endif + + static elem_t full_C[MAT_DIM_I][MAT_DIM_J] row_align(1); + static acc_t full_D[MAT_DIM_I][MAT_DIM_J] row_align_acc(1); + + static full_t gold_full[MAT_DIM_I][MAT_DIM_J]; + static elem_t gold[MAT_DIM_I][MAT_DIM_J]; + + printf("Starting gemmini matmul\n"); + printf("I: %d, J: %d, K: %d\n", MAT_DIM_I, MAT_DIM_J, MAT_DIM_K); + printf("NO_BIAS: %d, REPEATING_BIAS: %d\n", NO_BIAS, REPEATING_BIAS); + printf("A_TRANSPOSE: %d, B_TRANSPOSE: %d\n", A_TRANSPOSE, B_TRANSPOSE); + unsigned long start = read_cycles(); + + tiled_matmul(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K, + (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? 
NULL : &full_D[0][0], (elem_t*)full_C, + A_STRIDE, B_STRIDE, MAT_DIM_J, MAT_DIM_J, + MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, REPEATING_BIAS, + TILE_I, TILE_J, TILE_K, + A_TRANSPOSE, B_TRANSPOSE, + false, false, + 0, + WS); + + gemmini_fence(); + + unsigned long end = read_cycles(); + printf("Cycles taken: %u\n", end-start); + + const int total_macs = MAT_DIM_I * MAT_DIM_J * MAT_DIM_K; + const int ideal_cycles = total_macs / (DIM * DIM); + const int utilization = 100 * ideal_cycles / (end-start); + printf("Utilization: %d%%\n", utilization); + + exit(0); +} + diff --git a/gemmini-data-collection/tests.py b/gemmini-data-collection/tests.py new file mode 100644 index 00000000..57c8d141 --- /dev/null +++ b/gemmini-data-collection/tests.py @@ -0,0 +1,30 @@ +from collections import namedtuple +import pickle + +GemminiTest = namedtuple("GemminiTest", "keywords replacement template_file new_file") + +# MODIFY LIST BELOW TO SPECIFY TESTS TO RUN +# arguments: array of keywords, array of values for keywords, name of C file in the templates folder, name of output C file to be placed in bareMetalC + +with open("layers/layers.pickle", "rb") as p: + layers = pickle.load(p) + +tests = [] + +for layer in layers: + layer_name = layer["prob_name"] + del layer["prob_name"] + vals = [] + for val in layer.values(): + vals.append(str(int(val))) + tests.append(GemminiTest(list(layer.keys()), vals, "conv_template_map", layer_name)) + +""" +tests = [ + #GemminiTest(["DIM_I", "DIM_J", "DIM_K"], ["6272", "64", "64"], "matmul_template", "tiled_matmul_ws_perf-128_256_64"), + #GemminiTest(["DIM_I", "DIM_J", "DIM_K"], ["64", "32", "128"], "matmul_template", "tiled_matmul_ws_perf-64_32_128"), + #GemminiTest(["DIM_I", "DIM_J", "DIM_K", "TILE_DIM_I", "TILE_DIM_J", "TILE_DIM_K"], ["64", "32", "128", "1", "1", "2"], "matmul_template_map", "tiled_matmul_ws_perf-64_32_128"), + GemminiTest(["IN_DIM", "IN_CHANNELS", "OUT_CHANNELS", 
"KERNEL_DIM", "STRIDE", "PADDING", "TILE_BATCHES", "TILE_OCOLS", "TILE_OROWS", "TILE_OCHS", "TILE_KCOLS", "TILE_KROWS", "TILE_KCHS"], ["224", "3", "64", "7", "2", "3", "2", "4", "4", "4", "4", "4", "3"], "conv_template_map", "conv-perf_224-3-64-7-2-3"), + #GemminiTest(["IN_DIM", "IN_CHANNELS", "OUT_CHANNELS", "KERNEL_DIM", "STRIDE", "PADDING"], ["224", "3", "64", "7", "2", "3"], "conv_template", "conv-perf_224-3-64-7-2-3"), +] +""" diff --git a/include/gemmini.h b/include/gemmini.h index 1035c02b..3f93835f 100644 --- a/include/gemmini.h +++ b/include/gemmini.h @@ -1285,7 +1285,8 @@ static void tiled_matmul_auto(size_t dim_I, size_t dim_J, size_t dim_K, break; } - /* +#ifdef PRINT_TILE +#if PRINT_TILE const int spad_rows = tiled_matmul_total_spad_rows(tile_I, tile_J, tile_K); const int acc_rows = tiled_matmul_total_acc_rows(tile_I, tile_J); @@ -1298,7 +1299,10 @@ static void tiled_matmul_auto(size_t dim_I, size_t dim_J, size_t dim_K, printf("spad_row utilization: %d%%\n", (spad_rows * 100) / max_spad_rows); printf("acc_row utilization: %d%%\n\n", (acc_rows * 100) / max_acc_rows); - */ + + exit(EXIT_SUCCESS); +#endif +#endif tiled_matmul(dim_I, dim_J, dim_K, A, B, D, C, @@ -2688,7 +2692,9 @@ static void tiled_conv_auto( acc_rows = tiled_conv_total_spad_rows(true, stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120, args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride); + */ +#if PRINT_TILE printf("batches = %d\n", batches); printf("orows = %d\n", orows); printf("ocols = %d\n", ocols); @@ -2704,7 +2710,7 @@ static void tiled_conv_auto( printf("accumulator row utilization: %d%%\n\n", (acc_rows*100) / max_acc_rows); printf("inner matmul size: i=%d, j=%d, k=%d\n\n", ocols, ochs, kchs); - */ +#endif tiled_conv( batch_size, in_dim, in_channels,