diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..44a6b67 --- /dev/null +++ b/.clang-format @@ -0,0 +1,9 @@ +BasedOnStyle: LLVM # 基于LLVM的代码风格 +IndentWidth: 4 # 缩进宽度为4个空格 +AccessModifierOffset: -4 # 访问修饰符的偏移量为-4 +ColumnLimit: 100 # 每行代码的最大列数 +AlwaysBreakAfterDefinitionReturnType: None +PenaltyReturnTypeOnItsOwnLine: 1000000 +AlignAfterOpenBracket: BlockIndent +AllowShortIfStatementsOnASingleLine: true + diff --git a/.gitignore b/.gitignore index 43a659c..2cd9651 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,6 @@ test html/ cmake-build* build -obj_dir* \ No newline at end of file +obj_dir* +compile_commands.json +.cache \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..ecf6dff --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "dependencies/membox"] + path = dependencies/membox + url = https://github.com/THU-DSP-LAB/membox.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c633f5..572c637 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,51 +1,24 @@ cmake_minimum_required(VERSION 3.22) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + set(PROJECT ventus_driver) project(${PROJECT}) set(CMAKE_CXX_STANDARD 17) -# set(CMAKE_CXX_FLAGS -Wl,--whole-archive -Wl,--no-whole-archive) - -include_directories(common) -include_directories(include) -include_directories(devices) -include_directories(devices/verilating_device/page_table) -include_directories(tests) -include_directories(driver) - - -set(CMAKE_BUILD_TYPE "Debug") - -if(CMAKE_BUILD_TYPE AND (CMAKE_BUILD_TYPE STREQUAL "Debug")) - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0 -g -ggdb") - message("Debug mode:${CMAKE_CXX_FLAGS_DEBUG}") -# add_executable(test_debug ${src_dirs}) - - -elseif(CMAKE_BUILD_TYPE AND (CMAKE_BUILD_TYPE STREQUAL "Release")) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -O3") - message("Release mode:${CMAKE_CXX_FLAGS_RELEASE}") -# add_executable(test_release 
${src_dirs}) -else() - message("else:${CMAKE_BUILD_TYPE}") - message("else:${CMAKE_CXX_FLAGS_RELEASE}") -# add_executable(test_release ${src_dirs}) -endif() - -if(EXISTS "${CMAKE_SOURCE_DIR}/test.cpp") - set(CODING_TEST test) - add_executable(${CODING_TEST} test.cpp) -endif() - -option(ENABLE_INSTALL "if install driver library to install dir" OFF) - -set(DRIVER_LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/) -message(STATUS "DRIVER_LIB_INSTALL_DIR:" ${DRIVER_LIB_INSTALL_DIR} "\n") -option(ENABLE_VERILATOR "if add verilated rtl device" OFF) +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall") -add_subdirectory(devices) +add_subdirectory(dependencies/membox) +add_subdirectory(common) add_subdirectory(driver) -if(ENABLE_VERILATOR) -add_subdirectory(tests) -endif() -add_subdirectory(codetests) \ No newline at end of file +add_subdirectory(codetests) + +set(DRIVER_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/include/ventus.h + ${CMAKE_CURRENT_SOURCE_DIR}/common/vt_utils.h +) +install(FILES ${DRIVER_PUBLIC_HEADERS} + DESTINATION include COMPONENT "include" +) diff --git a/README.md b/README.md index ceee148..edfd2e0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,97 @@ # ventus-driver -driver code for [ventus-gpgpu](https://github.com/THU-DSP-LAB/ventus-gpgpu) -To build this repository, see [llvm-ventus](https://github.com/THU-DSP-LAB/llvm-project). +## English +[中文版 Chinese](#中文) + +Driver code for Ventus GPGPU project [ventus-gpgpu](https://github.com/THU-DSP-LAB/ventus-gpgpu). It is intended to be used together with other Ventus toolchain projects. See [ventus-env](https://github.com/THU-DSP-LAB/ventus-env). + +### Install +- Recommended: use [ventus-env](https://github.com/THU-DSP-LAB/ventus-env) to deploy the Ventus environment and build via the script build-ventus.sh. + +- Manual CMake build: +```bash +cmake -G Ninja -B build/ -S . 
\ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_INSTALL_PREFIX=../install \ + -DVENTUS_INSTALL_PREFIX=../install \ + -DSPIKE_SRC_DIR=../spike \ + -DDRIVER_ENABLE_AUTOSELECT=ON \ + -DDRIVER_ENABLE_RTLSIM=ON \ + -DDRIVER_ENABLE_CYCLESIM=ON +cmake --build build/ +cmake --install build/ +``` + +### Usage +This repository provides the driver implementation for Ventus GPGPU, mainly handling memory management and device connectivity. + +- Upstream (host): connects to an OpenCL implementation ([POCL](https://github.com/THU-DSP-LAB/pocl)). +- Downstream (devices): supports multiple (simulation) backends: + - Instruction-level simulator [spike](https://github.com/THU-DSP-LAB/ventus-gpgpu-isa-simulator), path: driver/spike_device + - SystemC-based cycle-accurate simulator [cyclesim](https://github.com/THU-DSP-LAB/ventus-gpgpu-cpp-simulator), path: driver/cyclesim_device + - Verilator-based [Chisel RTL](https://github.com/THU-DSP-LAB/ventus-gpgpu) simulation framework [sim-verilator](https://github.com/THU-DSP-LAB/ventus-gpgpu/tree/master/sim-verilator), path: driver/rtlsim_device + - The helper backend driver/auto_select allows choosing a backend via the environment variable VENTUS_BACKEND (see Environment variables). +- Both the upstream and the downstream connections are made through shared libraries. + +This repository builds multiple shared libraries installed into the specified prefix. After configuring environment variables as in [ventus-env](https://github.com/THU-DSP-LAB/ventus-env), you can run OpenCL programs and the driver will be invoked automatically. + +### Environment variables +- `VENTUS_BACKEND` selects the device backend. Allowed values: spike/isa, rtl/rtlsim/gpgpu, cycle/cyclesim/systemc/simulator. +- `VENTUS_WAVEFORM=1` enables waveform dump: fst for rtlsim, vcd for cyclesim. +- `VENTUS_WAVEFORM_BEGIN` and `VENTUS_WAVEFORM_END` define a time window to limit waveform dump for rtlsim (speeds up simulation). Not supported by cyclesim. 
+- `VENTUS_DUMP_RESULT=filename.json` saves all device-to-host copied data and their device addresses into a JSON file for debugging. +- `VENTUS_TIMING_DDR=0` disables DDR timing modeling in cyclesim (enabled by default). RTL does not support DDR timing yet. + +### Example +```bash +VENTUS_BACKEND=rtl VENTUS_DUMP_RESULT=app.rtl.json VENTUS_WAVEFORM=1 ./OpenCLapp.out 2>&1 | tee rtl.log +``` + +--- + +## 中文 +[English Version](#english) + +这是[ventus-gpgpu](https://github.com/THU-DSP-LAB/ventus-gpgpu)的驱动程序,目前仅支持仿真环境,需与其他Ventus工具链项目配合使用,参见 [ventus-env](https://github.com/THU-DSP-LAB/ventus-env)。 + +### Install +推荐使用 [ventus-env](https://github.com/THU-DSP-LAB/ventus-env) 部署 Ventus 环境,使用其中的 build-ventus.sh 脚本来编译安装。 + +手动 cmake 编译命令: +```bash +cmake -G Ninja -B build/ -S . \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_INSTALL_PREFIX=../install \ + -DVENTUS_INSTALL_PREFIX=../install \ + -DSPIKE_SRC_DIR=../spike \ + -DDRIVER_ENABLE_AUTOSELECT=ON \ + -DDRIVER_ENABLE_RTLSIM=ON \ + -DDRIVER_ENABLE_CYCLESIM=ON +cmake --build build/ +cmake --install build/ +``` + +### Usage +本仓库作为乘影 Ventus GPGPU 的驱动实现,主要完成内存管理与设备连接的功能。 +- 向上层连接 OpenCL 实现([POCL](https://github.com/THU-DSP-LAB/pocl))。 +- 向底层连接多种(仿真)设备,目前支持: + - 指令级仿真器 [spike](https://github.com/THU-DSP-LAB/ventus-gpgpu-isa-simulator),对应 driver/spike_device + - 基于 SystemC 的周期级仿真器 [cyclesim](https://github.com/THU-DSP-LAB/ventus-gpgpu-cpp-simulator),对应 driver/cyclesim_device + - 基于 Verilator 搭建的 [Chisel RTL](https://github.com/THU-DSP-LAB/ventus-gpgpu) 仿真框架 [sim-verilator](https://github.com/THU-DSP-LAB/ventus-gpgpu/tree/master/sim-verilator),对应 driver/rtlsim_device + - 另有 driver/auto_select,用户可通过环境变量 VENTUS_BACKEND 指定具体底层设备(见 Usage)。 +- 向上层与向底层的连接均以动态库链接的形式完成。 + +本仓库会编译出多个动态库安装到指定目录下。用户参照 [ventus-env](https://github.com/THU-DSP-LAB/ventus-env) 配置好环境变量后运行 OpenCL 程序即可自动调用。 + +### 环境变量 +- VENTUS_BACKEND 选择底层设备,可选值:spike/isa,rtl/rtlsim/gpgpu,cycle/cyclesim/systemc/simulator。 +- VENTUS_WAVEFORM=1 时,rtlsim 导出 fst 波形,cyclesim 导出 vcd 
波形。 +- 设定 VENTUS_WAVEFORM_BEGIN 与 VENTUS_WAVEFORM_END(数字)可使 rtlsim 仅导出该时间段内波形,加速仿真;cyclesim 不支持。 +- VENTUS_DUMP_RESULT=filename.json 将所有从 device 端拷回 host 端的数据及其设备端地址保存到指定 JSON 文件,辅助调试。 +- VENTUS_TIMING_DDR=0 关闭 cyclesim 中的 DDR 时序仿真(默认开启)。RTL 暂不支持 DDR 时序仿真。 + +### 示例 +```bash +VENTUS_BACKEND=rtl VENTUS_DUMP_RESULT=app.rtl.json VENTUS_WAVEFORM=1 ./OpenCLapp.out 2>&1 | tee rtl.log +``` + diff --git a/TODO b/TODO deleted file mode 100644 index e69de29..0000000 diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt new file mode 100644 index 0000000..7fe0bf4 --- /dev/null +++ b/common/CMakeLists.txt @@ -0,0 +1,20 @@ +set(PROJECT driver_common_utils) +project(${PROJECT}) + +file(GLOB_RECURSE SRCS ./loadelf.cpp) + +add_library(${PROJECT} STATIC ${SRCS}) + +target_link_libraries(${PROJECT} PRIVATE elf) +target_link_libraries(${PROJECT} PRIVATE spdlog) +target_link_libraries(${PROJECT} PRIVATE fmt) +target_include_directories(${PROJECT} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}") +set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +# if(ENABLE_INSTALL) +# message(STATUS "DRIVER_LIB_INSTALL_DIR:" ${DRIVER_LIB_INSTALL_DIR}) +# install(TARGETS ${PROJECT} LIBRARY DESTINATION ${DRIVER_LIB_INSTALL_DIR} COMPONENT "lib") +# endif() diff --git a/common/loadelf.cpp b/common/loadelf.cpp new file mode 100644 index 0000000..43b39a2 --- /dev/null +++ b/common/loadelf.cpp @@ -0,0 +1,94 @@ +#include "loadelf.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +// 分析 ELF 文件,返回所有加载到内存中的段信息 +std::vector get_data_from_elf(const char *filename, std::shared_ptr logger) { + std::vector blocks; + + // 初始化 libelf 库 + if (elf_version(EV_CURRENT) == EV_NONE) { + logger->error("ELF: cannot initialize libelf"); + return std::vector(); + } + + // 打开 ELF 文件 + int fd = open(filename, O_RDONLY); + if (fd < 0) { + 
const char *errstr = strerrordesc_np(errno); + logger->error("ELF: cannot open file '{}': {}", filename, errstr); + return std::vector(); + } + + // 开始 ELF 处理 + Elf *e = elf_begin(fd, ELF_C_READ, nullptr); + if (!e) { + logger->error("ELF: elf_begin failed: {}", elf_errmsg(-1)); + close(fd); + return std::vector(); + } + + // 读取 ELF 头部 + GElf_Ehdr ehdr; + if (gelf_getehdr(e, &ehdr) == nullptr) { + logger->error("ELF: gelf_getehdr failed: {}", elf_errmsg(-1)); + elf_end(e); + close(fd); + return std::vector(); + } + + // 获取程序头表中的段数量 + size_t phnum = 0; + if (elf_getphdrnum(e, &phnum) != 0) { + logger->error("ELF: elf_getphdrnum failed: {}", elf_errmsg(-1)); + elf_end(e); + close(fd); + return std::vector(); + } + + // 遍历所有程序头 + for (size_t i = 0; i < phnum; ++i) { + GElf_Phdr phdr; + if (gelf_getphdr(e, i, &phdr) != &phdr) { + logger->error("ELF: gelf_getphdr header {} failed: {}", i, elf_errmsg(-1)); + return std::vector(); + } + + // 只考虑加载段 + if (phdr.p_type == PT_LOAD) { + MemBlock block; + block.vaddr = phdr.p_vaddr; + block.memsz = phdr.p_memsz; + + // 如果段在文件中有初始化数据,则读取数据 + if (phdr.p_filesz > 0) { + block.data.resize(phdr.p_filesz); + if (lseek(fd, phdr.p_offset, SEEK_SET) == -1) { + const char *errstr = strerrordesc_np(errno); + logger->error("ELF: failed seeking to offset {}: {}", phdr.p_offset, errstr); + return std::vector(); + } else { + ssize_t bytesRead = read(fd, block.data.data(), phdr.p_filesz); + if (bytesRead != (ssize_t)phdr.p_filesz) { + const char *errstr = strerrordesc_np(errno); + logger->error( + "ELF: failed reading {} bytes data: {}", phdr.p_filesz, errstr + ); + return std::vector(); + } + } + } + blocks.push_back(block); + } + } + + elf_end(e); + close(fd); + return blocks; +} diff --git a/common/loadelf.hpp b/common/loadelf.hpp new file mode 100644 index 0000000..c7560af --- /dev/null +++ b/common/loadelf.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include +#include +#include +#include + +// 解析ELF后返回的需要分配的内存块信息 +typedef struct MemBlock { + 
// Parse a boolean written as a word or digit, ignoring case.
//
// Accepts "true"/"1"/"yes"/"on" and "false"/"0"/"no"/"off".
// Returns std::nullopt for any other text.
inline std::optional<bool> parse_bool(std::string str) {
    // std::tolower has undefined behaviour for negative values other than EOF, so the
    // character must reach it as an unsigned char.
    std::transform(str.begin(), str.end(), str.begin(), [](unsigned char c) {
        return static_cast<char>(std::tolower(c));
    });
    if (str == "true" || str == "1" || str == "yes" || str == "on") return true;
    if (str == "false" || str == "0" || str == "no" || str == "off") return false;
    return std::nullopt;
}

// Overload for C strings such as the result of std::getenv; a null pointer yields
// std::nullopt.
inline std::optional<bool> parse_bool(const char *str) {
    if (str == nullptr) return std::nullopt;
    return parse_bool(std::string(str));
}

// Parse an unsigned 64-bit integer.
//
// Supported notations: binary with a 0b/0B prefix, hexadecimal with 0x/0X, octal with a
// leading 0, decimal otherwise.
// Returns std::nullopt when `s` is null or empty, contains trailing characters, carries a
// minus sign, or does not fit in 64 bits.
inline std::optional<uint64_t> parse_u64(const char *s) {
    if (!s || *s == '\0') return std::nullopt;

    // Binary literals are handled by hand.
    if (s[0] == '0' && (s[1] == 'b' || s[1] == 'B')) {
        const char *p = s + 2;
        if (*p == '\0') return std::nullopt;
        uint64_t v = 0;
        for (; *p == '0' || *p == '1'; ++p) {
            // The next shift would push a set bit out of the value: more than 64
            // significant digits.
            if ((v >> 63) != 0) return std::nullopt;
            v = (v << 1) | static_cast<uint64_t>(*p - '0');
        }
        if (*p != '\0') return std::nullopt;
        return v;
    }

    // strtoull accepts a minus sign and returns the negated value wrapped to unsigned
    // (e.g. "-1" -> 2^64-1). That is never a meaningful unsigned input, so reject it.
    const char *first = s;
    while (std::isspace(static_cast<unsigned char>(*first))) ++first;
    if (*first == '-') return std::nullopt;

    errno = 0;
    char *endp = nullptr;
    // base = 0 auto-detects the radix: 0x/0X -> 16, leading 0 -> 8, otherwise 10.
    const unsigned long long v = std::strtoull(s, &endp, 0);
    if (errno != 0 || endp == s || *endp != '\0') return std::nullopt;
    return static_cast<uint64_t>(v);
}
@@ -if(ENABLE_VERILATOR) - add_subdirectory(verilating_device) +option(DRIVER_ENABLE_CYCLESIM "Enable cyclesim device driver" OFF) +option(DRIVER_ENABLE_RTLSIM "Enable rtlsim device driver" OFF) +option(DRIVER_ENABLE_GVM "Enable gvm device driver" OFF) +option(DRIVER_ENABLE_AUTOSELECT "Enable auto select device driver" OFF) + +add_subdirectory(spike_device) +set(DRIVER_DEFAULT "spike_driver") + +# If you enable these, it's assumed that backend libraries are already installed to ${VENTUS_INSTALL_PREFIX}/lib +if(DRIVER_ENABLE_CYCLESIM) + add_subdirectory(cyclesim_device) +endif() +if(DRIVER_ENABLE_RTLSIM) + add_subdirectory(rtlsim_device) endif() +if(DRIVER_ENABLE_GVM) + add_subdirectory(gvm_device) +endif() +if(DRIVER_ENABLE_AUTOSELECT) + add_subdirectory(auto_select) + set(DRIVER_DEFAULT "auto_select_driver") +endif() + -add_subdirectory(spike_device) \ No newline at end of file +install(CODE " + file(CREATE_LINK + lib${DRIVER_DEFAULT}.so + ${CMAKE_INSTALL_PREFIX}/lib/libventus_driver.so + SYMBOLIC + ) +") diff --git a/driver/auto_select/CMakeLists.txt b/driver/auto_select/CMakeLists.txt new file mode 100644 index 0000000..996c21b --- /dev/null +++ b/driver/auto_select/CMakeLists.txt @@ -0,0 +1,21 @@ +set(PROJECT auto_select_driver) +project(${PROJECT}) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../common) + +file(GLOB_RECURSE SRCS ./ventus.cpp) +find_package(nlohmann_json CONFIG REQUIRED) + +add_library(${PROJECT} SHARED ${SRCS}) +target_link_libraries(${PROJECT} PRIVATE spdlog) +target_link_libraries(${PROJECT} PRIVATE fmt) +target_link_libraries(${PROJECT} PRIVATE dl) +target_link_libraries(${PROJECT} PRIVATE nlohmann_json::nlohmann_json) + +set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}") +set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1) + +install(TARGETS ${PROJECT} + LIBRARY DESTINATION lib COMPONENT "lib" +) diff --git 
a/driver/auto_select/ventus.cpp b/driver/auto_select/ventus.cpp new file mode 100644 index 0000000..c6b07a7 --- /dev/null +++ b/driver/auto_select/ventus.cpp @@ -0,0 +1,331 @@ +/* + * Ventus driver for the Auto Select feature. + * This driver is designed to work with the Ventus spike/rtlsim/cyclesim devices. + * It uses dynamic loading to access the Ventus library functions. + */ + +#include "ventus.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// +// 用于导出memcpy_device_to_host的所有数据及其地址(环境变量VENTUS_DUMP_RESULT=filename.json) +// + +// 辅助函数 +static int append_json_object(const std::string &filename, const nlohmann::json &new_obj); +static std::string to_hex_string(uint32_t value) { return fmt::format("0x{:08X}", value); } +// 下边两个函数也可被外界调用,方便再OpenCL APP中获取设备端指针具体地址 +// 获取上次memcpy_device_to_host的设备端地址 +extern "C" int __vt_get_last_copy_to_dev_addr(uint64_t *addr); +// 设置json dump文件名,传入nullptr表示关闭dump功能,等价于VENTUS_DUMP_RESULT环境变量 +extern "C" void __vt_enable_dump_json_copy_to_dev(const char *filename); +// 全局变量 +static uint64_t g_last_copy_to_dev_addr = 0; // 上次memcpy_device_to_host的设备端地址 +static std::optional g_dump_result_filename = std::nullopt; + +// +// 定义函数指针结构体,包含所有ventus.h API的函数指针 +// +struct vt_api_t { + int (*vt_dev_open)(vt_device_h *hdevice); + int (*vt_dev_close)(vt_device_h hdevice); + int (*vt_dev_caps)(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value); + int (*vt_root_mem_alloc)(vt_device_h hdevice, int taskID); + int (*vt_root_mem_free)(vt_device_h hdevice, int taskID); + int (*vt_buf_alloc)( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, + uint64_t kernelID + ); + int (*vt_buf_free)( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID + ); + int (*vt_one_buf_free)( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID + ); + int (*vt_copy_to_dev)( + 
vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, + uint64_t taskID, uint64_t kernelID + ); + int (*vt_copy_from_dev)( + vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID + ); + int (*vt_start)(vt_device_h hdevice, void *metaData, uint64_t taskID); + int (*vt_ready_wait)(vt_device_h hdevice, uint64_t timeout); + int (*vt_finish_all_kernel)(vt_device_h hdevice, std::queue *finished_kernel_list); + int (*vt_upload_kernel_bytes)( + vt_device_h device, const void *content, uint64_t size, int taskID + ); + int (*vt_upload_kernel_file)(vt_device_h device, const char *filename, int kernelID); + int (*vt_dump_perf)(vt_device_h device, FILE *stream); +} vt_api = {0}; + +// 加载后端库并设置函数指针 +vt_api_t load_backend() { + vt_api_t api = {0}; // 初始化函数指针结构体 + + // 读取环境变量 VENTUS_BACKEND,确定动态库名 + const char *backend_ = std::getenv("VENTUS_BACKEND"); + std::string backend = backend_ ? backend_ : "spike"; + std::transform(backend.begin(), backend.end(), backend.begin(), [](unsigned char c) { + return std::tolower(c); + }); + std::map backend_map; + backend_map["isa"] = "libspike_driver.so"; + backend_map["spike"] = "libspike_driver.so"; + backend_map["rtlsim"] = "librtlsim_driver.so"; + backend_map["rtl"] = "librtlsim_driver.so"; + backend_map["gpgpu"] = "librtlsim_driver.so"; + backend_map["cycle"] = "libcyclesim_driver.so"; + backend_map["cyclesim"] = "libcyclesim_driver.so"; + backend_map["gvm"] = "libgvm_driver.so"; + backend_map["simulator"] = "libcyclesim_driver.so"; + backend_map["systemc"] = "libcyclesim_driver.so"; + + std::string backend_soname; + if (backend_map.find(backend) != backend_map.end()) { + backend_soname = backend_map[backend]; + } else { + SPDLOG_ERROR("Unsupported VENTUS_BACKEND: {}", backend); + std::exit(EXIT_FAILURE); + } + + // 获取自身路径,基于此查找其它动态库 + Dl_info info; + if (dladdr((void *)load_backend, &info) == 0) { + SPDLOG_ERROR("dlopen failed to get current library path."); + 
std::exit(EXIT_FAILURE); + } + std::filesystem::path self_path(info.dli_fname); + self_path = self_path.parent_path(); // 获取当前库所在目录 + + // 构建后端库路径,例如 "install/lib/liba.so" + std::string lib_path = self_path / backend_soname; + void *handle = dlopen(lib_path.c_str(), RTLD_LAZY); + if (!handle) { + SPDLOG_ERROR("dlopen failed to load backend library: {}", dlerror()); + std::exit(EXIT_FAILURE); + } + + // 获取所有 API 的函数指针 + // clang-format off + api.vt_dev_open = (int (*)(vt_device_h*))dlsym(handle, "vt_dev_open"); + api.vt_dev_close = (int (*)(vt_device_h))dlsym(handle, "vt_dev_close"); + api.vt_dev_caps = (int (*)(vt_device_h*, uint64_t, uint64_t*))dlsym(handle, "vt_dev_caps"); + api.vt_root_mem_alloc = (int (*)(vt_device_h, int))dlsym(handle, "vt_root_mem_alloc"); + api.vt_root_mem_free = (int (*)(vt_device_h, int))dlsym(handle, "vt_root_mem_free"); + api.vt_buf_alloc = (int (*)(vt_device_h, uint64_t, uint64_t*, int, uint64_t, uint64_t))dlsym(handle, "vt_buf_alloc"); + api.vt_buf_free = (int (*)(vt_device_h, uint64_t, uint64_t*, uint64_t, uint64_t))dlsym(handle, "vt_buf_free"); + api.vt_one_buf_free = (int (*)(vt_device_h, uint64_t, uint64_t*, uint64_t, uint64_t))dlsym(handle, "vt_one_buf_free"); + api.vt_copy_to_dev = (int (*)(vt_device_h, uint64_t, const void*, uint64_t, uint64_t, uint64_t))dlsym(handle, "vt_copy_to_dev"); + api.vt_copy_from_dev = (int (*)(vt_device_h, uint64_t, void*, uint64_t, uint64_t, uint64_t))dlsym(handle, "vt_copy_from_dev"); + api.vt_start = (int (*)(vt_device_h, void*, uint64_t))dlsym(handle, "vt_start"); + api.vt_ready_wait = (int (*)(vt_device_h, uint64_t))dlsym(handle, "vt_ready_wait"); + api.vt_finish_all_kernel = (int (*)(vt_device_h, std::queue*))dlsym(handle, "vt_finish_all_kernel"); + api.vt_upload_kernel_bytes = (int (*)(vt_device_h, const void*, uint64_t, int))dlsym(handle, "vt_upload_kernel_bytes"); + api.vt_upload_kernel_file = (int (*)(vt_device_h, const char*, int))dlsym(handle, "vt_upload_kernel_file"); + api.vt_dump_perf = 
(int (*)(vt_device_h, FILE*))dlsym(handle, "vt_dump_perf"); + // clang-format on + + // 检查是否所有函数指针都成功获取 + if (!api.vt_dev_open || !api.vt_dev_close || !api.vt_dev_caps || !api.vt_root_mem_alloc || + !api.vt_root_mem_free || !api.vt_buf_alloc || !api.vt_buf_free || !api.vt_one_buf_free || + !api.vt_copy_to_dev || !api.vt_copy_from_dev || !api.vt_start || !api.vt_ready_wait || + !api.vt_finish_all_kernel || !api.vt_upload_kernel_bytes || !api.vt_upload_kernel_file || + !api.vt_dump_perf) { + // 如果有任何函数指针获取失败,返回空的 api 结构体 + // 不显式调用 dlclose,依赖操作系统清理 + SPDLOG_ERROR("Failed to load all required functions from backend library"); + std::exit(EXIT_FAILURE); + } + + // 成功加载,返回设置好的 api 结构体 + // handle 不保存,依赖操作系统在进程结束时自动卸载 + return api; +} + +// 使用静态变量实现线程安全的懒加载 +struct BackendLoader { + vt_api_t api = {0}; + bool loaded = false; + BackendLoader() { api = load_backend(); } +}; +static BackendLoader loader; + +// 实现所有 API 函数,使用 extern "C" 确保符号正确导出 +extern "C" int vt_dev_open(vt_device_h *hdevice) { + if (!loader.loaded) { + loader.api = load_backend(); // 动态加载后端库 + loader.loaded = true; // 标记为已加载 + } + if (!loader.api.vt_dev_open) return -1; + const char *env_dump_result = std::getenv("VENTUS_DUMP_RESULT"); + if (env_dump_result == nullptr) { + env_dump_result = std::getenv("VENTUS_DRIVER_DUMP_RESULT"); // capability name + } + if (!g_dump_result_filename && env_dump_result) { + g_dump_result_filename = std::string{env_dump_result}; + std::ofstream ofs(*g_dump_result_filename, std::ios::trunc | std::ios::out); + ofs.close(); // 清空文件 + } + return loader.api.vt_dev_open(hdevice); +} + +extern "C" int vt_dev_close(vt_device_h hdevice) { + if (!loader.api.vt_dev_close) return -1; + return loader.api.vt_dev_close(hdevice); +} + +extern "C" int vt_dev_caps(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value) { + if (!loader.api.vt_dev_caps) return -1; + return loader.api.vt_dev_caps(hdevice, caps_id, value); +} + +extern "C" int vt_root_mem_alloc(vt_device_h hdevice, int taskID) 
{ + if (!loader.api.vt_root_mem_alloc) return -1; + return loader.api.vt_root_mem_alloc(hdevice, taskID); +} + +extern "C" int vt_root_mem_free(vt_device_h hdevice, int taskID) { + if (!loader.api.vt_root_mem_free) return -1; + return loader.api.vt_root_mem_free(hdevice, taskID); +} + +extern "C" int vt_buf_alloc( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, + uint64_t kernelID +) { + if (!loader.api.vt_buf_alloc) return -1; + return loader.api.vt_buf_alloc(hdevice, size, vaddr, BUF_TYPE, taskID, kernelID); +} + +extern "C" int vt_buf_free( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID +) { + if (!loader.api.vt_buf_free) return -1; + return loader.api.vt_buf_free(hdevice, size, vaddr, taskID, kernelID); +} + +extern "C" int vt_one_buf_free( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID +) { + if (!loader.api.vt_one_buf_free) return -1; + return loader.api.vt_one_buf_free(hdevice, size, vaddr, taskID, kernelID); +} + +extern "C" int vt_copy_to_dev( + vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID +) { + if (!loader.api.vt_copy_to_dev) return -1; + return loader.api.vt_copy_to_dev(hdevice, dev_vaddr, src_addr, size, taskID, kernelID); +} + +extern "C" int vt_copy_from_dev( + vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID +) { + if (!loader.api.vt_copy_from_dev) return -1; + int result = loader.api.vt_copy_from_dev(hdevice, dev_vaddr, dst_addr, size, taskID, kernelID); + if (result == 0) { + g_last_copy_to_dev_addr = dev_vaddr; + } + if (g_dump_result_filename) { + nlohmann::json j; + j["address"] = to_hex_string(dev_vaddr); + j["size"] = to_hex_string(size); + nlohmann::json addr_data; + for (size_t i = 0; i < (size + 3) / 4; i++) { + addr_data[to_hex_string(dev_vaddr + i * 4)] = to_hex_string(((uint32_t 
*)dst_addr)[i]); + } + j["data"] = addr_data; + append_json_object(*g_dump_result_filename, j); + } + return result; +} + +extern "C" int vt_start(vt_device_h hdevice, void *metaData, uint64_t taskID) { + if (!loader.api.vt_start) return -1; + return loader.api.vt_start(hdevice, metaData, taskID); +} + +extern "C" int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) { + if (!loader.api.vt_ready_wait) return -1; + return loader.api.vt_ready_wait(hdevice, timeout); +} + +extern "C" int vt_finish_all_kernel(vt_device_h hdevice, std::queue *finished_kernel_list) { + if (!loader.api.vt_finish_all_kernel) return -1; + return loader.api.vt_finish_all_kernel(hdevice, finished_kernel_list); +} + +extern "C" int vt_upload_kernel_bytes( + vt_device_h device, const void *content, uint64_t size, int taskID +) { + if (!loader.api.vt_upload_kernel_bytes) return -1; + return loader.api.vt_upload_kernel_bytes(device, content, size, taskID); +} + +extern "C" int vt_upload_kernel_file(vt_device_h device, const char *filename, int kernelID) { + if (!loader.api.vt_upload_kernel_file) return -1; + return loader.api.vt_upload_kernel_file(device, filename, kernelID); +} + +extern "C" int vt_dump_perf(vt_device_h device, FILE *stream) { + if (!loader.api.vt_dump_perf) return -1; + return loader.api.vt_dump_perf(device, stream); +} + +extern "C" int __vt_get_last_copy_to_dev_addr(uint64_t *addr) { + *addr = g_last_copy_to_dev_addr; + return 0; +} + +extern "C" void __vt_enable_dump_json_copy_to_dev(const char *filename) { + if (!filename) { + g_dump_result_filename = std::nullopt; + return; + } + if (!g_dump_result_filename || *g_dump_result_filename != filename) { + g_dump_result_filename = std::string{filename}; + std::ofstream ofs(*g_dump_result_filename, std::ios::trunc | std::ios::out); + ofs.close(); // 清空文件 + } +} + +static int append_json_object(const std::string &filename, const nlohmann::json &new_obj) { + nlohmann::json root; + std::ifstream ifs(filename); + if (ifs.is_open() 
&& ifs.peek() != std::ifstream::traits_type::eof()) { + ifs >> root; + ifs.close(); + if (!root.is_array()) { + SPDLOG_ERROR("Error: File is not a JSON array: {}", filename); + return 1; + } + } else { + root = nlohmann::json::array(); + } + + root.push_back(new_obj); + + std::ofstream ofs(filename); + if (!ofs.is_open()) { + SPDLOG_ERROR("Unable to open json dump file: {}", filename); + return 1; + } + ofs << root.dump(4); + ofs.close(); + return 0; +} diff --git a/driver/cyclesim_device/CMakeLists.txt b/driver/cyclesim_device/CMakeLists.txt new file mode 100644 index 0000000..a214b2f --- /dev/null +++ b/driver/cyclesim_device/CMakeLists.txt @@ -0,0 +1,29 @@ +set(PROJECT cyclesim_driver) +project(${PROJECT}) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../common) +include_directories(${VENTUS_INSTALL_PREFIX}/include) + +file(GLOB_RECURSE SRCS ventus.cpp) + +add_library(${PROJECT} SHARED ${SRCS}) +target_link_directories(${PROJECT} PRIVATE ${VENTUS_INSTALL_PREFIX}/lib) +target_link_libraries(${PROJECT} PRIVATE VentusCycleSim) +target_link_libraries(${PROJECT} PRIVATE driver_common_utils) +target_link_libraries(${PROJECT} PRIVATE spdlog) +target_link_libraries(${PROJECT} PRIVATE fmt) + +target_compile_definitions(${PROJECT} PRIVATE + SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE +) +set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}") +set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT} PROPERTIES + BUILD_RPATH "${VENTUS_INSTALL_PREFIX}/lib" + INSTALL_RPATH "$ORIGIN" +) + +install(TARGETS ${PROJECT} + LIBRARY DESTINATION lib COMPONENT "lib" +) diff --git a/driver/cyclesim_device/ventus.cpp b/driver/cyclesim_device/ventus.cpp new file mode 100644 index 0000000..fb978e8 --- /dev/null +++ b/driver/cyclesim_device/ventus.cpp @@ -0,0 +1,289 @@ +/** + * @file ventus.cpp + * @brief 设备和OpenCL程序的交互功能的实现 + * + * 1. 
`/include/ventus.h`中声明的函数 + */ + +#include "ventus.h" +#include "loadelf.hpp" +#include "utils.hpp" +#include "ventus_cyclesim.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct driver_metadata_t { + uint64_t kernel_id; + uint64_t kernel_size[3]; ///> 每个kernel的workgroup三维数目 + uint64_t wf_size; ///> 每个warp的thread数目 + uint64_t wg_size; ///> 每个workgroup的warp数目 + uint64_t metaDataBaseAddr; ///> CSR_KNL的值, + uint64_t ldsSize; ///> 每个workgroup使用的local memory的大小 + uint64_t pdsSize; ///> 每个thread用到的private memory大小 + uint64_t sgprUsage; ///> 每个workgroup使用的标量寄存器数目 + uint64_t vgprUsage; ///> 每个thread使用的向量寄存器数目 + uint64_t pdsBaseAddr; ///> private memory的基址,要转成每个workgroup的基地址, + /// wf_size*wg_size*pdsSize + const char *kernel_name; +} driver_metadata_t; + +static std::map g_ptroots; // pagetable root physical address +static std::shared_ptr logger; +static uint64_t g_alloc_vaddr = 0x90000000; +static std::vector> g_elf_alloc; + +/// open the device and connect to it +extern int vt_dev_open(vt_device_h *hdevice) { + if (hdevice == nullptr) return -1; + ventus_cyclesim_config_t config; + ventus_cyclesim_get_default_config(&config); + config.sim_time_max = ~0ull; + config.ramulator.enable = parse_bool(std::getenv("VENTUS_TIMING_DDR")).value_or(true); + config.waveform.enable = parse_bool(std::getenv("VENTUS_WAVEFORM")).value_or(false); + config.waveform.enable |= parse_u64(std::getenv("VENTUS_WAVEFORM_BEGIN")).has_value(); + config.waveform.enable |= parse_u64(std::getenv("VENTUS_WAVEFORM_END")).has_value(); + config.waveform.filename = "waveform.cycle"; + auto device = ventus_cyclesim_init(&config); + *hdevice = device; + logger = spdlog::stdout_color_mt("ventus"); + logger->set_level(spdlog::level::debug); + SPDLOG_LOGGER_DEBUG(logger, "vt_dev_open : hello world from ventus.cpp (cyclesim device)"); + + // TODO: temp + // POCL should call vt_root_mem_alloc() to create virtual memory space before any buf_alloc + // but 
currently it seems not. So we create a default root page table here. + uint64_t ptroot = ventus_cyclesim_vmem_create(device); + if (ptroot == 0) return -1; + g_ptroots[0] = ptroot; + return 0; +} + +/// Close the device when all the operations are done +extern int vt_dev_close(vt_device_h hdevice) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + ventus_cyclesim_finish(device, false); + SPDLOG_LOGGER_DEBUG(logger, "vt_dev_close: goodbye from ventus.cpp (cyclesim device)"); + return 0; +} +int vt_dev_caps(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value) { + // TODO: Not implemented yet + return -1; +} + +extern int vt_buf_alloc( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, + uint64_t kernelID +) { + if (size <= 0 || hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + uint64_t vaddr_allocated = + ventus_cyclesim_vmem_alloc(device, g_ptroots[taskID], g_alloc_vaddr, size); + SPDLOG_LOGGER_DEBUG( + logger, "vt_buf_alloc: vaddr_recommand={:x}, vaddr_allocated={:x}, size=0x{:x}, taskID={}", + *vaddr, vaddr_allocated, size, taskID + ); + g_alloc_vaddr += (size > 0x1000) ? 
size : 0x1000; + *vaddr = vaddr_allocated; + if (*vaddr == 0) return -1; + return 0; +} + +extern int vt_buf_free( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + ventus_cyclesim_vmem_free(device, g_ptroots[taskID], *vaddr, size); + SPDLOG_LOGGER_DEBUG( + logger, "vt_buf_free: vaddr=0x{:x}, size=0x{:x}, taskID={}", *vaddr, size, taskID + ); + return 0; +} + +extern int vt_one_buf_free( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + ventus_cyclesim_vmem_free(device, g_ptroots[taskID], *vaddr, size); + SPDLOG_LOGGER_DEBUG( + logger, "vt_buf_free: vaddr=0x{:x}, size=0x{:x}, taskID={}", *vaddr, size, taskID + ); + return 0; +} + +/** + * @brief 为设备分配内存,返回根页表的地址 + * @param hdevice + * @param size + * @return int + */ +extern int vt_root_mem_alloc(vt_device_h hdevice, int taskID) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + auto ptroot = ventus_cyclesim_vmem_create(device); + if (ptroot == 0) return -1; + SPDLOG_LOGGER_DEBUG(logger, "vt_root_mem_alloc: taskID={}, ptroot={:x}", taskID, ptroot); + g_ptroots[taskID] = ptroot; + return 0; +} + +/** + * 释放taskID(对应context)的根页表 + * @param hdevice + * @param taskID + * @return + */ +extern int vt_root_mem_free(vt_device_h hdevice, int taskID) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + ventus_cyclesim_vmem_destroy(device, g_ptroots[taskID]); + g_ptroots.erase(taskID); + SPDLOG_LOGGER_DEBUG( + logger, "vt_root_mem_free: taskID={}, ptroot={:x}", taskID, g_ptroots[taskID] + ); + return 0; +} + +extern int vt_copy_to_dev( + vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + if (dev_vaddr >= 0x70000000 && dev_vaddr 
< 0x80000000) { + SPDLOG_LOGGER_ERROR( + logger, "vt_copy_to_dev: dev_vaddr={:x} in LDS space, not supportted", dev_vaddr + ); + return 0; + } + auto device = static_cast(hdevice); + SPDLOG_LOGGER_DEBUG( + logger, "vt_copy_to_dev: dev_vaddr={:x}, size=0x{:x}, taskID={}, kernelID={}", dev_vaddr, + size, taskID, kernelID + ); + ventus_cyclesim_vmemcpy_h2d(device, g_ptroots[taskID], dev_vaddr, src_addr, size); + return 0; +} + +extern int vt_copy_from_dev( + vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + SPDLOG_LOGGER_DEBUG( + logger, "vt_copy_from_dev: dev_vaddr={:x}, size=0x{:x}, taskID={}, kernelID={}", dev_vaddr, + size, taskID, kernelID + ); + ventus_cyclesim_vmemcpy_d2h(device, g_ptroots[taskID], dst_addr, dev_vaddr, size); + return 0; +} + +extern int vt_start(vt_device_h hdevice, void *mtd_raw, uint64_t taskID) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + auto mtd_driver = static_cast(mtd_raw); + static uint32_t kernel_cnt = 0; + ventus_kernel_metadata_t mtd_sim{ + .name = mtd_driver->kernel_name, + // .kernel_id = mtd_driver->kernel_id, + .kernel_id = kernel_cnt++, + .data = nullptr, + .startaddr = 0x80000000, + .kernel_size = + {mtd_driver->kernel_size[0], mtd_driver->kernel_size[1], mtd_driver->kernel_size[2]}, + .wf_size = mtd_driver->wf_size, + .wg_size = mtd_driver->wg_size, + .metaDataBaseAddr = mtd_driver->metaDataBaseAddr, + .ldsSize = mtd_driver->ldsSize, + .pdsSize = mtd_driver->pdsSize, + .sgprUsage = mtd_driver->sgprUsage, + .vgprUsage = mtd_driver->vgprUsage, + .pdsBaseAddr = mtd_driver->pdsBaseAddr, + .num_buffer = 0, + .buffer_base = nullptr, + .buffer_size = nullptr, + .buffer_allocsize = nullptr, + .pagetable = g_ptroots[taskID], + }; + ventus_cyclesim_add_kernel(device, &mtd_sim, nullptr); + SPDLOG_LOGGER_DEBUG( + logger, + "vt_start: taskID={}, kernelID={}, 
kernel_size=({},{},{}), " + "wgsize={}, wfsize={}, pds_size=0x{:x}, lds_size=0x{:x}, addr_meta=0x{:x}, addr_pds=0x{:x}", + taskID, mtd_sim.kernel_id, mtd_sim.kernel_size[0], mtd_sim.kernel_size[1], + mtd_sim.kernel_size[2], mtd_sim.wg_size, mtd_sim.wf_size, mtd_sim.pdsSize, mtd_sim.ldsSize, + mtd_sim.metaDataBaseAddr, mtd_sim.pdsBaseAddr + ); + return 0; +} + +extern int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + uint64_t timeout_ns = timeout * 1000000; + while (!ventus_cyclesim_is_idle(device) && ventus_cyclesim_get_time(device) < timeout_ns) { + ventus_cyclesim_step(device); + } + return 0; +} + +extern int vt_finish_all_kernel(vt_device_h hdevice, std::queue *finished_kernel_list) { + // TODO: what is this function for? what is finished_kernel_list? + return -1; +} + +extern int vt_upload_kernel_file(vt_device_h hdevice, const char *filename, int taskID) { + if (hdevice == nullptr) return -1; + auto device = (ventus_cyclesim_t *)hdevice; + uint64_t ptroot = g_ptroots[taskID]; + + // parse ELF file, find .text and other data sections + const auto blocks = get_data_from_elf(filename, logger); + if (blocks.empty()) { + return -1; // at least .text section is needed + } + + for (const auto &to_free : g_elf_alloc) { + // free previous ELF allocations + ventus_cyclesim_vmem_free(device, ptroot, to_free.first, to_free.second); + } + g_elf_alloc.clear(); + + // alloc and load/zero-fill each block + for (auto block = blocks.begin(); block != blocks.end(); block++) { + uint64_t vaddr = block->vaddr; + uint64_t size = block->memsz; + uint64_t vaddr_allocated = ventus_cyclesim_vmem_alloc(device, ptroot, vaddr, size); + if (vaddr_allocated != vaddr) { + ventus_cyclesim_vmem_free(device, ptroot, vaddr_allocated, size); + for (auto need_free = blocks.begin(); need_free != block; need_free++) { + ventus_cyclesim_vmem_free(device, ptroot, need_free->vaddr, need_free->memsz); + } + return -1; 
+ } + g_elf_alloc.push_back(std::make_pair(vaddr, size)); + SPDLOG_LOGGER_DEBUG( + logger, "vt_upload_kernel_file {}: vaddr={:x}, size=0x{:x}", filename, vaddr, size + ); + ventus_cyclesim_vmemcpy_h2d(device, ptroot, vaddr, block->data.data(), block->data.size()); + std::vector zeros(size - block->data.size(), 0); + ventus_cyclesim_vmemcpy_h2d( + device, ptroot, vaddr + block->data.size(), zeros.data(), zeros.size() + ); + } + + return 0; +} +int vt_upload_kernel_bytes(vt_device_h device, const void *content, uint64_t size, int taskID) { + return 0; +} +int vt_dump_perf(vt_device_h device, FILE *stream) { return 0; } diff --git a/driver/gvm_device/.clang-format b/driver/gvm_device/.clang-format new file mode 100644 index 0000000..44a6b67 --- /dev/null +++ b/driver/gvm_device/.clang-format @@ -0,0 +1,9 @@ +BasedOnStyle: LLVM # 基于LLVM的代码风格 +IndentWidth: 4 # 缩进宽度为4个空格 +AccessModifierOffset: -4 # 访问修饰符的偏移量为-4 +ColumnLimit: 100 # 每行代码的最大列数 +AlwaysBreakAfterDefinitionReturnType: None +PenaltyReturnTypeOnItsOwnLine: 1000000 +AlignAfterOpenBracket: BlockIndent +AllowShortIfStatementsOnASingleLine: true + diff --git a/driver/gvm_device/CMakeLists.txt b/driver/gvm_device/CMakeLists.txt new file mode 100644 index 0000000..1854f43 --- /dev/null +++ b/driver/gvm_device/CMakeLists.txt @@ -0,0 +1,26 @@ +set(PROJECT gvm_driver) +project(${PROJECT}) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../common) +include_directories(${VENTUS_INSTALL_PREFIX}/include) + +file(GLOB_RECURSE SRCS ./ventus.cpp) + +add_library(${PROJECT} SHARED ${SRCS}) +target_link_directories(${PROJECT} PRIVATE ${VENTUS_INSTALL_PREFIX}/lib) +target_link_libraries(${PROJECT} PRIVATE VentusGVM) +target_link_libraries(${PROJECT} PRIVATE driver_common_utils) +target_link_libraries(${PROJECT} PRIVATE spdlog) +target_link_libraries(${PROJECT} PRIVATE fmt) + +set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}") 
+set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT} PROPERTIES + BUILD_RPATH "${VENTUS_INSTALL_PREFIX}/lib" + INSTALL_RPATH "$ORIGIN" +) + +install(TARGETS ${PROJECT} + LIBRARY DESTINATION lib COMPONENT "lib" +) diff --git a/driver/gvm_device/ventus.cpp b/driver/gvm_device/ventus.cpp new file mode 100644 index 0000000..4811e9d --- /dev/null +++ b/driver/gvm_device/ventus.cpp @@ -0,0 +1,313 @@ +/** + * @file ventus.cpp + * @brief 设备和OpenCL程序的交互功能的实现 + * + * 1. `/include/ventus.h`中声明的函数 + */ + +#include +#include "ventus.h" +#include "loadelf.hpp" +#include "ventus_gvm.h" +#include +#include +#include +#include +#include +#include +#include + +typedef struct driver_metadata_t { + uint64_t kernel_id; + uint64_t kernel_size[3]; ///> 每个kernel的workgroup三维数目 + uint64_t wf_size; ///> 每个warp的thread数目 + uint64_t wg_size; ///> 每个workgroup的warp数目 + uint64_t metaDataBaseAddr; ///> CSR_KNL的值, + uint64_t ldsSize; ///> 每个workgroup使用的local memory的大小 + uint64_t pdsSize; ///> 每个thread用到的private memory大小 + uint64_t sgprUsage; ///> 每个workgroup使用的标量寄存器数目 + uint64_t vgprUsage; ///> 每个thread使用的向量寄存器数目 + uint64_t pdsBaseAddr; ///> private memory的基址,要转成每个workgroup的基地址, + /// wf_size*wg_size*pdsSize +} driver_metadata_t; + +// static std::map ptroots; // pagetable root physical address +static std::shared_ptr logger; +static uint64_t alloc_vaddr = 0x90000000; + +/// open the device and connect to it +extern int vt_dev_open(vt_device_h *hdevice) { + if (hdevice == nullptr) return -1; + fw_vt_dev_open(); + + auto env_waveform = std::getenv("VENTUS_WAVEFORM"); + auto env_waveform_begin = std::getenv("VENTUS_WAVEFORM_BEGIN"); + auto env_waveform_end = std::getenv("VENTUS_WAVEFORM_END"); + bool waveform_enable = false; + uint64_t waveform_begin = UINT64_MAX; // default: not enable + uint64_t waveform_end = 0; + if (parse_bool(env_waveform).value_or(false)) { + waveform_begin = 0; // default: dump waveform all time + waveform_end = UINT64_MAX; + } 
+ waveform_begin = parse_u64(env_waveform_begin).value_or(waveform_begin); + waveform_end = parse_u64(env_waveform_end).value_or(waveform_end); + waveform_enable = waveform_end > waveform_begin; + + ventus_rtlsim_config_t config; + ventus_rtlsim_get_default_config(&config); + config.sim_time_max = ~0ull; + config.pmem.auto_alloc = true; + config.waveform.enable = waveform_enable; + config.waveform.time_begin = waveform_begin; + config.waveform.time_end = waveform_end; + config.waveform.filename = "waveform.gvm.fst"; + config.snapshot.enable = false; + config.log.console.enable = true; + config.log.console.level = "trace"; + config.log.file.enable = false; + auto device = ventus_rtlsim_init(&config); + *hdevice = device; + logger = spdlog::stdout_color_mt("ventus"); + logger->set_level(spdlog::level::trace); + logger->debug("vt_dev_open : hello world from ventus.cpp (gvm device)"); + return 0; +} + +/// Close the device when all the operations are done +extern int vt_dev_close(vt_device_h hdevice) { + if (hdevice == nullptr) return -1; + fw_vt_dev_close(); + auto device = static_cast(hdevice); + ventus_rtlsim_finish(device, false); + logger->debug("vt_dev_close : goodbye from ventus.cpp (gvm device)"); + return 0; +} +int vt_dev_caps(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value) { + if (value == nullptr) return -1; +#define GET_PARAM(key) \ + do { \ + uint32_t val; \ + if (ventus_rtlsim_get_parameter(key, &val) == 0) { \ + *value = val; \ + return 0; \ + } else { \ + SPDLOG_LOGGER_ERROR(logger, "vt_dev_caps: get parameter {} failed", key); \ + return -1; \ + } \ + } while (0) + switch (caps_id) { + case VT_CAPS_MAX_CORES: + GET_PARAM("num_sm"); + case VT_CAPS_MAX_WARPS: + GET_PARAM("num_warp"); + case VT_CAPS_MAX_THREADS: + GET_PARAM("num_thread"); + case VT_CAPS_LOCAL_MEM_SIZE: + GET_PARAM("sharemem_size"); + default: + SPDLOG_LOGGER_ERROR( + logger, "vt_dev_caps: unknown caps_id {} (or not implemented)", caps_id + ); + return -1; + } + return -1; +} + 
+extern int vt_buf_alloc( + vt_device_h hdevice, const uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, + uint64_t kernelID +) { + // TODO: RTLSIM does not support Virtual Memory yet + if (size <= 0 || hdevice == nullptr) return -1; + + uint64_t* fw_vaddr = new uint64_t; + fw_vt_buf_alloc(size, fw_vaddr, BUF_TYPE, taskID, kernelID); + + *vaddr = *fw_vaddr; + if (*vaddr == 0) return -1; + delete fw_vaddr; + return 0; +} + +extern int vt_buf_free( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID +) { + // if (hdevice == nullptr) return -1; + // auto device = static_cast(hdevice); + // ventus_rtlsim_vmem_free(device, ptroots[taskID], *vaddr, size); + + uint64_t* fw_vaddr = new uint64_t; + *fw_vaddr = *vaddr; + fw_vt_buf_free(size, fw_vaddr, taskID, kernelID); + delete fw_vaddr; + + return 0; +} + +extern int vt_one_buf_free( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID +) { + // if (hdevice == nullptr) return -1; + // auto device = static_cast(hdevice); + // ventus_rtlsim_vmem_free(device, ptroots[taskID], *vaddr, size); + + uint64_t* fw_vaddr = new uint64_t; + *fw_vaddr = *vaddr; + fw_vt_one_buf_free(size, fw_vaddr, taskID, kernelID); + delete fw_vaddr; + + return 0; +} + +/** + * @brief 为设备分配内存,返回根页表的地址 + * @param hdevice + * @param size + * @return int + */ +extern int vt_root_mem_alloc(vt_device_h hdevice, int taskID) { + // if (hdevice == nullptr) return -1; + // auto device = static_cast(hdevice); + // auto ptroot = ventus_rtlsim_vmem_create(device); + // if (ptroot == 0) return -1; + // logger->debug("vt_root_mem_alloc: taskID={}, ptroot={:x}", taskID, ptroot); + // ptroots[taskID] = ptroot; + if (taskID == 0) { + logger->error("RTLSIM_device does not support VMEM yet, taskID must be 0"); + } + return 0; +} + +/** + * 释放taskID(对应context)的根页表 + * @param hdevice + * @param taskID + * @return + */ +extern int vt_root_mem_free(vt_device_h hdevice, int taskID) { + // 
if (hdevice == nullptr) return -1; + // auto device = static_cast(hdevice); + // ventus_rtlsim_vmem_destroy(device, ptroots[taskID]); + // ptroots.erase(taskID); + return 0; +} + +extern int vt_copy_to_dev( + vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + fw_vt_copy_to_dev(dev_vaddr, src_addr, size, taskID, kernelID); + auto device = static_cast(hdevice); + logger->debug( + "vt_copy_to_dev: dev_addr={:x}, size={}, taskID={}, kernelID={}", dev_vaddr, size, taskID, + kernelID + ); + ventus_rtlsim_pmemcpy_h2d(device, dev_vaddr, src_addr, size); + return 0; +} + +extern int vt_copy_from_dev( + vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + SPDLOG_LOGGER_DEBUG(logger, + "vt_copy_from_dev: dev_addr={:x}, size={}, taskID={}, kernelID={}", dev_vaddr, size, taskID, + kernelID + ); + ventus_rtlsim_pmemcpy_d2h(device, dst_addr, dev_vaddr, size); + return 0; +} + +extern int vt_start(vt_device_h hdevice, void *mtd_raw, uint64_t taskID) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + auto mtd_driver = static_cast(mtd_raw); + ventus_kernel_metadata_t mtd_sim{ + .name = "UnknownKernelName", + .data = nullptr, + .startaddr = 0x80000000, + .kernel_id = mtd_driver->kernel_id, + .kernel_size = + {mtd_driver->kernel_size[0], mtd_driver->kernel_size[1], mtd_driver->kernel_size[2]}, + .wf_size = mtd_driver->wf_size, + .wg_size = mtd_driver->wg_size, + .metaDataBaseAddr = mtd_driver->metaDataBaseAddr, + .ldsSize = mtd_driver->ldsSize, + .pdsSize = mtd_driver->pdsSize, + .sgprUsage = mtd_driver->sgprUsage, + .vgprUsage = mtd_driver->vgprUsage, + .pdsBaseAddr = mtd_driver->pdsBaseAddr, + .num_buffer = 0, + .buffer_base = nullptr, + .buffer_size = nullptr, + .buffer_allocsize = nullptr, + }; + logger->debug( 
+ "kernel metadata: kernel_id={}, kernel_size=[{}, {}, {}], wf_size={}, wg_size={}, " + "metaDataBaseAddr={:x}, ldsSize={}, pdsSize={}, sgprUsage={}, vgprUsage={}, " + "pdsBaseAddr={:x}", + mtd_driver->kernel_id, mtd_driver->kernel_size[0], mtd_driver->kernel_size[1], + mtd_driver->kernel_size[2], mtd_driver->wf_size, mtd_driver->wg_size, + mtd_driver->metaDataBaseAddr, mtd_driver->ldsSize, mtd_driver->pdsSize, + mtd_driver->sgprUsage, mtd_driver->vgprUsage, mtd_driver->pdsBaseAddr + ); + fw_vt_start(mtd_raw, taskID); // 先初始化 spike,再运行 sim-verilator + ventus_rtlsim_add_kernel(device, &mtd_sim, nullptr); + return 0; +} + +extern int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + uint64_t timeout_ns = timeout * 1000000; + while (!ventus_rtlsim_is_idle(device) && ventus_rtlsim_get_time(device) < timeout_ns) { + ventus_rtlsim_step(device); + } + for (int i = 0; i < 5000; i++) { + // TODO: RTL does not provide a way to check if L2 cache flush is done + ventus_rtlsim_step(device); + } + // TODO: temp + // it seems that vt_dev_close() is not called by POCL + // we call it here to make waveform output sucessful + // vt_dev_close(hdevice); + return 0; +} + +extern int vt_finish_all_kernel(vt_device_h hdevice, std::queue *finished_kernel_list) { + // TODO: what is this function for? what is finished_kernel_list? 
+ return -1; +} + +extern int vt_upload_kernel_file(vt_device_h hdevice, const char *filename, int taskID) { + if (hdevice == nullptr) return -1; + fw_vt_upload_kernel_file(filename, taskID); + auto device = (ventus_rtlsim_t *)hdevice; + // uint64_t ptroot = ptroots[taskID]; + + // parse ELF file, find .text and other data sections + const auto blocks = get_data_from_elf(filename, logger); + if (blocks.empty()) { + return -1; // at least .text section is needed + } + + // alloc and load/zero-fill each block + for (auto block = blocks.begin(); block != blocks.end(); block++) { + uint64_t vaddr = block->vaddr; + uint64_t size = block->memsz; + logger->debug("vt_upload_kernel_file: addr={:x}, size={}", vaddr, size); + ventus_rtlsim_pmemcpy_h2d(device, vaddr, block->data.data(), block->data.size()); + std::vector zeros(size - block->data.size(), 0); + ventus_rtlsim_pmemcpy_h2d(device, vaddr + block->data.size(), zeros.data(), zeros.size()); + } + ventus_rtlsim_icache_invalidate(device); + return 0; +} +int vt_upload_kernel_bytes(vt_device_h device, const void *content, uint64_t size, int taskID) { + return 0; +} +int vt_dump_perf(vt_device_h device, FILE *stream) { return 0; } diff --git a/driver/gvm_device/ventus_gvm.h b/driver/gvm_device/ventus_gvm.h new file mode 100644 index 0000000..395fc87 --- /dev/null +++ b/driver/gvm_device/ventus_gvm.h @@ -0,0 +1,184 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#if __GNUC__ >= 4 +#define DLL_PUBLIC __attribute__((visibility("default"))) +#define DLL_LOCAL __attribute__((visibility("hidden"))) +#else +#define DLL_PUBLIC +#define DLL_LOCAL +#endif + +#include +#include + +typedef struct ventus_rtlsim_t ventus_rtlsim_t; +typedef uint64_t paddr_t; + +typedef struct ventus_kernel_metadata_t { // 这个metadata是供驱动使用的,而不是给硬件的 + // Additional data + const char* name; // kernel name + void* data; // use this as you like, such as callback function argument + + // Raw metadata + uint64_t startaddr; + uint64_t kernel_id; 
// Is this useful??? Maybe this should be moved to additional data + uint64_t kernel_size[3]; // 每个kernel的workgroup三维数目 + uint64_t wf_size; // 每个warp的thread数目 + uint64_t wg_size; // 每个workgroup的warp数目 + uint64_t metaDataBaseAddr; // CSR_KNL的值, + uint64_t ldsSize; // 每个workgroup使用的local memory的大小 + uint64_t pdsSize; // 每个thread用到的private memory大小 + uint64_t sgprUsage; // 每个wavefront(warp)使用的标量寄存器数目 + uint64_t vgprUsage; // 每个wavefront(warp)(also thread)使用的向量寄存器数目 + uint64_t pdsBaseAddr; // private memory的基址,要转成每个workgroup的基地址, wf_size*wg_size*pdsSize + uint64_t num_buffer; // buffer的数目,包括pc + uint64_t* buffer_base; // 各buffer的基址。第一块buffer是给硬件用的metadata + uint64_t* buffer_size; // 各buffer的size,以Bytes为单位。实际使用的大小,用于初始化.data + uint64_t* buffer_allocsize; // 各buffer的size,以Bytes为单位。分配的大小 +} ventus_kernel_metadata_t; + +typedef struct { + uint64_t sim_time_max; // 最大仿真时间限制 + struct { // These log sinks can be enabled simultaneously + struct { // Write log to a file (append to its tail) + bool enable; + const char* level; // "trace", "debug", "info", "warn", "error", "critical" + const char* filename; + } file; + struct { // console log + bool enable; + const char* level; + } console; + const char* level; + } log; + struct { + uint64_t pagesize; // 物理内存页大小 + uint64_t auto_alloc; // 若访存到未分配的物理页,自动分配(如此则与实际硬件内存行为相同) + // 注意,自动分配的物理内存是不会释放的,除非整个仿真结束 + } pmem; + struct { // 波形输出功能,这里只设置正常仿真流程,对仿真快照回溯后的波形输出无影响 + bool enable; // 是否启用?仿真快照回溯后将自动启用 + uint64_t time_begin; // 输出波形的起始时刻 + uint64_t time_end; // 输出波形的结束时刻,end > begin才有波形输出 + int levels; // 波形输出的层级 + const char* filename; + } waveform; + struct { // 仿真快照,当仿真出错时可回溯仿真进度到最旧快照,开启波形记录重新仿真 + bool enable; + uint64_t time_interval; // 快照时间间隔 + int num_max; // 最大快照数量,超限时新快照将顶替最旧快照 + const char* filename; // 快照输出的FST波形文件名 + } snapshot; + struct { // verilator运行时命令行参数,以argc,argv形式传入 + int argc; // 注意argc可以为0 + const char** argv; // 共有argc个char*字符串,[0]成员不是程序名,而是首个verilator参数 + } verilator; +} ventus_rtlsim_config_t; + +typedef 
struct { + bool error; // Simulation got fatal error, or RTL $finish() + bool time_exceed; // Simulation time exceeds limit + bool idle; // All given kernels has finished +} ventus_rtlsim_step_result_t; + +// = +// API functions: +// = + +// +// Helper functions +// + +// Give you a recommended default config. +DLL_PUBLIC void ventus_rtlsim_get_default_config(ventus_rtlsim_config_t* config); +// Get current simulation time. +DLL_PUBLIC uint64_t ventus_rtlsim_get_time(const ventus_rtlsim_t* sim); +// Check if the simulated GPU is idle (no kernel is running). +DLL_PUBLIC bool ventus_rtlsim_is_idle(const ventus_rtlsim_t* sim); +// Get RTL parameters (output from *out_value, return 0 on success) +DLL_PUBLIC int ventus_rtlsim_get_parameter(const char* name, uint32_t* out_value); + +// +// Init, calculate, and finish +// + +// Init the simulation. +DLL_PUBLIC ventus_rtlsim_t* ventus_rtlsim_init(const ventus_rtlsim_config_t* config); + +// Finish the simulation. +// If error occurred in the simulation, and snapshot feature enabled, +// it will rollback to the oldest snapshot to find out what happened. +// You can force the rollback by passing `snapshot_rollback_forcing = true` +DLL_PUBLIC void ventus_rtlsim_finish(ventus_rtlsim_t* sim, bool snapshot_rollback_forcing); + +// Calculate 1 unit-time of simulation. +// Return the result of this step: ok, error, time_exceed, or idle. +// If error occurred, calling this function has no effect, you should consider finish the simulation. +DLL_PUBLIC const ventus_rtlsim_step_result_t* ventus_rtlsim_step(ventus_rtlsim_t* sim); + +// Host request GPGPU device to invalidate its Icache +// (for example, after loading new kernel code to device memory) +// This will take effect in the next simulation step() +DLL_PUBLIC void ventus_rtlsim_icache_invalidate(ventus_rtlsim_t* sim); + +// +// Push new kernels to gpu for execution. 
+// + +// After a kernel finishing its execution, the finish_callback will be called, with metadata passed, +// aka. `finish_callback(metadata)` will be called. + +// It's allowed to delay data-loading until the kernel is actually activated on GPU, +// by using data_load_callback +// **Temporary api**, May be removed in the future +DLL_PUBLIC void ventus_rtlsim_add_kernel__delay_data_loading( + ventus_rtlsim_t* sim, const ventus_kernel_metadata_t* metadata, + void (*load_data_callback)(const ventus_kernel_metadata_t*), + void (*finish_callback)(const ventus_kernel_metadata_t*) +); + +// It's recommended to use this ↓. Remember to load data to GPU before calling this. +DLL_PUBLIC void ventus_rtlsim_add_kernel( + ventus_rtlsim_t* sim, const ventus_kernel_metadata_t* metadata, + void (*finish_callback)(const ventus_kernel_metadata_t*) +); + +// +// Physical memory interface +// + +// Physical page alloc & free +// These functions are not needed by actual hardware memory, only for reducing simulation memory usage. +// If config.pmem.auto_alloc is set, you don't need to call these functions. 
+DLL_PUBLIC bool ventus_rtlsim_pmem_page_alloc(ventus_rtlsim_t* sim, paddr_t base); +DLL_PUBLIC bool ventus_rtlsim_pmem_page_free(ventus_rtlsim_t* sim, paddr_t base); + +// Physical memory read & write +// copy data from host to device +DLL_PUBLIC bool ventus_rtlsim_pmemcpy_h2d(ventus_rtlsim_t* sim, paddr_t dst, const void* src, uint64_t size); +// copy data from device to host +DLL_PUBLIC bool ventus_rtlsim_pmemcpy_d2h(ventus_rtlsim_t* sim, void* dst, paddr_t src, uint64_t size); + +// GVM API begin + +DLL_PUBLIC int fw_vt_dev_open(); +DLL_PUBLIC int fw_vt_dev_close(); +DLL_PUBLIC int fw_vt_buf_alloc(uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID); +DLL_PUBLIC int fw_vt_buf_free(uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID); +DLL_PUBLIC int fw_vt_one_buf_free(uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID); +DLL_PUBLIC int fw_vt_copy_to_dev(uint64_t dev_vaddr,const void *src_addr, uint64_t size, uint64_t taskID, uint64_t kernelID); +DLL_PUBLIC int fw_vt_start(void* metaData, uint64_t taskID); +DLL_PUBLIC int fw_vt_upload_kernel_file(const char* filename, int taskID); + +// GVM API end + +#undef DLL_PUBLIC +#undef DLL_LOCAL + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/driver/rtlsim_device/CMakeLists.txt b/driver/rtlsim_device/CMakeLists.txt new file mode 100644 index 0000000..0e2eaf4 --- /dev/null +++ b/driver/rtlsim_device/CMakeLists.txt @@ -0,0 +1,28 @@ +set(PROJECT rtlsim_driver) +project(${PROJECT}) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../common) +include_directories(${VENTUS_INSTALL_PREFIX}/include) + +file(GLOB_RECURSE SRCS ./ventus.cpp) + +add_library(${PROJECT} SHARED ${SRCS}) +target_link_directories(${PROJECT} PRIVATE ${VENTUS_INSTALL_PREFIX}/lib) +target_link_libraries(${PROJECT} PRIVATE VentusRTL) +target_link_libraries(${PROJECT} PRIVATE driver_common_utils) 
+target_link_libraries(${PROJECT} PRIVATE spdlog) +target_link_libraries(${PROJECT} PRIVATE fmt) +target_link_libraries(${PROJECT} PRIVATE SV) +target_compile_definitions(${PROJECT} PRIVATE -DSPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE) + +set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}") +set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT} PROPERTIES + BUILD_RPATH "${VENTUS_INSTALL_PREFIX}/lib" + INSTALL_RPATH "$ORIGIN" +) + +install(TARGETS ${PROJECT} + LIBRARY DESTINATION lib COMPONENT "lib" +) diff --git a/driver/rtlsim_device/ventus.cpp b/driver/rtlsim_device/ventus.cpp new file mode 100644 index 0000000..ce3c1df --- /dev/null +++ b/driver/rtlsim_device/ventus.cpp @@ -0,0 +1,309 @@ +/** + * @file ventus.cpp + * @brief 设备和OpenCL程序的交互功能的实现 + * + * 1. `/include/ventus.h`中声明的函数 + */ + +#include "ventus.h" +#include "buddy.hpp" +#include "loadelf.hpp" +#include "utils.hpp" +#include "ventus_rtlsim.h" +#include +#include +#include +#include +#include +#include +#include + +typedef struct driver_metadata_t { + uint64_t kernel_id; + uint64_t kernel_size[3]; ///> 每个kernel的workgroup三维数目 + uint64_t wf_size; ///> 每个warp的thread数目 + uint64_t wg_size; ///> 每个workgroup的warp数目 + uint64_t metaDataBaseAddr; ///> CSR_KNL的值, + uint64_t ldsSize; ///> 每个workgroup使用的local memory的大小 + uint64_t pdsSize; ///> 每个thread用到的private memory大小 + uint64_t sgprUsage; ///> 每个workgroup使用的标量寄存器数目 + uint64_t vgprUsage; ///> 每个thread使用的向量寄存器数目 + uint64_t pdsBaseAddr; ///> private memory的基址,要转成每个workgroup的基地址, + /// wf_size*wg_size*pdsSize + const char *kernel_name; +} driver_metadata_t; + +// static std::map ptroots; // pagetable root physical address +static std::shared_ptr logger; +BuddyAllocator<4096> buddy_allocator((0xFFFFFFFF - 0x90000000 + 1) / 4096, 16); +constexpr paddr_t BUDDY_BASE = 0x90000000 - 4096; + +static constexpr unsigned log2Ceil(unsigned n) { + if (n <= 1) return 0; + return 32 - __builtin_clz(n - 1); +} + +/// open 
the device and connect to it +extern int vt_dev_open(vt_device_h *hdevice) { + if (hdevice == nullptr) return -1; + + auto env_waveform = std::getenv("VENTUS_WAVEFORM"); + auto env_waveform_begin = std::getenv("VENTUS_WAVEFORM_BEGIN"); + auto env_waveform_end = std::getenv("VENTUS_WAVEFORM_END"); + bool waveform_enable = false; + uint64_t waveform_begin = UINT64_MAX; // default: not enable + uint64_t waveform_end = 0; + if (parse_bool(env_waveform).value_or(false)) { + waveform_begin = 0; // default: dump waveform all time + waveform_end = UINT64_MAX; + } + waveform_begin = parse_u64(env_waveform_begin).value_or(waveform_begin); + waveform_end = parse_u64(env_waveform_end).value_or(waveform_end); + waveform_enable = waveform_end > waveform_begin; + + ventus_rtlsim_config_t config; + ventus_rtlsim_get_default_config(&config); + config.sim_time_max = ~0ull; + config.pmem.auto_alloc = true; + config.waveform.enable = waveform_enable; + config.waveform.time_begin = waveform_begin; + config.waveform.time_end = waveform_end; + config.waveform.filename = "waveform.rtl.fst"; + config.snapshot.enable = false; + config.log.console.enable = true; + config.log.console.level = "trace"; + config.log.file.enable = false; + auto device = ventus_rtlsim_init(&config); + *hdevice = device; + logger = spdlog::stdout_color_mt("ventus"); + logger->set_level(spdlog::level::trace); + logger->set_pattern("[%l] %v [%s:%#]"); + SPDLOG_LOGGER_DEBUG(logger, "vt_dev_open : hello world from ventus.cpp (rtlsim device)"); + return 0; +} + +/// Close the device when all the operations are done +extern int vt_dev_close(vt_device_h hdevice) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + ventus_rtlsim_finish(device, false); + SPDLOG_LOGGER_DEBUG(logger, "vt_dev_close : goodbye from ventus.cpp (rtlsim device)"); + return 0; +} +int vt_dev_caps(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value) { + if (value == nullptr) return -1; +#define GET_PARAM(key) \ + do { \ 
+ uint32_t val; \ + if (ventus_rtlsim_get_parameter(key, &val) == 0) { \ + *value = val; \ + return 0; \ + } else { \ + SPDLOG_LOGGER_ERROR(logger, "vt_dev_caps: get parameter {} failed", key); \ + return -1; \ + } \ + } while (0) + switch (caps_id) { + case VT_CAPS_MAX_CORES: + GET_PARAM("num_sm"); + case VT_CAPS_MAX_WARPS: + GET_PARAM("num_warp"); + case VT_CAPS_MAX_THREADS: + GET_PARAM("num_thread"); + case VT_CAPS_LOCAL_MEM_SIZE: + GET_PARAM("sharemem_size"); + default: + SPDLOG_LOGGER_ERROR( + logger, "vt_dev_caps: unknown caps_id {} (or not implemented)", caps_id + ); + return -1; + } + return -1; +} + +extern int vt_buf_alloc( + vt_device_h hdevice, const uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, + uint64_t kernelID +) { + // TODO: RTLSIM does not support Virtual Memory yet + if (size <= 0 || hdevice == nullptr) return -1; + // auto device = static_cast(hdevice); + size_t pgcnt = (size + 4095) / 4096; + paddr_t addr_allocated = buddy_allocator.allocate(log2Ceil(pgcnt)) + BUDDY_BASE; + if (addr_allocated == BUDDY_BASE) { + SPDLOG_LOGGER_ERROR(logger, "vt_buf_alloc: buddy allocator failed, size=0x{:x}", size); + return -1; + } + SPDLOG_LOGGER_DEBUG( + logger, + "vt_buf_alloc: vaddr_recommand=0x{:x}, vaddr_allocated=0x{:x}, size=0x{:x}, taskID={}", + *vaddr, addr_allocated, size, taskID + ); + *vaddr = addr_allocated; // This is paddr actually + if (*vaddr == 0) return -1; + return 0; +} + +extern int vt_buf_free( + vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + // auto device = static_cast(hdevice); + assert(*vaddr % 4096 == 0); + size_t pgcnt = (size + 4095) / 4096; + // Not really freed in hardware, just in buddy allocator + buddy_allocator.free(*vaddr - BUDDY_BASE, log2Ceil(pgcnt)); + SPDLOG_LOGGER_INFO(logger, "vt_buf_free: vaddr=0x{:x}, size=0x{:x}", *vaddr, size); + return 0; +} + +extern int vt_one_buf_free( + vt_device_h hdevice, uint64_t size, 
uint64_t *vaddr, uint64_t taskID, uint64_t kernelID +) { + return vt_buf_free(hdevice, size, vaddr, taskID, kernelID); +} + +/** + * @brief 为设备分配内存,返回根页表的地址 + * @param hdevice + * @param size + * @return int + */ +extern int vt_root_mem_alloc(vt_device_h hdevice, int taskID) { + // if (hdevice == nullptr) return -1; + // auto device = static_cast(hdevice); + // auto ptroot = ventus_rtlsim_vmem_create(device); + // if (ptroot == 0) return -1; + // logger->debug("vt_root_mem_alloc: taskID={}, ptroot={:x}", taskID, ptroot); + // ptroots[taskID] = ptroot; + if (taskID == 0) { + SPDLOG_LOGGER_ERROR(logger, "RTLSIM_device does not support VMEM yet, taskID must be 0"); + } + return 0; +} + +/** + * 释放taskID(对应context)的根页表 + * @param hdevice + * @param taskID + * @return + */ +extern int vt_root_mem_free(vt_device_h hdevice, int taskID) { + // if (hdevice == nullptr) return -1; + // auto device = static_cast(hdevice); + // ventus_rtlsim_vmem_destroy(device, ptroots[taskID]); + // ptroots.erase(taskID); + return 0; +} + +extern int vt_copy_to_dev( + vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + SPDLOG_LOGGER_DEBUG( + logger, "vt_copy_to_dev: dev_addr=0x{:x}, size=0x{:x}, taskID={}, kernelID={}", dev_vaddr, + size, taskID, kernelID + ); + ventus_rtlsim_pmemcpy_h2d(device, dev_vaddr, src_addr, size); + return 0; +} + +extern int vt_copy_from_dev( + vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, + uint64_t kernelID +) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + SPDLOG_LOGGER_DEBUG( + logger, "vt_copy_from_dev: dev_addr=0x{:x}, size=0x{:x}, taskID={}, kernelID={}", dev_vaddr, + size, taskID, kernelID + ); + ventus_rtlsim_pmemcpy_d2h(device, dst_addr, dev_vaddr, size); + return 0; +} + +extern int vt_start(vt_device_h hdevice, void *mtd_raw, uint64_t 
taskID) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + auto mtd_driver = static_cast(mtd_raw); + ventus_kernel_metadata_t mtd_sim{ + .name = mtd_driver->kernel_name, + .data = nullptr, + .startaddr = 0x80000000, + .kernel_id = mtd_driver->kernel_id, + .kernel_size = + {mtd_driver->kernel_size[0], mtd_driver->kernel_size[1], mtd_driver->kernel_size[2]}, + .wf_size = mtd_driver->wf_size, + .wg_size = mtd_driver->wg_size, + .metaDataBaseAddr = mtd_driver->metaDataBaseAddr, + .ldsSize = mtd_driver->ldsSize, + .pdsSize = mtd_driver->pdsSize, + .sgprUsage = mtd_driver->sgprUsage, + .vgprUsage = mtd_driver->vgprUsage, + .pdsBaseAddr = mtd_driver->pdsBaseAddr, + .num_buffer = 0, + }; + SPDLOG_LOGGER_DEBUG( + logger, + "kernel metadata: kernel_id={}, kernel_size=[{}, {}, {}], wf_size={}, wg_size={}, " + "metaDataBaseAddr=0x{:x}, ldsSize=0x{:x}, pdsSize=0x{:x}, sgprUsage={}, " + "vgprUsage={}, pdsBaseAddr=0x{:x}", + mtd_driver->kernel_id, mtd_driver->kernel_size[0], mtd_driver->kernel_size[1], + mtd_driver->kernel_size[2], mtd_driver->wf_size, mtd_driver->wg_size, + mtd_driver->metaDataBaseAddr, mtd_driver->ldsSize, mtd_driver->pdsSize, + mtd_driver->sgprUsage, mtd_driver->vgprUsage, mtd_driver->pdsBaseAddr + ); + ventus_rtlsim_add_kernel(device, &mtd_sim, nullptr); + return 0; +} + +extern int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) { + if (hdevice == nullptr) return -1; + auto device = static_cast(hdevice); + uint64_t timeout_ns = timeout * 1000000; + while (!ventus_rtlsim_is_idle(device) && ventus_rtlsim_get_time(device) < timeout_ns) { + ventus_rtlsim_step(device); + } + for (int i = 0; i < 5000; i++) { + // TODO: RTL does not provide a way to check if L2 cache flush is done + ventus_rtlsim_step(device); + } + return 0; +} + +extern int vt_finish_all_kernel(vt_device_h hdevice, std::queue *finished_kernel_list) { + // TODO: what is this function for? what is finished_kernel_list? 
+ return -1; +} + +extern int vt_upload_kernel_file(vt_device_h hdevice, const char *filename, int taskID) { + if (hdevice == nullptr) return -1; + auto device = (ventus_rtlsim_t *)hdevice; + // uint64_t ptroot = ptroots[taskID]; + + // parse ELF file, find .text and other data sections + const auto blocks = get_data_from_elf(filename, logger); + if (blocks.empty()) { + return -1; // at least .text section is needed + } + + // alloc and load/zero-fill each block + for (auto block = blocks.begin(); block != blocks.end(); block++) { + uint64_t vaddr = block->vaddr; + uint64_t size = block->memsz; + SPDLOG_LOGGER_DEBUG( + logger, "vt_upload_kernel_file {}: vaddr=0x{:x}, size=0x{:x}", filename, vaddr, size + ); + ventus_rtlsim_pmemcpy_h2d(device, vaddr, block->data.data(), block->data.size()); + std::vector zeros(size - block->data.size(), 0); + ventus_rtlsim_pmemcpy_h2d(device, vaddr + block->data.size(), zeros.data(), zeros.size()); + } + ventus_rtlsim_icache_invalidate(device); + return 0; +} +int vt_upload_kernel_bytes(vt_device_h device, const void *content, uint64_t size, int taskID) { + return 0; +} +int vt_dump_perf(vt_device_h device, FILE *stream) { return 0; } diff --git a/driver/spike_device/CMakeLists.txt b/driver/spike_device/CMakeLists.txt index 63fe0f3..e3a2e44 100644 --- a/driver/spike_device/CMakeLists.txt +++ b/driver/spike_device/CMakeLists.txt @@ -1,34 +1,30 @@ set(PROJECT spike_driver) project(${PROJECT}) - -message("val of env is:$ENV{SPIKE_SRC_DIR}/spike_main") set(CMAKE_CXX_FLAGS -lstdc++) -include_directories(${CMAKE_SOURCE_DIR}/../../include) -include_directories(${CMAKE_SOURCE_DIR}/../../common) -include_directories($ENV{SPIKE_SRC_DIR}) -include_directories($ENV{SPIKE_SRC_DIR}/spike_main) -include_directories($ENV{SPIKE_SRC_DIR}/riscv) -include_directories($ENV{SPIKE_SRC_DIR}/build) -include_directories($ENV{SPIKE_SRC_DIR}/softfloat) -include_directories($ENV{SPIKE_SRC_DIR}/fesvr) - -#set(CMAKE_POSITION_INDEPENDENT_CODE True) 
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../common) +include_directories(${SPIKE_SRC_DIR}) +include_directories(${SPIKE_SRC_DIR}/spike_main) +include_directories(${SPIKE_SRC_DIR}/riscv) +include_directories(${SPIKE_SRC_DIR}/build) +include_directories(${SPIKE_SRC_DIR}/softfloat) +include_directories(${SPIKE_SRC_DIR}/fesvr) file(GLOB_RECURSE SRCS ./ventus.cpp) - -#message($ENV{SPIKE_TARGET_DIR}) - add_library(${PROJECT} SHARED ${SRCS}) -target_link_directories(${PROJECT} PUBLIC $ENV{SPIKE_TARGET_DIR}/lib) +target_link_directories(${PROJECT} PUBLIC ${VENTUS_INSTALL_PREFIX}/lib) target_link_libraries(${PROJECT} PUBLIC spike_main) set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}") set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1) - -if(ENABLE_INSTALL) - message(STATUS "DRIVER_LIB_INSTALL_DIR:" ${DRIVER_LIB_INSTALL_DIR}) - install(TARGETS ${PROJECT} LIBRARY DESTINATION ${DRIVER_LIB_INSTALL_DIR} COMPONENT "lib") - endif() \ No newline at end of file +set_target_properties(${PROJECT} PROPERTIES + BUILD_RPATH "${VENTUS_INSTALL_PREFIX}/lib" + INSTALL_RPATH "$ORIGIN" +) + +install(TARGETS ${PROJECT} + LIBRARY DESTINATION lib COMPONENT "lib" +) diff --git a/driver/spike_device/ventus.cpp b/driver/spike_device/ventus.cpp index 4f046ce..a0af49d 100644 --- a/driver/spike_device/ventus.cpp +++ b/driver/spike_device/ventus.cpp @@ -9,11 +9,6 @@ #include #include #include -#include -#include -#include -#include -// driver/page_table #include "ventus.h" #include "spike_main.h" @@ -41,6 +36,12 @@ extern int vt_dev_caps(vt_device_h* hdevice, host_port_t* input_sig){ // //set spike_device id to 1 return 0; } + +int vt_dev_caps(vt_device_h* hdevice, uint64_t caps_id, uint64_t *value) { + // TODO: Not implemented yet + return -1; +} + extern int vt_buf_alloc(vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID) { if(size <= 0 || 
hdevice == nullptr) return -1; @@ -211,3 +212,6 @@ extern int vt_upload_kernel_file(vt_device_h device, const char* filename, int t return err; } */ + +int vt_upload_kernel_bytes(vt_device_h device, const void* content, uint64_t size, int taskID) { return 0; } +int vt_dump_perf(vt_device_h device, FILE* stream) { return 0; } diff --git a/driver/verilating_device/CMakeLists.txt b/driver/verilating_device/CMakeLists.txt deleted file mode 100644 index 357e34e..0000000 --- a/driver/verilating_device/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -set(PROJECT rtldriver) -project(${PROJECT}) - -set(CMAKE_CXX_STANDARD 11) - -include_directories(${CMAKE_SOURCE_DIR}/include) -include_directories(${CMAKE_SOURCE_DIR}/common) -include_directories(${CMAKE_SOURCE_DIR}/devices/verilating_device) -include_directories(${CMAKE_SOURCE_DIR}/devices/verilating_device/page_table) - -#set(CMAKE_POSITION_INDEPENDENT_CODE True) - -file(GLOB_RECURSE SRCS "./*.cpp" "./*.h") - - -link_directories(${CMAKE_BINARY_DIR}/devices/verilating_device) - -add_library(${PROJECT} SHARED ${SRCS}) -add_dependencies(${PROJECT} rtlsim) - -target_link_libraries(${PROJECT} PUBLIC rtlsim) - - -set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}") -set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1) - -if(ENABLE_INSTALL) - install(TARGETS ${PROJECT} LIBRARY DESTINATION ${DRIVER_LIB_INSTALL_DIR} COMPONENT "lib") -endif() \ No newline at end of file diff --git a/driver/verilating_device/README.md b/driver/verilating_device/README.md deleted file mode 100644 index e5673e9..0000000 --- a/driver/verilating_device/README.md +++ /dev/null @@ -1,7 +0,0 @@ -这个目录为GPGPU的源代码转为verilog后,执行OpenCL程序时,相应的driver需要实现的功能,包括: - -1. `/include/ventus.h`中声明的函数 - -2. `vt_device`类,表示GPGPU设备,成员变量包括设备类(不包含ram),设备ram类 -3. `vt_buffer`类,主机和设备之间交换数据的缓冲区,成员变量包括`vt_device`,数据,缓冲区大小 -4. 
其他 TBD \ No newline at end of file diff --git a/driver/verilating_device/ventus.cpp b/driver/verilating_device/ventus.cpp deleted file mode 100644 index 50d6df8..0000000 --- a/driver/verilating_device/ventus.cpp +++ /dev/null @@ -1,269 +0,0 @@ -/** - * @file ventus.cpp - * @brief 设备和OpenCL程序的交互功能的实现 - * - * 1. `/include/ventu.h`中声明的函数 - * 2. `vt_device`类,表示GPGPU设备,成员变量包括设备类(不包含ram),设备ram类 - * 3. `vt_buffer`类,主机和设备之间交换数据的缓冲,成员变量包括`vt_device`,数据,缓冲区大小 - - * @author yangzexia (yang-zx17\@qq.com) - * @version 1.0 - * @date 2022-11-16 - * - * @copyright Copyright (c) 2022 yangzexia@THU-DSPLAB - * - * @par 修改日志: - * - *
Date Version Author Description - *
2022-11-16 1.0 yangzexia 创建 - *
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -// driver/page_table -#include "ventus.h" - -#include "vt_device.h" - -///@todo MMU,内存分配 -// #include -#include "vt_utils.h" -#include "vt_config.h" - -// devices/page_table -#include "verilating_device/page_table/vt_memory.h" -#include "verilating_device/page_table/MemConfig.h" -// #include -#include "processor.h" - -#define RAM_PAGE_SIZE 4096 - -using namespace ventus; - - -/// open the device and connect to it -extern int vt_dev_open(vt_device_h* hdevice){ - if(hdevice == nullptr) - return -1; - PCOUT_INFO << "vt_dev_open : hello world from ventus.cpp" << endl; - *hdevice = new vt_device(); - vt_root_mem_alloc(*hdevice, 0); - return 0; -} -/// Close the device when all the operations are done -extern int vt_dev_close(vt_device_h hdevice){ - if(hdevice == nullptr) - return -1; - auto* device = (vt_device*) hdevice; - delete device; - return 0; -} -extern int vt_dev_caps(vt_device_h* hdevice, uint64_t caps_id, uint64_t *value){ - if(hdevice == nullptr) - return -1; - switch (caps_id) { - case VT_CAPS_VERSION: - *value = IMPLEMENTATION_ID; - break; - case VT_CAPS_MAX_CORES: - *value = NUM_CTA; - break; - case VT_CAPS_MAX_WARPS: - *value = NUM_WARP; - break; - case VT_CAPS_MAX_THREADS: - *value = NUM_THREAD; - break; - default: - std::cout << "invalid caps id: " << caps_id << std::endl; - std::abort(); - return -1; - } - - return 0; -} -extern int vt_buf_alloc(vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID) { - if(size <= 0 || hdevice == nullptr) - return -1; - auto device = ((vt_device*) hdevice); - return device->alloc_local_mem( size, vaddr, BUF_TYPE, taskID, kernelID); - -} -extern int vt_buf_free(vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID) { - if(size <= 0 || hdevice == nullptr) - return -1; - auto device = ((vt_device*) hdevice); - - return 
device->free_local_mem( size, vaddr, taskID, kernelID); - -} - -/** - * @brief 为设备分配内存,返回根页表的地址 - * @param hdevice - * @param size - * @param dev_vaddr 申请物理地址时的虚拟地址 - * @return int - */ -extern int vt_root_mem_alloc(vt_device_h hdevice, int taskID) { - if( hdevice == nullptr) - return -1; - vt_device* device = (vt_device*) hdevice; - return device->create_device_mem(taskID); -} - -/** - * 释放taskID(对应context)的根页表 - * @param hdevice - * @param taskID - * @return - */ -extern int vt_root_mem_free(vt_device_h hdevice, int taskID) { - if(hdevice == nullptr) - return -1; - auto device = (vt_device*) hdevice; - return device->delete_device_mem(taskID); -} - -//extern int vt_create_kernel(vt_device_h hdevice, int taskID, int kernelID) { -// if(hdevice == nullptr) -// return -1; -// auto device = (vt_device*) hdevice; -// return device->push_kernel(taskID, kernelID); -//} - -extern int vt_copy_to_dev(vt_device_h hdevice, uint64_t dev_vaddr,const void *src_addr, uint64_t size, uint64_t taskID, uint64_t kernelID) { - if(size <= 0) - return -1; - auto device = (vt_device*) hdevice; - return device->upload(dev_vaddr, src_addr, size, taskID, kernelID); -} - -extern int vt_copy_from_dev(vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, uint64_t kernelID) { - if(size <= 0) - return -1; - auto device = (vt_device*) hdevice; - return device->download(dev_vaddr, dst_addr, size, taskID, kernelID); -} - -extern int vt_start(vt_device_h hdevice, void* metaData, uint64_t taskID) { - if(hdevice == nullptr) - return -1; - auto device = (vt_device *) hdevice; - device->start(taskID, metaData); - return 0; -} -extern int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) { - if(hdevice == nullptr) - return -1; - auto* device = (vt_device*) hdevice; -#ifdef DEBUG_VIRTUAL_ADDR - device->execute_all_kernel(); - return 0; -#endif - return device->wait(timeout); - -} - -extern int vt_finish_all_kernel(vt_device_h hdevice, queue *finished_kernel_list) { - 
if(hdevice == nullptr) - return -1; - auto device = (vt_device*) hdevice; - *finished_kernel_list = device->execute_all_kernel(); - return 0; -} - -extern int vt_upload_kernel_bytes(vt_device_h device, const void* content, uint64_t size, int taskID) { - int err = 0; - - if (NULL == content || 0 == size) - return -1; - - uint32_t buffer_transfer_size = 65536; ///< 64 KB - uint64_t kernel_base_addr = BUF_PARA_BASE; - - // allocate device buffer - uint64_t dev_mem_addr; - - uint64_t offset = 0; - // 确定字符串可以被4整除 - - int numValues = size / 8; // 每个uint32_t值占据8个字符 - - // 创建uint32_t数组 - uint32_t values[numValues]; - - // 将字符串转换为uint32_t数组 - for (int i = 0; i < numValues; i++) { - std::string substring = (*(string*)content).substr(i * 8, 8); // 每次提取8个字符 - unsigned int value = std::stoul(substring, nullptr, 16); // 转换为无符号整数 - std::memcpy(values + i, &value, sizeof(uint32_t)); // 复制到数组中 - } - void * const buffer = malloc(buffer_transfer_size); - while (offset < size) { - auto chunk_size = std::min(buffer_transfer_size, size - offset); - std::memcpy(buffer, values + offset, chunk_size); - - err = vt_buf_alloc(device, buffer_transfer_size, &dev_mem_addr, KERNEL_MEM, taskID, 0); - if (err != 0) - return -1; - - printf("*** Upload Kernel to 0x%0x: data=", kernel_base_addr + offset); - for (int i = 0; i < chunk_size; ++i) { - printf("%08x", ((values + offset))[i]); - } - printf("\n"); - - - err = vt_copy_to_dev(device, dev_mem_addr, buffer, chunk_size, taskID, 0); - if (err != 0) { -// vt_buf_free(device, buffer_transfer_size, &dev_mem_addr, taskID, 0); - return err; - } - offset += chunk_size; - } - free(buffer); - return 0; -} - -extern int vt_upload_kernel_file(vt_device_h device, const char* filename, int taskID) { - -// return 0; - - const char *pos = std::strchr(filename, '.'); - char newname[100]; - if (pos != nullptr) { - std::strncpy(newname, filename, pos - filename); - std::strcat(newname,".vmem"); - std::size_t len = std::strlen(newname); - newname[len] = '\0'; - } - 
std::ifstream ifs(newname, std::ios::binary); - if (!ifs) { - std::cout << "error: " << newname << " not found" << std::endl; - return -1; - } - - // read file content - ifs.seekg(0, ifs.end); - auto size = ifs.tellg(); - std::string content; - content.resize(size); - ifs.seekg(0, ifs.beg); - ifs.read(&content[0], size); - content.erase(std::remove(content.begin(), content.end(), '\n'), content.end()); - // upload - int err = vt_upload_kernel_bytes(device, &content, content.length(), taskID); - - - return err; -} diff --git a/driver/verilating_device/vt_device.cpp b/driver/verilating_device/vt_device.cpp deleted file mode 100644 index 7020bbb..0000000 --- a/driver/verilating_device/vt_device.cpp +++ /dev/null @@ -1,601 +0,0 @@ -#include "vt_device.h" -#include -#include -#include -#include "vt_utils.h" -#include "MemConfig.h" -//#include "processor.h" - -int vt_device::create_device_mem(uint64_t taskID) { - if(contextList_.find(taskID) != contextList_.end()) { - PCOUT_ERROR << "the taskID of " << taskID <<"has been created, check your input!" <second.ram.createRootPageTable(); - it->second.root = ret1; - return ret0 || !ret1; -} - -int vt_device::delete_device_mem(int taskID){ - if(contextList_.find(taskID) != contextList_.end()) { - PCOUT_ERROR << "the taskID of " << taskID <<"has not been created, check your input!" 
<second.ram.allocateMemory(it->second.root, *vaddr, size); - PCOUT_INFO << "allocating memory at vaddr of 0x" <second.ram.allocateMemory(it->second.root, *addr, size); - /// 将ram分配的物理地址和addrManager分配的物理地址关联起来 - addrManager_.attachPaddr(taskID, kernelID, addr, vaddr); - PCOUT_INFO << "allocating memory at vaddr of 0x" <second.ram.releaseMemory(it->second.root, *paddr); - delete paddr; -#ifdef DEBUG_VIRTUAL_ADDR - int ret0 = addrManager_.releaseMemory(taskID, kernelID, vaddr, size); - return ret0 || ret1; -#else - return ret1; -#endif -} - -int vt_device::upload(uint64_t dev_vaddr,const void *src_addr, uint64_t size, uint64_t taskID, uint64_t kernelID){ - if(size <= 0 || src_addr == nullptr || contextList_.find(taskID) == contextList_.end()) - return -1; - auto it = contextList_.find(taskID); -#ifdef DEBUG_VIRTUAL_ADDR - return it->second.ram.writeDataVirtual(it->second.root, dev_vaddr, size, src_addr); -#else - return it->second.ram.writeDataPhysical(dev_vaddr, size, src_addr); -#endif - - -} - -int vt_device::download(uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, uint64_t kernelID){ - if(size <= 0 || dst_addr == nullptr || contextList_.find(taskID) == contextList_.end()) - return -1; - auto it = contextList_.find(taskID); -#ifdef DEBUG_VIRTUAL_ADDR - return it->second.ram.readDataVirtual(it->second.root, dev_vaddr, size, dst_addr); -#else - return it->second.ram.readDataPhysical(dev_vaddr, size, dst_addr); -#endif -} -/** - * @brief 发送任务,每个任务由多个block组成,每次调用start发送一个任务 - * 传入到硬件的wg_id由processor.run()决定,函数执行完成后会返回实际的wg_id, - * 任务队列的数据结构为一个元素为unordered_map的list,list的每个元素代表一个任务, - * 每次发送一个任务会在list中增加一个元素 - * unordered_map的每个key代表block ID,value表示该block是否执行完成。 - * @param input_sig 输入到GPGPU的信号,与硬件接口对应 - * @param num_block 这个任务由多少个block组成 - * @return int 0 - * - * @todo start中调用parse_metaData, 然后push_kernel, 然后为硬件接口赋值,启动GPU - */ -int vt_device::start(int taskID, void* metaData){ - //parse metaData - - host_port_t *devicePort = new host_port_t; - auto 
inputData = (meta_data *)metaData; -#ifdef DEBUG_VIRTUAL_ADDR - uint64_t wgNum = inputData->kernel_size[0] * inputData->kernel_size[1]*inputData->kernel_size[2]; - uint64_t pdsParam = inputData->pdsSize * inputData->wf_size * inputData->wg_size; - devicePort->host_req_num_wf = inputData->wg_size; - devicePort->host_req_wf_size = inputData->wf_size; - devicePort->host_req_kernel_size_3d_0 = inputData->kernel_size[0]; - devicePort->host_req_kernel_size_3d_1 = inputData->kernel_size[1]; - devicePort->host_req_kernel_size_3d_2 = inputData->kernel_size[2]; - devicePort->host_req_vgpr_size_total = inputData->wg_size * inputData->vgprUsage; - devicePort->host_req_sgpr_size_total = inputData->wg_size * inputData->sgprUsage; - devicePort->host_req_gds_size_total = 0; - devicePort->host_req_vgpr_size_per_wf = inputData->vgprUsage; - devicePort->host_req_sgpr_size_per_wf = inputData->sgprUsage; - devicePort->host_req_start_pc = 0x80000000; - devicePort->host_req_pds_baseaddr = inputData->pdsBaseAddr; - devicePort->host_req_csr_knl = inputData->metaDataBaseAddr; - devicePort->host_req_lds_size_total = inputData->ldsSize; - devicePort->host_req_gds_baseaddr = 0; -#endif - - if(contextList_.find(taskID) == contextList_.end()) { - PCOUT_ERROR << "the context of ID "<< taskID << " not exists, check your input!" 
<< endl; - return -1; - } - processor_.attach_ram(&contextList_.find(taskID)->second.ram); - //each function call send one block of a kernel - for (int i = 0; i < wgNum; ++i) { - #ifdef DEBUG_VIRTUAL_ADDR - uint64_t kernelID = inputData->kernel_id; - devicePort->host_req_pds_baseaddr = inputData->pdsBaseAddr + i * pdsParam; - #else - uint64_t kernelID = 0; - #endif - devicePort->host_req_wg_id = (inst_len)((( - kernelID<<(int)ceil(log2(MAX_CONTEXT)) | taskID) - <<((int)ceil(log2(MAX_KERNEL)) | kernelID)) - <<((int)ceil(log2(NUM_SM*MAX_BLOCK_PER_SM)) | i)) - <<((int)ceil(log2(NUM_SM))); - #ifdef DEBUG_VERIFY_HW - devicePort->host_req_wg_id = 0; - devicePort->host_req_num_wf = 2; - devicePort->host_req_wf_size = 0x8; - devicePort->host_req_kernel_size_3d_0 = 0; - devicePort->host_req_kernel_size_3d_1 = 0; - devicePort->host_req_kernel_size_3d_2 = 0; - devicePort->host_req_vgpr_size_total = 0x040; - devicePort->host_req_sgpr_size_total = 0x040; - devicePort->host_req_gds_size_total = 0; - devicePort->host_req_vgpr_size_per_wf = 0x020; - devicePort->host_req_sgpr_size_per_wf = 0x020; - devicePort->host_req_start_pc = 0x80000000; - devicePort->host_req_pds_baseaddr = 0x80001000; - devicePort->host_req_csr_knl = 0x80023000; - devicePort->host_req_lds_size_total = 0x80; - devicePort->host_req_gds_baseaddr = 0x00000000; - #endif - - processor_.run(contextList_.find(taskID)->second.root, devicePort); - //更新contextList_ - mapfiredBlk; - firedBlk.emplace((int)(devicePort->host_req_wg_id), UNFINISH); - contextList_.find(taskID)->second.kernelList.emplace(kernelID, kernel_info(firedBlk, UNFINISH)); - - } - return 0; -} -/** - * @brief 等待一定时间,更新kernel执行完成的信息 - * @param time - * @return int - */ -int vt_device::wait(uint64_t time){ - // 如果所有已经启动的run()任务都完成 - if(!last_task_.valid()); - else { - // 如果没有则等待到所有run()都完成 - uint64_t timeout = time / 1000; - std::chrono::seconds wait_time(1); - for(;;){ - auto status = last_task_.wait_for(wait_time); - if (status == 
std::future_status::ready || timeout-- == 0) - break; - } - } // 如果正在遍历的任务的所有block都完成,则将该任务记录下来并删除, - // 根据GPGPU返回的block完成情况更新任务队列,将已完成的block ID与保存的list中的block ID比较 - - std::queue finished_block = processor_.wait(time); - - while(!finished_block.empty()) { - bool block_legal = true; - //根据硬件返回的已完成blkID,解码出所属的context, kernel和原本的block - uint64_t blkID = (finished_block.front() >> (int)ceil(log2(NUM_SM))) & (1 << (int)ceil(log2(NUM_SM*MAX_BLOCK_PER_SM))); - uint64_t kernelID = (finished_block.front() >> (int)ceil(log2(NUM_SM*MAX_BLOCK_PER_SM*NUM_SM))) & (1 << (int)ceil(log2(MAX_KERNEL))); - uint64_t contextID = (finished_block.front() >> (int)ceil(log2(NUM_SM*MAX_BLOCK_PER_SM*NUM_SM*MAX_KERNEL))) & (1 << (int)ceil(log2(MAX_CONTEXT))); - auto contextItem = contextList_.find(contextID); - //判断contextID是否存在 - if(contextItem == contextList_.end()) - block_legal = false; - else{ - auto it = contextItem->second.kernelList.find(kernelID); - //判断kernelID是否存在 - if(it == contextItem->second.kernelList.end()) { - block_legal = false; - } - else { - //判断blkID是否存在 - if(it->second.blk_list.find(blkID) == it->second.blk_list.end()) - block_legal = false; - else { - //将相应kernel的block设置为已完成 - it->second.blk_list[blkID] = FINISH; - finished_block.pop(); - - //当某一个kernel的block完成之后,判断是否该block的所有kernel都完成, - // 判断该kernel所属的context的所有kernel是否都完成 - bool kernel_all_block_finished = true; - for(auto& it_map : it->second.blk_list) { - if(!it_map.second) { - kernel_all_block_finished = false; - break; - } - } - if(kernel_all_block_finished) { - finished_kernel_l.push(contextID << (int)ceil(log2(MAX_CONTEXT)) | kernelID); - it->second.state = FINISH; - } - } - } - } - if(!block_legal) { - cout << "return Wrong finished block ID, something error" << endl; - return -1; - } - } - return 0; -} -/** - * @brief 返回已经完成的kernel - * @return queue - */ -queue vt_device::get_finished_kernel() { - queue tmp; - while(!finished_kernel_l.empty()) { - tmp.push(finished_kernel_l.front()); - 
finished_kernel_l.pop(); - } - return tmp; -} -/** - * 执行所有context下的所有kernel,并返回已完成kernel ID的队列 - * @return - */ -queue vt_device::execute_all_kernel() { - queue tmp; - int cnt = 0; - while(!all_context_finished()) { - while(!finished_kernel_l.empty()) { - tmp.push(finished_kernel_l.front()); - finished_kernel_l.pop(); - } - wait(RUN_DELAY); - cnt++; - if(cnt > 30) break; - } - return tmp; -} - -/** - * 返回已经完成的contextID,如果没有执行完成,硬件时钟并不会前进 - * @return > contextID的队列 - */ -queue vt_device::get_finished_context() { - queue tmp; - auto it = contextList_.begin(); - while(it != contextList_.end()) { - if(it->second.context_finished()){ ///< 这个context里的所有kernel都执行完成了 - tmp.push(it->second.contextID); - it = contextList_.erase(it); - } - else ++it; - } - return tmp; -} - -bool vt_device::all_context_finished() { - auto it = contextList_.begin(); - while(it != contextList_.end()) { - if(!it->second.context_finished()) - return false; - } - return true; -} - - -addr_manager::~addr_manager() { - for(auto it : contextMemory_) { - addrItem *curItem = it.second; - while(curItem != nullptr) { - auto tmp = curItem; - curItem = curItem->succContextItem; - delete tmp; - } - } -} - - -int addr_manager::allocMemory(uint64_t contextID, uint64_t kernelID, uint64_t *vaddr, uint64_t size, int BUF_TYPE) { - if(size == 0 || vaddr == nullptr) { - PCOUT_ERROR << "vaddr needs to allocate memory is nullptr! error!" << endl; - return -1; - } - - size = aligned_size(size, BLOCK_SIZE); - addrItem* currentItem = nullptr; -// auto curContextIt = contextList_.begin(); -// while(curContextIt != contextList_.end()) { -// if(*curContextIt == contextID) { - if(contextMemory_.find(contextID) == contextMemory_.end()) {/// 检查这个context是否存在 - PCOUT_ERROR << "Context of ID" << contextID <<" has not created, can't allocate memory!" 
<< endl; - return -1; - } - - switch (BUF_TYPE) {/// - case READ_ONLY: - if(size < RWDATA_BASE - RODATA_BASE) { - *vaddr = RODATA_BASE; - break; - } else { - PCOUT_ERROR << "buffer size too large, error!" << endl; - return -1; - } - case READ_WRITE: if(size < RWDATA_BASE - RODATA_BASE) { - *vaddr = RWDATA_BASE; - break; - } else { - PCOUT_ERROR << "buffer size too large, error!" << endl; - return -1; - } - case KERNEL_MEM: if(size < GLOBALMEM_SIZE/2) { - *vaddr = BUF_PARA_BASE; - break; - } else { - PCOUT_ERROR << "buffer size too large, error!" << endl; - return -1; - } - default: break; - } - if(contextMemory_.at(contextID) == nullptr) { - currentItem = new addrItem(kernelID, contextID, *vaddr, size); - contextMemory_.at(contextID) = currentItem; - } - else { - currentItem = contextMemory_.at(contextID); - if(!allocVaddr(¤tItem, vaddr, size, BUF_TYPE)) - insertNewItem(currentItem, contextID, kernelID, vaddr, size); - else { - PCOUT_ERROR << "allocating virtual addr failed !" << endl; - } - } - return 0; -} - -int addr_manager::createNewContext(uint64_t contextID) { - - for(auto it : contextMemory_) { - if(it.first == contextID) { - PCOUT_ERROR << "A context of ID" << contextID <<" exists, error!" 
<< endl; - return -1; - } - } -// contextList_.emplace_back(contextID); - auto t = contextMemory_.emplace(contextID, nullptr); - auto p = t.first; - return 0; -} -/// 插入一个地址元素,如果contextMemory_中已经存在读写类型的地址,并且需要插入只读类型的地址,则要插入的地址为开头, -/// 同时修改该context的地址链表的开头元素为要插入的元素 -/// \param currentItem 在该元素的后面插入 -/// \param contextID -/// \param kernelID -/// \param vaddr -/// \param size -void addr_manager::insertNewItem(addrItem *currentItem, uint64_t contextID, uint64_t kernelID, uint64_t *vaddr, - uint64_t size) { - auto tmp = new addrItem(kernelID, contextID, *vaddr, size); - if(tmp->vaddr == RODATA_BASE && currentItem->vaddr == RWDATA_BASE) { - tmp->succContextItem = currentItem; - currentItem->prevContextItem = tmp; - contextMemory_.at(contextID) = tmp; - return; - } - tmp->succContextItem = (currentItem)->succContextItem; - tmp->prevContextItem = currentItem; - if(currentItem->succContextItem != nullptr) - currentItem->succContextItem->prevContextItem = tmp; - (currentItem)->succContextItem = tmp; - -} -/// https://raw.githubusercontent.com/yangzexia/md-image/image/202305171429512.svg -int addr_manager::allocVaddr(addrItem **rootItem, uint64_t *vaddr, uint64_t size, int BUF_TYPE) { - - uint64_t curAddr; - switch (BUF_TYPE) {/// 寻找下一个还没有分配的地址 - case READ_ONLY: - if((*rootItem)->vaddr==RODATA_BASE) { - *vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE); - while ((*rootItem)->vaddr < RWDATA_BASE && (*rootItem)->succContextItem != nullptr) { -// *vaddr = (*rootItem)->vaddr + (*rootItem)->size; - if (*vaddr + size <= (*rootItem)->succContextItem->vaddr) { - break;/// 该地址符合条件,跳出循环 - } - *rootItem = (*rootItem)->succContextItem; - *vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE); - } - - if ((*rootItem)->succContextItem == nullptr || (*rootItem)->succContextItem->vaddr >= RWDATA_BASE) { - if (*vaddr + size <= RWDATA_BASE) - break; - else { - PCOUT_ERROR << "memory needs to allocate of size of 0x" << hex << size << dec - << "failed! 
No enough space!" << endl; - return -1; - } - } - } else { - *vaddr = RODATA_BASE; - } - break; - - - case READ_WRITE: - if((*rootItem)->vaddr == RODATA_BASE) {/// 如果第一个元素是只读类型的地址,则遍历到RW_BASE,如果没有RW的地址,则要分配的地址为RW_BASE - while((*rootItem)->vaddr < RWDATA_BASE ) { - if ((*rootItem)->succContextItem == nullptr) { - *vaddr = RWDATA_BASE; - return 0; - } - *rootItem = (*rootItem)->succContextItem; - } - } - *vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE); - while ((*rootItem)->vaddr < BUF_PARA_BASE && (*rootItem)->succContextItem != nullptr) { -// *vaddr = (*rootItem)->vaddr + (*rootItem)->size; - if (*vaddr + size <= (*rootItem)->succContextItem->vaddr) { - break;/// 该地址符合条件,跳出循环 - } - *rootItem = (*rootItem)->succContextItem; -// if((*rootItem)->succContextItem == nullptr) - *vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE); - } - if ((*rootItem)->succContextItem == nullptr && (*vaddr + size > BUF_PARA_BASE)) { - PCOUT_ERROR << "memory needs to allocate of size of 0x" << hex << size << dec - << "failed! No enough space!" 
<< endl; - return -1; - } - break; - case KERNEL_MEM: - if((*rootItem)->vaddr == RODATA_BASE || (*rootItem)->vaddr == RWDATA_BASE) {/// 如果第一个元素是只读类型的地址,则遍历到RW_BASE,如果没有RW的地址,则要分配的地址为RW_BASE - while((*rootItem)->vaddr < BUF_PARA_BASE ) { - if ((*rootItem)->succContextItem == nullptr) { - *vaddr = BUF_PARA_BASE; - return 0; - } - *rootItem = (*rootItem)->succContextItem; - } - } - *vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE); - while ((*rootItem)->vaddr < BUF_PARA_BASE+GLOBALMEM_SIZE/2 && (*rootItem)->succContextItem != nullptr) { -// *vaddr = (*rootItem)->vaddr + (*rootItem)->size; - if (*vaddr + size <= (*rootItem)->succContextItem->vaddr) { - break;/// 该地址符合条件,跳出循环 - } - *rootItem = (*rootItem)->succContextItem; -// if((*rootItem)->succContextItem == nullptr) - *vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE); - } - if ((*rootItem)->succContextItem == nullptr && (*vaddr + size > BUF_PARA_BASE+GLOBALMEM_SIZE/2)) { - PCOUT_ERROR << "memory needs to allocate of size of 0x" << hex << size << dec - << "failed! No enough space!" << endl; - return -1; - } - break; - } - - return 0; -} - - -int addr_manager::attachPaddr(uint64_t kernelID, uint64_t contextID, uint64_t *vaddr, uint64_t *paddr) { - bool b_contextExist = false; - bool b_vaddrExist = false; -// for(auto it : contextList_) { -// if(it == contextID) { - b_contextExist = true; - auto tmp = contextMemory_.at(contextID); - while(tmp != nullptr) { - if(tmp->vaddr == *vaddr) { - tmp->paddr = *paddr; - break; - } - tmp = tmp->succContextItem; -// } -// break; -// } - } - if(!tmp) { - PCOUT_ERROR << "Attaching paddr created by ram and vaddr created by addrManager_ failed, vaddr not exists!" 
<< endl; - return -1; - } - return 0; -} - -int addr_manager::findVaByPa(uint64_t kernelID, uint64_t contextID, uint64_t *vaddr, uint64_t *paddr) { - if(contextMemory_.find(contextID) == contextMemory_.end()) { - PCOUT_ERROR << "Context of ID" << contextID <<" has not created,check parameters!" << endl; - return -1; - } - auto tmp = contextMemory_.at(contextID); - while(tmp != nullptr) { - if(tmp->paddr == *paddr) { - *vaddr = tmp->vaddr; - break; - } - tmp = tmp->succContextItem; - } - return 0; -} - -int addr_manager::releaseMemory(uint64_t contextID, uint64_t kernelID, uint64_t *vaddr, uint64_t size) { -// auto tmp = new addrItem(kernelID, contextID, *vaddr, size); - bool b_contextExist = false; - bool b_vaddrExist = false; -// for(auto it : contextList_) { -// if(it == contextID) { - b_contextExist = true; - auto tmp = contextMemory_.at(contextID); - while(tmp != nullptr) { - if(tmp->vaddr == *vaddr) { - if(tmp->prevContextItem == nullptr && tmp->succContextItem == nullptr); - else if(tmp->prevContextItem == nullptr) - tmp->succContextItem->prevContextItem = nullptr; - else if(tmp->succContextItem == nullptr) - tmp->prevContextItem->succContextItem = nullptr; - else { - tmp->prevContextItem->succContextItem = tmp->succContextItem; - tmp->succContextItem->prevContextItem = tmp->prevContextItem; - } - delete tmp; - b_vaddrExist = true; - break; - } - tmp = tmp->succContextItem; -// } -// break; -// } - } - if(!b_contextExist) { - PCOUT_ERROR << "context ID of " << contextID << " check your input! " << endl; - return -1; - } - if(!b_vaddrExist) { - PCOUT_ERROR << "invalid vaddr of " << *vaddr << " check your input! 
" << endl; - return -1; - } - return 0; -} - -bool addr_manager::findContextID(uint64_t contextID) { - for(auto it : contextMemory_) { - if(it.first == contextID) - return true; - } - return false; -} - -bool addr_manager::findKernelID(uint64_t kernelID) { - for(auto it : kernelList_) { - if(it == kernelID) - return true; - } - return false; -} - diff --git a/driver/verilating_device/vt_device.h b/driver/verilating_device/vt_device.h deleted file mode 100644 index 45597f6..0000000 --- a/driver/verilating_device/vt_device.h +++ /dev/null @@ -1,221 +0,0 @@ -/** - * @file vt_device.h - * @brief 与驱动提供的API对接的类的声明 - * @author YangZexia (yang-zx17\@qq.com) - * @version 1.2 - * @date 2022-11-24 - * - * @copyright Copyright (c) {2022} DSPLAB@Tsinghua University - * - * @par 修改日志: - * - *
Date Version Author Description - *
2022-11-24 1.0 YangZexia 首次创建 - *
2022-12-14 1.1 - *
2022-11-24 1.2 YangZexia 增加了多任务(多个根页表)机制 - *
- */ -#include "processor.h" -#include "vt_utils.h" -#include "vt_config.h" -#include -#include -#include -#include -#include -#include - -using namespace ventus; -using namespace std; -//These macro is defined as test - -enum _state{UNFINISH, FINISH}; - - -struct meta_data{ - uint64_t kernel_id; - uint64_t kernel_size[3];///< 每个kernel的workgroup三维数目 - uint64_t wf_size; ///< 每个warp的thread数目 - uint64_t wg_size; ///< 每个workgroup的warp数目 - uint64_t metaDataBaseAddr;///< CSR_KNL的值, - uint64_t ldsSize;///< 每个workgroup使用的local memory的大小 - uint64_t pdsSize;///< 每个thread用到的private memory大小 - uint64_t sgprUsage;///< 每个workgroup使用的标量寄存器数目 - uint64_t vgprUsage;///< 每个thread使用的向量寄存器数目 - uint64_t pdsBaseAddr;///< private memory的基址,要转成每个workgroup的基地址, wf_size*wg_size*pdsSize -}; - - -struct kernel_info{ ///< 一个kernel由多个NDrange组成,一个NDrange由多个workgroup组成,每个workgroup在硬件上执行时映射到一个block. - map blk_list; ///< 该kernel总共包含的block,以及每个block的执行状态 - _state state; ///< 该kernel的执行状态 - kernel_info(map input_blk_list, _state stateIn): - blk_list(std::move(input_blk_list)), - state(stateIn){} -}; - -struct context_info{ - uint64_t contextID; - map kernelList; ///< 该context已经发送给硬件执行的kernel(只有发送给硬件的kernel从会被记录)及其状态:执行完成,未完成 - uint64_t root; - Memory ram = Memory(RAM_RANGE); - context_info(uint64_t taskID) : ram(RAM_RANGE){ - contextID = taskID; - cout << RAM_RANGE < contextList_; - map contextMemory_; - list kernelList_; - - -}; - -class vt_device { -public: - vt_device() { -// addrManager_.attatch_ram(&ram_); - test_proc(); - // list> task_by_block_l; - // vector roots; - } - ~vt_device(){ - if(last_task_.valid()) - last_task_.wait(); - } - - int create_device_mem(uint64_t taskID); - - /** - * @brief 释放分配的空间,释放根页表所指向的空间 - * @param taskID 要释放的内存空间对应的任务ID - * @return int - */ - int delete_device_mem(int taskID); - - - /** - * @brief 为GPU分配按照虚拟地址分配内存空间,返回指向根页表的指针 - * @param size 要分配的空间大小 - * @param dev_maddr 要分配的空间起始虚拟地址 - * @param root 指向根页表的指针 - * @return int - */ - int alloc_local_mem(uint64_t size, 
uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID); - - int free_local_mem(uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID); - - /** - * @brief 将buffer写入到分配给GPU的memory中,只读区间 - * @param taskID 任务ID - * @param dest_addr GPU的memory,虚拟地址 - * @param size 大小 - * @return int - */ - int upload(uint64_t dev_vaddr,const void *src_addr, uint64_t size, uint64_t taskID, uint64_t kernelID); - /** - * @brief - * @param root 根页表 - * @param dest_data_addr 要读取的数据地址,虚拟地址 - * @param src_addr 读出后要放置的位置 - * @param size 大小 - * @return int - */ - int download(uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, uint64_t kernelID); - int start(int taskID, void* metaData); - int wait(uint64_t time); - queue get_finished_kernel(); - queue get_finished_context(); - queue execute_all_kernel(); - bool all_context_finished(); - - - -private: - - int push_kernel(uint64_t taskID, uint64_t kernelID, mapinput_blk_list); -// uint64_t parse_metaData(uint64_t taskID, void *metaData, host_port_t* devicePort); - - Processor processor_; - future last_task_; - queue finished_kernel_l; ///< 已经执行完成的任务ID - addr_manager addrManager_; - map contextList_; -}; - - - - diff --git a/include/ventus.h b/include/ventus.h index 47752ab..6917c7d 100644 --- a/include/ventus.h +++ b/include/ventus.h @@ -55,7 +55,7 @@ int vt_dev_open(vt_device_h* hdevice); int vt_dev_close(vt_device_h hdevice); /// return device configurations - int vt_dev_caps(vt_device_h* hdevice, uint64_t caps_id, uint64_t *value); +int vt_dev_caps(vt_device_h* hdevice, uint64_t caps_id, uint64_t *value); /// @brief 【已实现】以任务为单位,在GPGPU设备上分配虚拟内存空间(创建根页表) /// @param hdevice 指向设备的指针