diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..44a6b67
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,9 @@
+BasedOnStyle: LLVM          # Start from the LLVM code style
+IndentWidth: 4              # Indent with 4 spaces
+AccessModifierOffset: -4    # Access modifiers (public/private/...) are offset by -4
+ColumnLimit: 100            # Maximum number of columns per line
+AlwaysBreakAfterDefinitionReturnType: None
+PenaltyReturnTypeOnItsOwnLine: 1000000
+AlignAfterOpenBracket: BlockIndent 
+AllowShortIfStatementsOnASingleLine: true
+
diff --git a/.gitignore b/.gitignore
index 43a659c..2cd9651 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,6 @@ test
 html/
 cmake-build*
 build
-obj_dir*
\ No newline at end of file
+obj_dir*
+compile_commands.json
+.cache
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..ecf6dff
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "dependencies/membox"]
+	path = dependencies/membox
+	url = https://github.com/THU-DSP-LAB/membox.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c633f5..572c637 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,51 +1,24 @@
 cmake_minimum_required(VERSION 3.22)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
 set(PROJECT ventus_driver)
 project(${PROJECT})
 
 set(CMAKE_CXX_STANDARD 17)
-# set(CMAKE_CXX_FLAGS -Wl,--whole-archive -Wl,--no-whole-archive)
-
-include_directories(common)
-include_directories(include)
-include_directories(devices)
-include_directories(devices/verilating_device/page_table)
-include_directories(tests)
-include_directories(driver)
-
-
-set(CMAKE_BUILD_TYPE "Debug")
-
-if(CMAKE_BUILD_TYPE AND (CMAKE_BUILD_TYPE STREQUAL "Debug"))
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0 -g -ggdb")
-    message("Debug mode:${CMAKE_CXX_FLAGS_DEBUG}")
-#    add_executable(test_debug ${src_dirs})
-
-
-elseif(CMAKE_BUILD_TYPE AND (CMAKE_BUILD_TYPE STREQUAL "Release"))
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -O3")
-    message("Release mode:${CMAKE_CXX_FLAGS_RELEASE}")
-#    add_executable(test_release ${src_dirs})
-else()
-    message("else:${CMAKE_BUILD_TYPE}")
-    message("else:${CMAKE_CXX_FLAGS_RELEASE}")
-#    add_executable(test_release ${src_dirs})
-endif()
-
-if(EXISTS "${CMAKE_SOURCE_DIR}/test.cpp")
-    set(CODING_TEST test)
-    add_executable(${CODING_TEST} test.cpp)
-endif()
-
-option(ENABLE_INSTALL "if install driver library to install dir" OFF)
-
-set(DRIVER_LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/)
-message(STATUS "DRIVER_LIB_INSTALL_DIR:" ${DRIVER_LIB_INSTALL_DIR} "\n")
 
-option(ENABLE_VERILATOR "if add verilated rtl device" OFF)
+# Warn in every configuration; the *_DEBUG/*_RELEASE flag variables skip RelWithDebInfo builds.
+add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-Wall>)
 
-add_subdirectory(devices)
+add_subdirectory(dependencies/membox)
+add_subdirectory(common)
 add_subdirectory(driver)
-if(ENABLE_VERILATOR)
-add_subdirectory(tests)
-endif()
-add_subdirectory(codetests)
\ No newline at end of file
+add_subdirectory(codetests)
+
+set(DRIVER_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/ventus.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/common/vt_utils.h
+)
+install(FILES ${DRIVER_PUBLIC_HEADERS}
+    DESTINATION include COMPONENT "include"
+)
diff --git a/README.md b/README.md
index ceee148..edfd2e0 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,97 @@
 # ventus-driver
-driver code for [ventus-gpgpu](https://github.com/THU-DSP-LAB/ventus-gpgpu)
 
-To build this repository, see [llvm-ventus](https://github.com/THU-DSP-LAB/llvm-project).
+## English
+[中文版 Chinese](#中文)
+
+Driver code for Ventus GPGPU project [ventus-gpgpu](https://github.com/THU-DSP-LAB/ventus-gpgpu). It is intended to be used together with other Ventus toolchain projects. See [ventus-env](https://github.com/THU-DSP-LAB/ventus-env).
+
+### Install
+- Recommended: use [ventus-env](https://github.com/THU-DSP-LAB/ventus-env) to deploy the Ventus environment and build via the script build-ventus.sh.
+
+- Manual CMake build:
+```bash
+cmake -G Ninja -B build/ -S . \
+  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+  -DCMAKE_INSTALL_PREFIX=../install \
+  -DVENTUS_INSTALL_PREFIX=../install \
+  -DSPIKE_SRC_DIR=../spike \
+  -DDRIVER_ENABLE_AUTOSELECT=ON \
+  -DDRIVER_ENABLE_RTLSIM=ON \
+  -DDRIVER_ENABLE_CYCLESIM=ON
+cmake --build build/
+cmake --install build/
+```
+
+### Usage
+This repository provides the driver implementation for Ventus GPGPU, mainly handling memory management and device connectivity.
+
+- Upstream (host): connects to an OpenCL implementation ([POCL](https://github.com/THU-DSP-LAB/pocl)).
+- Downstream (devices): supports multiple (simulation) backends:
+  - Instruction-level simulator [spike](https://github.com/THU-DSP-LAB/ventus-gpgpu-isa-simulator), path: driver/spike_device
+  - SystemC-based cycle-accurate simulator [cyclesim](https://github.com/THU-DSP-LAB/ventus-gpgpu-cpp-simulator), path: driver/cyclesim_device
+  - Verilator-based [Chisel RTL](https://github.com/THU-DSP-LAB/ventus-gpgpu) simulation framework [sim-verilator](https://github.com/THU-DSP-LAB/ventus-gpgpu/tree/master/sim-verilator), path: driver/rtlsim_device
+  - The helper backend driver/auto_select allows choosing a backend via the environment variable VENTUS_BACKEND (see [Environment variables](#environment-variables)).
+- Both upstream and downstream connections are via shared libraries.
+
+This repository builds multiple shared libraries installed into the specified prefix. After configuring environment variables as in [ventus-env](https://github.com/THU-DSP-LAB/ventus-env), you can run OpenCL programs and the driver will be invoked automatically.
+
+### Environment variables
+- `VENTUS_BACKEND` selects the device backend. Allowed values: spike/isa, rtl/rtlsim/gpgpu, cycle/cyclesim/systemc/simulator.
+- `VENTUS_WAVEFORM=1` enables waveform dump: fst for rtlsim, vcd for cyclesim.
+- `VENTUS_WAVEFORM_BEGIN` and `VENTUS_WAVEFORM_END` define a time window to limit waveform dump for rtlsim (speeds up simulation). Not supported by cyclesim.
+- `VENTUS_DUMP_RESULT=filename.json` saves all device-to-host copied data and their device addresses into a JSON file for debugging.
+- `VENTUS_TIMING_DDR=0` disables DDR timing modeling in cyclesim (enabled by default). RTL does not support DDR timing yet.
+
+### Example
+```bash
+VENTUS_BACKEND=rtl VENTUS_DUMP_RESULT=app.rtl.json VENTUS_WAVEFORM=1 ./OpenCLapp.out 2>&1 | tee rtl.log
+```
+
+---
+
+## 中文
+[English Version](#english)
+
+这是[ventus-gpgpu](https://github.com/THU-DSP-LAB/ventus-gpgpu)的驱动程序，目前仅支持仿真环境，需与其他Ventus工具链项目配合使用，参见 [ventus-env](https://github.com/THU-DSP-LAB/ventus-env)。
+
+### Install
+推荐使用 [ventus-env](https://github.com/THU-DSP-LAB/ventus-env) 部署 Ventus 环境，使用其中的 build-ventus.sh 脚本来编译安装。
+
+手动 cmake 编译命令：
+```bash
+cmake -G Ninja -B build/ -S . \
+  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+  -DCMAKE_INSTALL_PREFIX=../install \
+  -DVENTUS_INSTALL_PREFIX=../install \
+  -DSPIKE_SRC_DIR=../spike \
+  -DDRIVER_ENABLE_AUTOSELECT=ON \
+  -DDRIVER_ENABLE_RTLSIM=ON \
+  -DDRIVER_ENABLE_CYCLESIM=ON
+cmake --build build/
+cmake --install build/
+```
+
+### Usage
+本仓库作为乘影 Ventus GPGPU 的驱动实现，主要完成内存管理与设备连接的功能。
+- 向上层连接 OpenCL 实现（[POCL](https://github.com/THU-DSP-LAB/pocl)）。
+- 向底层连接多种（仿真）设备，目前支持：
+  - 指令级仿真器 [spike](https://github.com/THU-DSP-LAB/ventus-gpgpu-isa-simulator)，对应 driver/spike_device
+  - 基于 SystemC 的周期级仿真器 [cyclesim](https://github.com/THU-DSP-LAB/ventus-gpgpu-cpp-simulator)，对应 driver/cyclesim_device
+  - 基于 Verilator 搭建的 [Chisel RTL](https://github.com/THU-DSP-LAB/ventus-gpgpu) 仿真框架 [sim-verilator](https://github.com/THU-DSP-LAB/ventus-gpgpu/tree/master/sim-verilator)，对应 driver/rtlsim_device
+  - 另有 driver/auto_select，用户可通过环境变量 VENTUS_BACKEND 指定具体底层设备（见下文[环境变量](#环境变量)一节）。
+- 向上层与向底层的连接均以动态库链接的形式完成。
+
+本仓库会编译出多个动态库安装到指定目录下。用户参照 [ventus-env](https://github.com/THU-DSP-LAB/ventus-env) 配置好环境变量后运行 OpenCL 程序即可自动调用。
+
+### 环境变量
+- VENTUS_BACKEND 选择底层设备，可选值：spike/isa，rtl/rtlsim/gpgpu，cycle/cyclesim/systemc/simulator。
+- VENTUS_WAVEFORM=1 时，rtlsim 导出 fst 波形，cyclesim 导出 vcd 波形。
+- 设定 VENTUS_WAVEFORM_BEGIN 与 VENTUS_WAVEFORM_END（数字）可使 rtlsim 仅导出该时间段内波形，加速仿真；cyclesim 不支持。
+- VENTUS_DUMP_RESULT=filename.json 将所有从 device 端拷回 host 端的数据及其设备端地址保存到指定 JSON 文件，辅助调试。
+- VENTUS_TIMING_DDR=0 关闭 cyclesim 中的 DDR 时序仿真（默认开启）。RTL 暂不支持 DDR 时序仿真。
+
+### 示例
+```bash
+VENTUS_BACKEND=rtl VENTUS_DUMP_RESULT=app.rtl.json VENTUS_WAVEFORM=1 ./OpenCLapp.out 2>&1 | tee rtl.log
+```
+
diff --git a/TODO b/TODO
deleted file mode 100644
index e69de29..0000000
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
new file mode 100644
index 0000000..7fe0bf4
--- /dev/null
+++ b/common/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Static helper library providing the ELF loader; linked by driver targets (e.g. cyclesim_driver).
+set(PROJECT driver_common_utils)
+project(${PROJECT})
+
+# The source is named explicitly: a recursive glob would also match any loadelf.cpp in
+# nested directories and is not re-evaluated when files are added.
+add_library(${PROJECT} STATIC ${CMAKE_CURRENT_SOURCE_DIR}/loadelf.cpp)
+
+target_link_libraries(${PROJECT} PRIVATE elf)
+target_link_libraries(${PROJECT} PRIVATE spdlog)
+target_link_libraries(${PROJECT} PRIVATE fmt)
+# PUBLIC so that targets linking this library can include the headers in this directory.
+target_include_directories(${PROJECT} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}")
+set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+# Position-independent code is needed because the archive is linked into shared libraries.
+set_target_properties(${PROJECT} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# No install rule is defined for this static archive.
diff --git a/common/loadelf.cpp b/common/loadelf.cpp
new file mode 100644
index 0000000..43b39a2
--- /dev/null
+++ b/common/loadelf.cpp
@@ -0,0 +1,99 @@
+#include "loadelf.hpp"
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <gelf.h>
+#include <libelf.h>
+#include <memory>
+#include <spdlog/spdlog.h>
+#include <unistd.h>
+
+// Parse the ELF file `filename` and return one MemBlock per PT_LOAD program header.
+// Each block holds the segment's p_vaddr, its p_memsz and the p_filesz bytes of
+// initialized data read from the file. Every failure is reported through `logger`
+// and yields an empty vector; the libelf handle and the file descriptor are
+// released on all exit paths.
+std::vector<MemBlock> get_data_from_elf(const char *filename, std::shared_ptr<spdlog::logger> logger) {
+    // Initialize libelf.
+    if (elf_version(EV_CURRENT) == EV_NONE) {
+        logger->error("ELF: cannot initialize libelf");
+        return {};
+    }
+
+    // Open the ELF file.
+    const int fd = open(filename, O_RDONLY);
+    if (fd < 0) {
+        const char *errstr = strerrordesc_np(errno);
+        logger->error("ELF: cannot open file '{}': {}", filename, errstr);
+        return {};
+    }
+    // Closes fd on every return below.
+    struct FdCloser {
+        int fd;
+        ~FdCloser() { close(fd); }
+    } fd_guard{fd};
+
+    // Start ELF processing. elf_guard is declared after fd_guard, so elf_end()
+    // runs before close(fd), the same order the success path has always used.
+    std::unique_ptr<Elf, decltype(&elf_end)> elf_guard(
+        elf_begin(fd, ELF_C_READ, nullptr), &elf_end
+    );
+    Elf *e = elf_guard.get();
+    if (!e) {
+        logger->error("ELF: elf_begin failed: {}", elf_errmsg(-1));
+        return {};
+    }
+
+    // Read the ELF header.
+    GElf_Ehdr ehdr;
+    if (gelf_getehdr(e, &ehdr) == nullptr) {
+        logger->error("ELF: gelf_getehdr failed: {}", elf_errmsg(-1));
+        return {};
+    }
+
+    // Number of entries in the program header table.
+    size_t phnum = 0;
+    if (elf_getphdrnum(e, &phnum) != 0) {
+        logger->error("ELF: elf_getphdrnum failed: {}", elf_errmsg(-1));
+        return {};
+    }
+
+    std::vector<MemBlock> blocks;
+    for (size_t i = 0; i < phnum; ++i) {
+        GElf_Phdr phdr;
+        if (gelf_getphdr(e, i, &phdr) != &phdr) {
+            logger->error("ELF: gelf_getphdr header {} failed: {}", i, elf_errmsg(-1));
+            return {};
+        }
+        // Only loadable segments are of interest.
+        if (phdr.p_type != PT_LOAD) continue;
+
+        MemBlock block;
+        block.vaddr = phdr.p_vaddr;
+        block.memsz = phdr.p_memsz;
+
+        // Read the initialized data, if the segment has any in the file.
+        if (phdr.p_filesz > 0) {
+            block.data.resize(phdr.p_filesz);
+            if (lseek(fd, phdr.p_offset, SEEK_SET) == -1) {
+                const char *errstr = strerrordesc_np(errno);
+                logger->error("ELF: failed seeking to offset {}: {}", phdr.p_offset, errstr);
+                return {};
+            }
+            ssize_t bytesRead = read(fd, block.data.data(), phdr.p_filesz);
+            if (bytesRead != (ssize_t)phdr.p_filesz) {
+                // errno is only meaningful when read() itself failed.
+                const char *errstr =
+                    bytesRead < 0 ? strerrordesc_np(errno) : "short read (truncated file?)";
+                logger->error(
+                    "ELF: failed reading {} bytes data: {}", phdr.p_filesz, errstr
+                );
+                return {};
+            }
+        }
+        blocks.push_back(std::move(block));
+    }
+    return blocks;
+}
diff --git a/common/loadelf.hpp b/common/loadelf.hpp
new file mode 100644
index 0000000..c7560af
--- /dev/null
+++ b/common/loadelf.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <spdlog/spdlog.h>
+#include <vector>
+
+// Description of a memory block to allocate, as returned after parsing an ELF file
+typedef struct MemBlock {
+    uint64_t vaddr;            // start address of the memory block
+    size_t memsz;              // size to allocate for the memory block
+    std::vector<uint8_t> data; // initialization data of this memory block
+    // if (filesz =) data.size < memsz, the block must be zero-padded up to memsz
+} MemBlock;
+
+std::vector<MemBlock> get_data_from_elf(const char *filename, std::shared_ptr<spdlog::logger> logger);
diff --git a/common/utils.hpp b/common/utils.hpp
new file mode 100644
index 0000000..4a895c3
--- /dev/null
+++ b/common/utils.hpp
@@ -0,0 +1,45 @@
+#include <algorithm>
+#include <cctype>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <optional>
+#include <string>
+
+inline std::optional<bool> parse_bool(std::string str) {
+    // Case-insensitive match; unsigned char keeps std::tolower defined for bytes >= 0x80.
+    std::transform(str.begin(), str.end(), str.begin(), [](unsigned char c) {
+        return static_cast<char>(std::tolower(c));
+    });
+    if (str == "true" || str == "1" || str == "yes" || str == "on") return true;
+    if (str == "false" || str == "0" || str == "no" || str == "off") return false;
+    return std::nullopt;
+}
+inline std::optional<bool> parse_bool(const char *str) {
+    if (str == nullptr) return std::nullopt;
+    return parse_bool(std::string(str));
+}
+
+inline std::optional<uint64_t> parse_u64(const char *s) {
+    if (!s || *s == '\0') return std::nullopt;
+    // Binary literals (0b/0B) are not understood by strtoull, so they are parsed by hand.
+    if (s[0] == '0' && (s[1] == 'b' || s[1] == 'B')) {
+        const char *p = s + 2;
+        if (*p == '\0') return std::nullopt;
+        uint64_t v = 0;
+        for (; *p == '0' || *p == '1'; ++p) {
+            if (v >> 63) return std::nullopt; // the next shift would drop the top bit
+            v = (v << 1) | static_cast<uint64_t>(*p - '0');
+        }
+        if (*p != '\0') return std::nullopt;
+        return v;
+    }
+
+    // strtoull would silently wrap "-1" to a huge value; an unsigned number never has '-'.
+    for (const char *q = s; *q != '\0'; ++q) if (*q == '-') return std::nullopt;
+    errno = 0;
+    char *endp = nullptr;
+    unsigned long long v = std::strtoull(s, &endp, 0); // base 0: 0x -> 16, 0 -> 8, else 10
+    if (errno != 0 || endp == s || *endp != '\0') return std::nullopt;
+    return static_cast<uint64_t>(v);
+}
diff --git a/dependencies/membox b/dependencies/membox
new file mode 160000
index 0000000..fdccb9a
--- /dev/null
+++ b/dependencies/membox
@@ -0,0 +1 @@
+Subproject commit fdccb9a64675465e682136979d9dc394afc89b88
diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index 27c86fe..f2e6452 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -1,5 +1,31 @@
-if(ENABLE_VERILATOR)
-    add_subdirectory(verilating_device)
+option(DRIVER_ENABLE_CYCLESIM "Enable cyclesim device driver" OFF)
+option(DRIVER_ENABLE_RTLSIM "Enable rtlsim device driver" OFF)
+option(DRIVER_ENABLE_GVM "Enable gvm device driver" OFF)
+option(DRIVER_ENABLE_AUTOSELECT "Enable auto select device driver" OFF)
+
+add_subdirectory(spike_device)
+set(DRIVER_DEFAULT "spike_driver")
+
+# If you enable these, it's assumed that backend libraries are already installed to ${VENTUS_INSTALL_PREFIX}/lib
+if(DRIVER_ENABLE_CYCLESIM)
+    add_subdirectory(cyclesim_device)
+endif()
+if(DRIVER_ENABLE_RTLSIM)
+    add_subdirectory(rtlsim_device)
 endif()
+if(DRIVER_ENABLE_GVM)
+    add_subdirectory(gvm_device)
+endif()
+if(DRIVER_ENABLE_AUTOSELECT)
+    add_subdirectory(auto_select)
+    set(DRIVER_DEFAULT "auto_select_driver")
+endif()
+
 
-add_subdirectory(spike_device)
\ No newline at end of file
+# Prefix is resolved at install time so `cmake --install --prefix` and DESTDIR are honored.
+install(CODE "
+    file(CREATE_LINK
+        lib${DRIVER_DEFAULT}.so
+        \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/lib/libventus_driver.so
+        SYMBOLIC)
+")
diff --git a/driver/auto_select/CMakeLists.txt b/driver/auto_select/CMakeLists.txt
new file mode 100644
index 0000000..996c21b
--- /dev/null
+++ b/driver/auto_select/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(PROJECT auto_select_driver)
+project(${PROJECT})
+
+find_package(nlohmann_json CONFIG REQUIRED)
+
+# Single explicit source and target-scoped include paths (nothing leaks to other targets).
+add_library(${PROJECT} SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ventus.cpp)
+target_include_directories(${PROJECT} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../common
+)
+target_link_libraries(${PROJECT} PRIVATE spdlog fmt)
+target_link_libraries(${PROJECT} PRIVATE ${CMAKE_DL_LIBS})
+target_link_libraries(${PROJECT} PRIVATE nlohmann_json::nlohmann_json)
+
+set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}")
+set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+
+install(TARGETS ${PROJECT}
+    LIBRARY DESTINATION lib COMPONENT "lib"
+)
diff --git a/driver/auto_select/ventus.cpp b/driver/auto_select/ventus.cpp
new file mode 100644
index 0000000..c6b07a7
--- /dev/null
+++ b/driver/auto_select/ventus.cpp
@@ -0,0 +1,331 @@
+/*
+ * Ventus driver for the Auto Select feature.
+ * This driver is designed to work with the Ventus spike/rtlsim/cyclesim devices.
+ * It uses dynamic loading to access the Ventus library functions.
+ */
+
+#include "ventus.h"
+#include <algorithm>
+#include <cctype>
+#include <cstdlib>
+#include <dlfcn.h>
+#include <filesystem>
+#include <fmt/core.h>
+#include <fstream>
+#include <map>
+#include <nlohmann/json.hpp>
+#include <optional>
+#include <spdlog/spdlog.h>
+#include <string>
+
+//
+// 用于导出memcpy_device_to_host的所有数据及其地址（环境变量VENTUS_DUMP_RESULT=filename.json）
+//
+
+// Helpers. to_hex_string takes 64 bits so device addresses and sizes are never truncated.
+static int append_json_object(const std::string &filename, const nlohmann::json &new_obj);
+static std::string to_hex_string(uint64_t value) { return fmt::format("0x{:08X}", value); }
+// 下边两个函数也可被外界调用，方便再OpenCL APP中获取设备端指针具体地址
+// 获取上次memcpy_device_to_host的设备端地址
+extern "C" int __vt_get_last_copy_to_dev_addr(uint64_t *addr);
+// 设置json dump文件名，传入nullptr表示关闭dump功能，等价于VENTUS_DUMP_RESULT环境变量
+extern "C" void __vt_enable_dump_json_copy_to_dev(const char *filename);
+// 全局变量
+static uint64_t g_last_copy_to_dev_addr = 0; // 上次memcpy_device_to_host的设备端地址
+static std::optional<std::string> g_dump_result_filename = std::nullopt;
+
+//
+// 定义函数指针结构体，包含所有ventus.h API的函数指针
+//
+struct vt_api_t {
+    int (*vt_dev_open)(vt_device_h *hdevice);
+    int (*vt_dev_close)(vt_device_h hdevice);
+    int (*vt_dev_caps)(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value);
+    int (*vt_root_mem_alloc)(vt_device_h hdevice, int taskID);
+    int (*vt_root_mem_free)(vt_device_h hdevice, int taskID);
+    int (*vt_buf_alloc)(
+        vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID,
+        uint64_t kernelID
+    );
+    int (*vt_buf_free)(
+        vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+    );
+    int (*vt_one_buf_free)(
+        vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+    );
+    int (*vt_copy_to_dev)(
+        vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size,
+        uint64_t taskID, uint64_t kernelID
+    );
+    int (*vt_copy_from_dev)(
+        vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID,
+        uint64_t kernelID
+    );
+    int (*vt_start)(vt_device_h hdevice, void *metaData, uint64_t taskID);
+    int (*vt_ready_wait)(vt_device_h hdevice, uint64_t timeout);
+    int (*vt_finish_all_kernel)(vt_device_h hdevice, std::queue<int> *finished_kernel_list);
+    int (*vt_upload_kernel_bytes)(
+        vt_device_h device, const void *content, uint64_t size, int taskID
+    );
+    int (*vt_upload_kernel_file)(vt_device_h device, const char *filename, int kernelID);
+    int (*vt_dump_perf)(vt_device_h device, FILE *stream);
+} vt_api = {0};
+
+// Load the backend library chosen by VENTUS_BACKEND and fill in the function-pointer table
+vt_api_t load_backend() {
+    vt_api_t api = {0}; // zero-initialize the function-pointer struct
+
+    // Read the environment variable VENTUS_BACKEND to determine the shared-library name
+    const char *backend_ = std::getenv("VENTUS_BACKEND");
+    std::string backend = backend_ ? backend_ : "spike";
+    std::transform(backend.begin(), backend.end(), backend.begin(), [](unsigned char c) {
+        return std::tolower(c);
+    });
+    std::map<std::string, std::string> backend_map;
+    backend_map["isa"] = "libspike_driver.so";
+    backend_map["spike"] = "libspike_driver.so";
+    backend_map["rtlsim"] = "librtlsim_driver.so";
+    backend_map["rtl"] = "librtlsim_driver.so";
+    backend_map["gpgpu"] = "librtlsim_driver.so";
+    backend_map["cycle"] = "libcyclesim_driver.so";
+    backend_map["cyclesim"] = "libcyclesim_driver.so";
+    backend_map["gvm"] = "libgvm_driver.so";    
+    backend_map["simulator"] = "libcyclesim_driver.so";
+    backend_map["systemc"] = "libcyclesim_driver.so";
+
+    std::string backend_soname;
+    if (backend_map.find(backend) != backend_map.end()) {
+        backend_soname = backend_map[backend];
+    } else {
+        SPDLOG_ERROR("Unsupported VENTUS_BACKEND: {}", backend);
+        std::exit(EXIT_FAILURE);
+    }
+
+    // Find the path of this library; the other shared libraries are looked up relative to it
+    Dl_info info;
+    if (dladdr((void *)load_backend, &info) == 0) {
+        SPDLOG_ERROR("dlopen failed to get current library path.");
+        std::exit(EXIT_FAILURE);
+    }
+    std::filesystem::path self_path(info.dli_fname);
+    self_path = self_path.parent_path(); // directory containing this library
+
+    // Build the backend library path, e.g. "install/lib/liba.so"
+    std::string lib_path = self_path / backend_soname;
+    void *handle = dlopen(lib_path.c_str(), RTLD_LAZY);
+    if (!handle) {
+        SPDLOG_ERROR("dlopen failed to load backend library: {}", dlerror());
+        std::exit(EXIT_FAILURE);
+    }
+
+    // Resolve the function pointer of every API
+    // clang-format off
+    api.vt_dev_open = (int (*)(vt_device_h*))dlsym(handle, "vt_dev_open");
+    api.vt_dev_close = (int (*)(vt_device_h))dlsym(handle, "vt_dev_close");
+    api.vt_dev_caps = (int (*)(vt_device_h*, uint64_t, uint64_t*))dlsym(handle, "vt_dev_caps");
+    api.vt_root_mem_alloc = (int (*)(vt_device_h, int))dlsym(handle, "vt_root_mem_alloc");
+    api.vt_root_mem_free = (int (*)(vt_device_h, int))dlsym(handle, "vt_root_mem_free");
+    api.vt_buf_alloc = (int (*)(vt_device_h, uint64_t, uint64_t*, int, uint64_t, uint64_t))dlsym(handle, "vt_buf_alloc");
+    api.vt_buf_free = (int (*)(vt_device_h, uint64_t, uint64_t*, uint64_t, uint64_t))dlsym(handle, "vt_buf_free");
+    api.vt_one_buf_free = (int (*)(vt_device_h, uint64_t, uint64_t*, uint64_t, uint64_t))dlsym(handle, "vt_one_buf_free");
+    api.vt_copy_to_dev = (int (*)(vt_device_h, uint64_t, const void*, uint64_t, uint64_t, uint64_t))dlsym(handle, "vt_copy_to_dev");
+    api.vt_copy_from_dev = (int (*)(vt_device_h, uint64_t, void*, uint64_t, uint64_t, uint64_t))dlsym(handle, "vt_copy_from_dev");
+    api.vt_start = (int (*)(vt_device_h, void*, uint64_t))dlsym(handle, "vt_start");
+    api.vt_ready_wait = (int (*)(vt_device_h, uint64_t))dlsym(handle, "vt_ready_wait");
+    api.vt_finish_all_kernel = (int (*)(vt_device_h, std::queue<int>*))dlsym(handle, "vt_finish_all_kernel");
+    api.vt_upload_kernel_bytes = (int (*)(vt_device_h, const void*, uint64_t, int))dlsym(handle, "vt_upload_kernel_bytes");
+    api.vt_upload_kernel_file = (int (*)(vt_device_h, const char*, int))dlsym(handle, "vt_upload_kernel_file");
+    api.vt_dump_perf = (int (*)(vt_device_h, FILE*))dlsym(handle, "vt_dump_perf");
+    // clang-format on
+
+    // Check that every function pointer was resolved
+    if (!api.vt_dev_open || !api.vt_dev_close || !api.vt_dev_caps || !api.vt_root_mem_alloc ||
+        !api.vt_root_mem_free || !api.vt_buf_alloc || !api.vt_buf_free || !api.vt_one_buf_free ||
+        !api.vt_copy_to_dev || !api.vt_copy_from_dev || !api.vt_start || !api.vt_ready_wait ||
+        !api.vt_finish_all_kernel || !api.vt_upload_kernel_bytes || !api.vt_upload_kernel_file ||
+        !api.vt_dump_perf) {
+        // If any symbol could not be resolved, log the error and terminate the process
+        // dlclose is not called explicitly; cleanup is left to the operating system
+        SPDLOG_ERROR("Failed to load all required functions from backend library");
+        std::exit(EXIT_FAILURE);
+    }
+
+    // Loaded successfully: return the populated api struct
+    // The handle is not stored; the OS is relied on to unload the library at process exit
+    return api;
+}
+
+// Loads the backend at static-init time; sets `loaded` so vt_dev_open() does not load it again.
+struct BackendLoader {
+    vt_api_t api = {0};
+    bool loaded = false;
+    BackendLoader() { api = load_backend(); loaded = true; }
+};
+static BackendLoader loader;
+
+// 实现所有 API 函数，使用 extern "C" 确保符号正确导出
+extern "C" int vt_dev_open(vt_device_h *hdevice) {
+    if (!loader.loaded) {
+        loader.api = load_backend(); // 动态加载后端库
+        loader.loaded = true;        // 标记为已加载
+    }
+    if (!loader.api.vt_dev_open) return -1;
+    const char *env_dump_result = std::getenv("VENTUS_DUMP_RESULT");
+    if (env_dump_result == nullptr) {
+        env_dump_result = std::getenv("VENTUS_DRIVER_DUMP_RESULT"); // capability name
+    }
+    if (!g_dump_result_filename && env_dump_result) {
+        g_dump_result_filename = std::string{env_dump_result};
+        std::ofstream ofs(*g_dump_result_filename, std::ios::trunc | std::ios::out);
+        ofs.close(); // 清空文件
+    }
+    return loader.api.vt_dev_open(hdevice);
+}
+
+extern "C" int vt_dev_close(vt_device_h hdevice) {
+    if (!loader.api.vt_dev_close) return -1;
+    return loader.api.vt_dev_close(hdevice);
+}
+
+extern "C" int vt_dev_caps(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value) {
+    if (!loader.api.vt_dev_caps) return -1;
+    return loader.api.vt_dev_caps(hdevice, caps_id, value);
+}
+
+extern "C" int vt_root_mem_alloc(vt_device_h hdevice, int taskID) {
+    if (!loader.api.vt_root_mem_alloc) return -1;
+    return loader.api.vt_root_mem_alloc(hdevice, taskID);
+}
+
+extern "C" int vt_root_mem_free(vt_device_h hdevice, int taskID) {
+    if (!loader.api.vt_root_mem_free) return -1;
+    return loader.api.vt_root_mem_free(hdevice, taskID);
+}
+
+extern "C" int vt_buf_alloc(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (!loader.api.vt_buf_alloc) return -1;
+    return loader.api.vt_buf_alloc(hdevice, size, vaddr, BUF_TYPE, taskID, kernelID);
+}
+
+extern "C" int vt_buf_free(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+) {
+    if (!loader.api.vt_buf_free) return -1;
+    return loader.api.vt_buf_free(hdevice, size, vaddr, taskID, kernelID);
+}
+
+extern "C" int vt_one_buf_free(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+) {
+    if (!loader.api.vt_one_buf_free) return -1;
+    return loader.api.vt_one_buf_free(hdevice, size, vaddr, taskID, kernelID);
+}
+
+extern "C" int vt_copy_to_dev(
+    vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (!loader.api.vt_copy_to_dev) return -1;
+    return loader.api.vt_copy_to_dev(hdevice, dev_vaddr, src_addr, size, taskID, kernelID);
+}
+
+extern "C" int vt_copy_from_dev(
+    vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (!loader.api.vt_copy_from_dev) return -1;
+    int result = loader.api.vt_copy_from_dev(hdevice, dev_vaddr, dst_addr, size, taskID, kernelID);
+    if (result == 0) g_last_copy_to_dev_addr = dev_vaddr;
+    if (g_dump_result_filename) {
+        nlohmann::json j = {{"address", to_hex_string(dev_vaddr)}, {"size", to_hex_string(size)}};
+        nlohmann::json &addr_data = j["data"];
+        // Little-endian words are built per byte: a trailing partial word is zero-padded, not overread.
+        const auto *src = static_cast<const uint8_t *>(dst_addr);
+        for (uint64_t off = 0; off < size; off += 4) {
+            uint32_t word = 0;
+            for (uint64_t b = 0; b < 4 && off + b < size; b++)
+                word |= static_cast<uint32_t>(src[off + b]) << (8 * b);
+            addr_data[to_hex_string(dev_vaddr + off)] = to_hex_string(word);
+        }
+        append_json_object(*g_dump_result_filename, j);
+    }
+    return result;
+}
+
+extern "C" int vt_start(vt_device_h hdevice, void *metaData, uint64_t taskID) {
+    if (!loader.api.vt_start) return -1;
+    return loader.api.vt_start(hdevice, metaData, taskID);
+}
+
+extern "C" int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) {
+    if (!loader.api.vt_ready_wait) return -1;
+    return loader.api.vt_ready_wait(hdevice, timeout);
+}
+
+extern "C" int vt_finish_all_kernel(vt_device_h hdevice, std::queue<int> *finished_kernel_list) {
+    if (!loader.api.vt_finish_all_kernel) return -1;
+    return loader.api.vt_finish_all_kernel(hdevice, finished_kernel_list);
+}
+
+extern "C" int vt_upload_kernel_bytes(
+    vt_device_h device, const void *content, uint64_t size, int taskID
+) {
+    if (!loader.api.vt_upload_kernel_bytes) return -1;
+    return loader.api.vt_upload_kernel_bytes(device, content, size, taskID);
+}
+
+extern "C" int vt_upload_kernel_file(vt_device_h device, const char *filename, int kernelID) {
+    if (!loader.api.vt_upload_kernel_file) return -1;
+    return loader.api.vt_upload_kernel_file(device, filename, kernelID);
+}
+
+extern "C" int vt_dump_perf(vt_device_h device, FILE *stream) {
+    if (!loader.api.vt_dump_perf) return -1;
+    return loader.api.vt_dump_perf(device, stream);
+}
+
+extern "C" int __vt_get_last_copy_to_dev_addr(uint64_t *addr) {
+    *addr = g_last_copy_to_dev_addr;
+    return 0;
+}
+
+extern "C" void __vt_enable_dump_json_copy_to_dev(const char *filename) {
+    if (!filename) {
+        g_dump_result_filename = std::nullopt;
+        return;
+    }
+    if (!g_dump_result_filename || *g_dump_result_filename != filename) {
+        g_dump_result_filename = std::string{filename};
+        std::ofstream ofs(*g_dump_result_filename, std::ios::trunc | std::ios::out);
+        ofs.close(); // 清空文件
+    }
+}
+
+// Append new_obj to the JSON array stored in `filename` (created if missing or empty).
+// Returns 0 on success and 1 if the file is not a valid JSON array or cannot be written.
+static int append_json_object(const std::string &filename, const nlohmann::json &new_obj) {
+    nlohmann::json root;
+    std::ifstream ifs(filename);
+    if (ifs.is_open() && ifs.peek() != std::ifstream::traits_type::eof()) {
+        // Non-throwing parse: malformed input yields a discarded value, rejected just below.
+        root = nlohmann::json::parse(ifs, nullptr, false);
+        ifs.close();
+        if (!root.is_array()) {
+            SPDLOG_ERROR("Error: File is not a JSON array: {}", filename);
+            return 1;
+        }
+    } else {
+        root = nlohmann::json::array();
+    }
+    root.push_back(new_obj);
+    std::ofstream ofs(filename);
+    if (!ofs.is_open()) {
+        SPDLOG_ERROR("Unable to open json dump file: {}", filename);
+        return 1;
+    }
+    ofs << root.dump(4);
+    return 0;
+}
diff --git a/driver/cyclesim_device/CMakeLists.txt b/driver/cyclesim_device/CMakeLists.txt
new file mode 100644
index 0000000..a214b2f
--- /dev/null
+++ b/driver/cyclesim_device/CMakeLists.txt
@@ -0,0 +1,29 @@
+set(PROJECT cyclesim_driver)
+project(${PROJECT})
+
+# Single explicit source and target-scoped include paths (nothing leaks to other targets).
+add_library(${PROJECT} SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ventus.cpp)
+target_include_directories(${PROJECT} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../common
+    ${VENTUS_INSTALL_PREFIX}/include
+)
+target_link_directories(${PROJECT} PRIVATE ${VENTUS_INSTALL_PREFIX}/lib)
+target_link_libraries(${PROJECT} PRIVATE VentusCycleSim)
+target_link_libraries(${PROJECT} PRIVATE driver_common_utils)
+target_link_libraries(${PROJECT} PRIVATE spdlog)
+target_link_libraries(${PROJECT} PRIVATE fmt)
+
+target_compile_definitions(${PROJECT} PRIVATE
+    SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE
+)
+set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}")
+set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+set_target_properties(${PROJECT} PROPERTIES
+    BUILD_RPATH "${VENTUS_INSTALL_PREFIX}/lib"
+    INSTALL_RPATH "$ORIGIN"
+)
+
+install(TARGETS ${PROJECT}
+    LIBRARY DESTINATION lib COMPONENT "lib"
+)
diff --git a/driver/cyclesim_device/ventus.cpp b/driver/cyclesim_device/ventus.cpp
new file mode 100644
index 0000000..fb978e8
--- /dev/null
+++ b/driver/cyclesim_device/ventus.cpp
@@ -0,0 +1,289 @@
+/**
+ * @file ventus.cpp
+ * @brief 设备和OpenCL程序的交互功能的实现
+ *
+ * 1. `/include/ventus.h`中声明的函数
+ */
+
+#include "ventus.h"
+#include "loadelf.hpp"
+#include "utils.hpp"
+#include "ventus_cyclesim.h"
+#include <cstdint>
+#include <cstdlib>
+#include <map>
+#include <memory>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+#include <stdlib.h>
+#include <string>
+#include <vector>
+
+typedef struct driver_metadata_t { // layout vt_start() assumes for its mtd_raw argument
+    uint64_t kernel_id;
+    uint64_t kernel_size[3];   ///< number of workgroups in each of the kernel's three dimensions
+    uint64_t wf_size;          ///< number of threads per warp
+    uint64_t wg_size;          ///< number of warps per workgroup
+    uint64_t metaDataBaseAddr; ///< value of CSR_KNL
+    uint64_t ldsSize;          ///< size of the local memory used by each workgroup
+    uint64_t pdsSize;          ///< size of the private memory used by each thread
+    uint64_t sgprUsage;        ///< number of scalar registers used by each workgroup
+    uint64_t vgprUsage;        ///< number of vector registers used by each thread
+    uint64_t pdsBaseAddr; ///< base address of private memory; to be turned into a per-workgroup
+                          /// base address (wf_size*wg_size*pdsSize)
+    const char *kernel_name; ///< forwarded to the simulator as the kernel's name
+} driver_metadata_t;
+
+static std::map<int, uint64_t> g_ptroots; // taskID -> page-table root physical address
+static std::shared_ptr<spdlog::logger> logger; // assigned in vt_dev_open()
+static uint64_t g_alloc_vaddr = 0x90000000; // address vt_buf_alloc() passes to vmem_alloc next
+static std::vector<std::pair<vaddr_t, size_t>> g_elf_alloc; // (vaddr, size) of the last uploaded ELF
+
+/**
+ * Open the cycle-simulator device and return its handle through `hdevice`.
+ *
+ * Configuration comes from the simulator defaults, adjusted by the environment variables
+ * VENTUS_TIMING_DDR, VENTUS_WAVEFORM, VENTUS_WAVEFORM_BEGIN and VENTUS_WAVEFORM_END.
+ * A default root page table is created and registered for taskID 0.
+ *
+ * @param hdevice receives the device handle; must not be null
+ * @return 0 on success, -1 if `hdevice` is null, the simulator could not be created,
+ *         or the default root page table could not be created
+ */
+extern int vt_dev_open(vt_device_h *hdevice) {
+    if (hdevice == nullptr) return -1;
+    ventus_cyclesim_config_t config;
+    ventus_cyclesim_get_default_config(&config);
+    config.sim_time_max = ~0ull;
+    config.ramulator.enable = parse_bool(std::getenv("VENTUS_TIMING_DDR")).value_or(true);
+    config.waveform.enable = parse_bool(std::getenv("VENTUS_WAVEFORM")).value_or(false);
+    // Setting either bound of the waveform window also turns waveform output on.
+    config.waveform.enable |= parse_u64(std::getenv("VENTUS_WAVEFORM_BEGIN")).has_value();
+    config.waveform.enable |= parse_u64(std::getenv("VENTUS_WAVEFORM_END")).has_value();
+    config.waveform.filename = "waveform.cycle";
+    auto device = ventus_cyclesim_init(&config);
+    if (device == nullptr) return -1;
+    *hdevice = device;
+    // stdout_color_mt() throws when the name is already registered, which would happen
+    // on a second vt_dev_open(); reuse the existing logger in that case.
+    logger = spdlog::get("ventus");
+    if (!logger) logger = spdlog::stdout_color_mt("ventus");
+    logger->set_level(spdlog::level::debug);
+    SPDLOG_LOGGER_DEBUG(logger, "vt_dev_open : hello world from ventus.cpp (cyclesim device)");
+
+    // TODO: temp
+    // POCL should call vt_root_mem_alloc() to create virtual memory space before any buf_alloc
+    // but currently it seems not. So we create a default root page table here.
+    uint64_t ptroot = ventus_cyclesim_vmem_create(device);
+    if (ptroot == 0) return -1;
+    g_ptroots[0] = ptroot;
+    return 0;
+}
+
+/// Finish the simulation behind `hdevice`. Returns -1 for a null handle, 0 otherwise.
+extern int vt_dev_close(vt_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    ventus_cyclesim_t *sim = static_cast<ventus_cyclesim_t *>(hdevice);
+    ventus_cyclesim_finish(sim, false);
+    SPDLOG_LOGGER_DEBUG(logger, "vt_dev_close: goodbye from ventus.cpp (cyclesim device)");
+    return 0;
+}
+int vt_dev_caps(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value) {
+    // TODO: capability queries are not implemented for this device; every call fails with -1
+    return -1;
+}
+
+/**
+ * Allocate `size` bytes of device virtual memory in the address space of `taskID`.
+ *
+ * The address requested from the simulator is the driver's running cursor (g_alloc_vaddr),
+ * not the incoming `*vaddr`; the incoming value is only logged. The cursor advances by
+ * max(size, 0x1000) on every call, whether or not the allocation succeeded.
+ *
+ * @param hdevice  device handle; must not be null
+ * @param size     number of bytes to allocate; must be non-zero
+ * @param vaddr    in: logged only; out: allocated address (0 on failure); must not be null
+ * @param BUF_TYPE unused by this device
+ * @param taskID   selects the page-table root in g_ptroots
+ * @param kernelID unused by this device
+ * @return 0 on success, -1 on invalid arguments or when the simulator returns address 0
+ */
+extern int vt_buf_alloc(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID,
+    uint64_t kernelID
+) {
+    // vaddr is read for the log line and written below, so it must be a valid pointer.
+    if (size == 0 || hdevice == nullptr || vaddr == nullptr) return -1;
+    auto device = static_cast<ventus_cyclesim_t *>(hdevice);
+    uint64_t vaddr_allocated =
+        ventus_cyclesim_vmem_alloc(device, g_ptroots[taskID], g_alloc_vaddr, size);
+    SPDLOG_LOGGER_DEBUG(
+        logger, "vt_buf_alloc: vaddr_recommand={:x}, vaddr_allocated={:x}, size=0x{:x}, taskID={}",
+        *vaddr, vaddr_allocated, size, taskID
+    );
+    g_alloc_vaddr += (size > 0x1000) ? size : 0x1000;
+    *vaddr = vaddr_allocated;
+    if (*vaddr == 0) return -1;
+    return 0;
+}
+
+/**
+ * Free a device buffer in the address space of `taskID`.
+ *
+ * @param hdevice  device handle; must not be null
+ * @param size     size in bytes passed on to the simulator's vmem_free
+ * @param vaddr    points to the device virtual address to free; must not be null
+ * @param taskID   selects the page-table root in g_ptroots
+ * @param kernelID unused by this device
+ * @return 0 after the free request was issued, -1 on a null handle or null `vaddr`
+ */
+extern int vt_buf_free(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+) {
+    if (hdevice == nullptr || vaddr == nullptr) return -1;
+    auto device = static_cast<ventus_cyclesim_t *>(hdevice);
+    ventus_cyclesim_vmem_free(device, g_ptroots[taskID], *vaddr, size);
+    SPDLOG_LOGGER_DEBUG(
+        logger, "vt_buf_free: vaddr=0x{:x}, size=0x{:x}, taskID={}", *vaddr, size, taskID
+    );
+    return 0;
+}
+
+/**
+ * Free a single device buffer in the address space of `taskID`.
+ * On this device it issues the same simulator call as vt_buf_free().
+ *
+ * @param hdevice  device handle; must not be null
+ * @param size     size in bytes passed on to the simulator's vmem_free
+ * @param vaddr    points to the device virtual address to free; must not be null
+ * @param taskID   selects the page-table root in g_ptroots
+ * @param kernelID unused by this device
+ * @return 0 after the free request was issued, -1 on a null handle or null `vaddr`
+ */
+extern int vt_one_buf_free(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+) {
+    if (hdevice == nullptr || vaddr == nullptr) return -1;
+    auto device = static_cast<ventus_cyclesim_t *>(hdevice);
+    ventus_cyclesim_vmem_free(device, g_ptroots[taskID], *vaddr, size);
+    // Log under this function's own name so the trace can be told apart from vt_buf_free.
+    SPDLOG_LOGGER_DEBUG(
+        logger, "vt_one_buf_free: vaddr=0x{:x}, size=0x{:x}, taskID={}", *vaddr, size, taskID
+    );
+    return 0;
+}
+
+/**
+ * @brief  Create a root page table on the device and register it under `taskID`.
+ * @param  hdevice device handle
+ * @param  taskID  key under which the new page-table root is stored in g_ptroots
+ * @return 0 on success, -1 if the handle is null or the simulator returns a zero root
+ */
+extern int vt_root_mem_alloc(vt_device_h hdevice, int taskID) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    auto *sim = static_cast<ventus_cyclesim_t *>(hdevice);
+    const auto root = ventus_cyclesim_vmem_create(sim);
+    if (root == 0) {
+        return -1;
+    }
+    SPDLOG_LOGGER_DEBUG(logger, "vt_root_mem_alloc: taskID={}, ptroot={:x}", taskID, root);
+    g_ptroots[taskID] = root;
+    return 0;
+}
+
+/**
+ * Release the root page table registered for `taskID` (one taskID corresponds to one context).
+ *
+ * @param hdevice device handle; must not be null
+ * @param taskID  task whose page-table root is destroyed and removed from g_ptroots
+ * @return 0 on success, -1 if the handle is null or no root is registered for `taskID`
+ */
+extern int vt_root_mem_free(vt_device_h hdevice, int taskID) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_cyclesim_t *>(hdevice);
+    // Look the entry up without operator[], which would insert a zero root for unknown tasks.
+    const auto entry = g_ptroots.find(taskID);
+    if (entry == g_ptroots.end()) {
+        SPDLOG_LOGGER_ERROR(logger, "vt_root_mem_free: no root page table for taskID={}", taskID);
+        return -1;
+    }
+    // Keep a copy of the root so it can still be logged after the map entry is gone.
+    const uint64_t ptroot = entry->second;
+    ventus_cyclesim_vmem_destroy(device, ptroot);
+    g_ptroots.erase(entry);
+    SPDLOG_LOGGER_DEBUG(logger, "vt_root_mem_free: taskID={}, ptroot={:x}", taskID, ptroot);
+    return 0;
+}
+
+/**
+ * Copy `size` bytes from host memory at `src_addr` to device virtual address `dev_vaddr`
+ * in the address space of `taskID`.
+ *
+ * Destinations in [0x70000000, 0x80000000) are rejected with an error log; the call
+ * still returns 0 in that case. Returns -1 only for a null device handle.
+ */
+extern int vt_copy_to_dev(
+    vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    const bool targets_lds = !(dev_vaddr < 0x70000000 || dev_vaddr >= 0x80000000);
+    if (targets_lds) {
+        SPDLOG_LOGGER_ERROR(
+            logger, "vt_copy_to_dev: dev_vaddr={:x} in LDS space, not supportted", dev_vaddr
+        );
+        return 0;
+    }
+
+    SPDLOG_LOGGER_DEBUG(
+        logger, "vt_copy_to_dev: dev_vaddr={:x}, size=0x{:x}, taskID={}, kernelID={}", dev_vaddr,
+        size, taskID, kernelID
+    );
+    auto *sim = static_cast<ventus_cyclesim_t *>(hdevice);
+    ventus_cyclesim_vmemcpy_h2d(sim, g_ptroots[taskID], dev_vaddr, src_addr, size);
+    return 0;
+}
+
+/**
+ * Copy `size` bytes from device virtual address `dev_vaddr` (address space of `taskID`)
+ * into host memory at `dst_addr`. Returns -1 for a null device handle, 0 otherwise.
+ */
+extern int vt_copy_from_dev(
+    vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    SPDLOG_LOGGER_DEBUG(
+        logger, "vt_copy_from_dev: dev_vaddr={:x}, size=0x{:x}, taskID={}, kernelID={}", dev_vaddr,
+        size, taskID, kernelID
+    );
+    auto *sim = static_cast<ventus_cyclesim_t *>(hdevice);
+    ventus_cyclesim_vmemcpy_d2h(sim, g_ptroots[taskID], dst_addr, dev_vaddr, size);
+    return 0;
+}
+
+/**
+ * Queue one kernel for execution on the cycle simulator.
+ *
+ * @param hdevice device handle; must not be null
+ * @param mtd_raw points to a driver_metadata_t describing the kernel; must not be null
+ * @param taskID  selects the page-table root (g_ptroots) the kernel runs with
+ * @return 0 once the kernel has been handed to the simulator, -1 on a null argument
+ */
+extern int vt_start(vt_device_h hdevice, void *mtd_raw, uint64_t taskID) {
+    // mtd_raw is dereferenced for every field below, so reject null up front.
+    if (hdevice == nullptr || mtd_raw == nullptr) return -1;
+    auto device = static_cast<ventus_cyclesim_t *>(hdevice);
+    auto mtd_driver = static_cast<driver_metadata_t *>(mtd_raw);
+    // The simulator-side kernel id is a launch counter, not the id supplied by the caller.
+    static uint32_t kernel_cnt = 0;
+    ventus_kernel_metadata_t mtd_sim{
+        .name = mtd_driver->kernel_name,
+        // .kernel_id = mtd_driver->kernel_id,
+        .kernel_id = kernel_cnt++,
+        .data = nullptr,
+        .startaddr = 0x80000000,
+        .kernel_size =
+            {mtd_driver->kernel_size[0], mtd_driver->kernel_size[1], mtd_driver->kernel_size[2]},
+        .wf_size = mtd_driver->wf_size,
+        .wg_size = mtd_driver->wg_size,
+        .metaDataBaseAddr = mtd_driver->metaDataBaseAddr,
+        .ldsSize = mtd_driver->ldsSize,
+        .pdsSize = mtd_driver->pdsSize,
+        .sgprUsage = mtd_driver->sgprUsage,
+        .vgprUsage = mtd_driver->vgprUsage,
+        .pdsBaseAddr = mtd_driver->pdsBaseAddr,
+        .num_buffer = 0,
+        .buffer_base = nullptr,
+        .buffer_size = nullptr,
+        .buffer_allocsize = nullptr,
+        .pagetable = g_ptroots[taskID],
+    };
+    ventus_cyclesim_add_kernel(device, &mtd_sim, nullptr);
+    SPDLOG_LOGGER_DEBUG(
+        logger,
+        "vt_start: taskID={}, kernelID={}, kernel_size=({},{},{}), "
+        "wgsize={}, wfsize={}, pds_size=0x{:x}, lds_size=0x{:x}, addr_meta=0x{:x}, addr_pds=0x{:x}",
+        taskID, mtd_sim.kernel_id, mtd_sim.kernel_size[0], mtd_sim.kernel_size[1],
+        mtd_sim.kernel_size[2], mtd_sim.wg_size, mtd_sim.wf_size, mtd_sim.pdsSize, mtd_sim.ldsSize,
+        mtd_sim.metaDataBaseAddr, mtd_sim.pdsBaseAddr
+    );
+    return 0;
+}
+
+/**
+ * Step the simulator until it reports idle or its simulation time reaches the limit.
+ *
+ * The limit is `timeout` scaled by 1,000,000 and compared against the simulator's own
+ * time counter (NOTE(review): this looks like an absolute simulation time rather than
+ * time elapsed since this call - confirm).
+ *
+ * @param hdevice device handle; must not be null
+ * @param timeout time limit before scaling
+ * @return -1 for a null handle, otherwise 0 (also when the limit was hit)
+ */
+extern int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_cyclesim_t *>(hdevice);
+    // Saturate instead of wrapping: a huge "wait forever" timeout must not become a tiny limit.
+    constexpr uint64_t scale = 1000000;
+    const uint64_t timeout_ns = (timeout > UINT64_MAX / scale) ? UINT64_MAX : timeout * scale;
+    while (!ventus_cyclesim_is_idle(device) && ventus_cyclesim_get_time(device) < timeout_ns) {
+        ventus_cyclesim_step(device);
+    }
+    return 0;
+}
+
+extern int vt_finish_all_kernel(vt_device_h hdevice, std::queue<int> *finished_kernel_list) {
+    // TODO: purpose of this call and of finished_kernel_list is unclear; stub always returns -1
+    return -1;
+}
+
+/**
+ * Load the loadable blocks of an ELF file into device memory of `taskID`.
+ *
+ * Regions allocated by the previous upload (tracked in g_elf_alloc) are freed first. Each
+ * block is then allocated at exactly its ELF virtual address, filled with the block's
+ * data and zero-padded up to its memory size. If any block cannot be placed at its
+ * address, everything allocated by this call is released again.
+ *
+ * NOTE(review): g_elf_alloc is shared by all taskIDs but freed with the current task's
+ * page-table root - confirm that uploads never alternate between tasks.
+ *
+ * @param hdevice  device handle; must not be null
+ * @param filename path of the ELF file to parse
+ * @param taskID   selects the page-table root in g_ptroots
+ * @return 0 on success, -1 on a null handle, when no block was found, or on allocation failure
+ */
+extern int vt_upload_kernel_file(vt_device_h hdevice, const char *filename, int taskID) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_cyclesim_t *>(hdevice);
+    uint64_t ptroot = g_ptroots[taskID];
+
+    // parse ELF file, find .text and other data sections
+    const auto blocks = get_data_from_elf(filename, logger);
+    if (blocks.empty()) {
+        return -1; // at least .text section is needed
+    }
+
+    // Frees every region recorded in g_elf_alloc and forgets it, so that no region can
+    // be freed a second time by a later upload.
+    const auto release_elf_allocs = [&]() {
+        for (const auto &region : g_elf_alloc) {
+            ventus_cyclesim_vmem_free(device, ptroot, region.first, region.second);
+        }
+        g_elf_alloc.clear();
+    };
+
+    // free previous ELF allocations
+    release_elf_allocs();
+
+    // alloc and load/zero-fill each block
+    for (const auto &block : blocks) {
+        const uint64_t vaddr = block.vaddr;
+        const uint64_t size = block.memsz;
+        const uint64_t vaddr_allocated = ventus_cyclesim_vmem_alloc(device, ptroot, vaddr, size);
+        if (vaddr_allocated != vaddr) {
+            // Address 0 means nothing was allocated, so there is nothing to give back.
+            if (vaddr_allocated != 0) {
+                ventus_cyclesim_vmem_free(device, ptroot, vaddr_allocated, size);
+            }
+            release_elf_allocs(); // roll back the blocks placed earlier in this call
+            return -1;
+        }
+        g_elf_alloc.push_back(std::make_pair(vaddr, size));
+        SPDLOG_LOGGER_DEBUG(
+            logger, "vt_upload_kernel_file {}: vaddr={:x}, size=0x{:x}", filename, vaddr, size
+        );
+        // Never copy past the allocation, and keep the zero-fill length from underflowing
+        // should a block carry more file data than its memory size.
+        const uint64_t filesz = (block.data.size() < size) ? block.data.size() : size;
+        ventus_cyclesim_vmemcpy_h2d(device, ptroot, vaddr, block.data.data(), filesz);
+        std::vector<uint8_t> zeros(size - filesz, 0);
+        ventus_cyclesim_vmemcpy_h2d(device, ptroot, vaddr + filesz, zeros.data(), zeros.size());
+    }
+
+    return 0;
+}
+int vt_upload_kernel_bytes(vt_device_h device, const void *content, uint64_t size, int taskID) {
+    return 0; // stub: nothing is uploaded, success is reported unconditionally
+}
+int vt_dump_perf(vt_device_h device, FILE *stream) { return 0; } // stub: writes nothing
diff --git a/driver/gvm_device/.clang-format b/driver/gvm_device/.clang-format
new file mode 100644
index 0000000..44a6b67
--- /dev/null
+++ b/driver/gvm_device/.clang-format
@@ -0,0 +1,9 @@
+BasedOnStyle: LLVM          # base style: LLVM
+IndentWidth: 4              # indent width: 4 spaces
+AccessModifierOffset: -4    # offset of access modifiers: -4
+ColumnLimit: 100            # maximum number of columns per line
+AlwaysBreakAfterDefinitionReturnType: None
+PenaltyReturnTypeOnItsOwnLine: 1000000
+AlignAfterOpenBracket: BlockIndent 
+AllowShortIfStatementsOnASingleLine: true
+
diff --git a/driver/gvm_device/CMakeLists.txt b/driver/gvm_device/CMakeLists.txt
new file mode 100644
index 0000000..1854f43
--- /dev/null
+++ b/driver/gvm_device/CMakeLists.txt
@@ -0,0 +1,26 @@
+set(PROJECT gvm_driver)
+project(${PROJECT})
+
+# Shared library implementing the ventus.h driver API on top of the GVM library
+# (VentusGVM, looked up under ${VENTUS_INSTALL_PREFIX}).
+# The single source file is listed explicitly: a recursive glob would also match any
+# ventus.cpp in nested directories and is not re-evaluated when files are added.
+add_library(${PROJECT} SHARED ventus.cpp)
+
+# Target-scoped include paths, so they cannot leak into other targets.
+target_include_directories(${PROJECT} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../common
+    ${VENTUS_INSTALL_PREFIX}/include
+)
+target_link_directories(${PROJECT} PRIVATE ${VENTUS_INSTALL_PREFIX}/lib)
+target_link_libraries(${PROJECT} PRIVATE
+    VentusGVM
+    driver_common_utils
+    spdlog
+    fmt
+)
+
+set_target_properties(${PROJECT} PROPERTIES
+    OUTPUT_NAME "${PROJECT}"
+    CLEAN_DIRECT_OUTPUT 1
+    BUILD_RPATH "${VENTUS_INSTALL_PREFIX}/lib"
+    INSTALL_RPATH "$ORIGIN"
+)
+
+install(TARGETS ${PROJECT}
+    LIBRARY DESTINATION lib COMPONENT "lib"
+)
diff --git a/driver/gvm_device/ventus.cpp b/driver/gvm_device/ventus.cpp
new file mode 100644
index 0000000..4811e9d
--- /dev/null
+++ b/driver/gvm_device/ventus.cpp
@@ -0,0 +1,313 @@
+/**
+ * @file ventus.cpp
+ * @brief 设备和OpenCL程序的交互功能的实现
+ *
+ * 1. `/include/ventus.h`中声明的函数
+ */
+
+#include <utils.hpp>
+#include "ventus.h"
+#include "loadelf.hpp"
+#include "ventus_gvm.h"
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+typedef struct driver_metadata_t { // layout vt_start() assumes for its mtd_raw argument
+    uint64_t kernel_id;
+    uint64_t kernel_size[3];   ///< number of workgroups in each of the kernel's three dimensions
+    uint64_t wf_size;          ///< number of threads per warp
+    uint64_t wg_size;          ///< number of warps per workgroup
+    uint64_t metaDataBaseAddr; ///< value of CSR_KNL
+    uint64_t ldsSize;          ///< size of the local memory used by each workgroup
+    uint64_t pdsSize;          ///< size of the private memory used by each thread
+    uint64_t sgprUsage;        ///< number of scalar registers used by each workgroup
+    uint64_t vgprUsage;        ///< number of vector registers used by each thread
+    uint64_t pdsBaseAddr; ///< base address of private memory; to be turned into a per-workgroup
+                          /// base address (wf_size*wg_size*pdsSize)
+} driver_metadata_t;
+
+// static std::map<int, uint64_t> ptroots; // pagetable root physical address (disabled: no VMEM)
+static std::shared_ptr<spdlog::logger> logger; // assigned in vt_dev_open()
+static uint64_t alloc_vaddr = 0x90000000; // NOTE(review): no use visible in this file - confirm
+
+/**
+ * Open the GVM device and return the simulator handle through `hdevice`.
+ *
+ * Calls fw_vt_dev_open() first, then creates the RTL simulator. Waveform output is
+ * controlled by the environment: VENTUS_WAVEFORM selects the whole run, and
+ * VENTUS_WAVEFORM_BEGIN / VENTUS_WAVEFORM_END override either bound of the window.
+ * Waveform output is enabled only when the resulting end is greater than the begin.
+ *
+ * @param hdevice receives the device handle; must not be null
+ * @return 0 on success, -1 if `hdevice` is null or the simulator could not be created
+ */
+extern int vt_dev_open(vt_device_h *hdevice) {
+    if (hdevice == nullptr) return -1;
+    fw_vt_dev_open();
+
+    auto env_waveform = std::getenv("VENTUS_WAVEFORM");
+    auto env_waveform_begin = std::getenv("VENTUS_WAVEFORM_BEGIN");
+    auto env_waveform_end = std::getenv("VENTUS_WAVEFORM_END");
+    uint64_t waveform_begin = UINT64_MAX; // default: not enable
+    uint64_t waveform_end = 0;
+    if (parse_bool(env_waveform).value_or(false)) {
+        waveform_begin = 0; // default: dump waveform all time
+        waveform_end = UINT64_MAX;
+    }
+    waveform_begin = parse_u64(env_waveform_begin).value_or(waveform_begin);
+    waveform_end = parse_u64(env_waveform_end).value_or(waveform_end);
+    const bool waveform_enable = waveform_end > waveform_begin;
+
+    ventus_rtlsim_config_t config;
+    ventus_rtlsim_get_default_config(&config);
+    config.sim_time_max = ~0ull;
+    config.pmem.auto_alloc = true;
+    config.waveform.enable = waveform_enable;
+    config.waveform.time_begin = waveform_begin;
+    config.waveform.time_end = waveform_end;
+    config.waveform.filename = "waveform.gvm.fst";
+    config.snapshot.enable = false;
+    config.log.console.enable = true;
+    config.log.console.level = "trace";
+    config.log.file.enable = false;
+    auto device = ventus_rtlsim_init(&config);
+    if (device == nullptr) return -1;
+    *hdevice = device;
+    // stdout_color_mt() throws when the name is already registered, which would happen
+    // on a second vt_dev_open(); reuse the existing logger in that case.
+    logger = spdlog::get("ventus");
+    if (!logger) logger = spdlog::stdout_color_mt("ventus");
+    logger->set_level(spdlog::level::trace);
+    logger->debug("vt_dev_open : hello world from ventus.cpp (gvm device)");
+    return 0;
+}
+
+/// Close the GVM side via fw_vt_dev_close(), then finish the RTL simulation behind `hdevice`.
+/// Returns -1 for a null handle, 0 otherwise.
+extern int vt_dev_close(vt_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    fw_vt_dev_close();
+    ventus_rtlsim_t *sim = static_cast<ventus_rtlsim_t *>(hdevice);
+    ventus_rtlsim_finish(sim, false);
+    logger->debug("vt_dev_close : goodbye from ventus.cpp (gvm device)");
+    return 0;
+}
+/**
+ * Query a device capability by reading the matching RTL parameter.
+ *
+ * @param hdevice unused
+ * @param caps_id one of VT_CAPS_MAX_CORES, VT_CAPS_MAX_WARPS, VT_CAPS_MAX_THREADS,
+ *                VT_CAPS_LOCAL_MEM_SIZE
+ * @param value   receives the parameter value; must not be null
+ * @return 0 on success, -1 if `value` is null, `caps_id` is not handled, or the
+ *         parameter lookup fails
+ */
+int vt_dev_caps(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value) {
+    if (value == nullptr) return -1;
+    // Local helper instead of a function-like macro, so nothing leaks past this function.
+    const auto get_param = [value](const char *key) -> int {
+        uint32_t val = 0;
+        if (ventus_rtlsim_get_parameter(key, &val) != 0) {
+            SPDLOG_LOGGER_ERROR(logger, "vt_dev_caps: get parameter {} failed", key);
+            return -1;
+        }
+        *value = val;
+        return 0;
+    };
+    switch (caps_id) {
+    case VT_CAPS_MAX_CORES:
+        return get_param("num_sm");
+    case VT_CAPS_MAX_WARPS:
+        return get_param("num_warp");
+    case VT_CAPS_MAX_THREADS:
+        return get_param("num_thread");
+    case VT_CAPS_LOCAL_MEM_SIZE:
+        return get_param("sharemem_size");
+    default:
+        SPDLOG_LOGGER_ERROR(
+            logger, "vt_dev_caps: unknown caps_id {} (or not implemented)", caps_id
+        );
+        return -1;
+    }
+}
+
+/**
+ * Allocate a device buffer by forwarding the request to fw_vt_buf_alloc().
+ *
+ * @param hdevice  device handle; must not be null
+ * @param size     number of bytes to allocate; must be non-zero
+ * @param vaddr    receives the allocated address (0 on failure); must not be null
+ * @param BUF_TYPE forwarded unchanged
+ * @param taskID   forwarded unchanged
+ * @param kernelID forwarded unchanged
+ * @return 0 on success, -1 on invalid arguments or when the resulting address is 0
+ */
+extern int vt_buf_alloc(
+    vt_device_h hdevice, const uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID,
+    uint64_t kernelID
+) {
+    // TODO: RTLSIM does not support Virtual Memory yet
+    if (size == 0 || hdevice == nullptr || vaddr == nullptr) return -1;
+
+    // A stack variable cannot leak on the failure path. It starts at 0 so that a callee
+    // which writes nothing is reported as a failure instead of yielding garbage.
+    uint64_t fw_vaddr = 0;
+    fw_vt_buf_alloc(size, &fw_vaddr, BUF_TYPE, taskID, kernelID);
+
+    *vaddr = fw_vaddr;
+    return (fw_vaddr == 0) ? -1 : 0;
+}
+
+/**
+ * Free a device buffer by forwarding the request to fw_vt_buf_free().
+ *
+ * @param hdevice  unused by this device
+ * @param size     forwarded unchanged
+ * @param vaddr    points to the device address to free; must not be null
+ * @param taskID   forwarded unchanged
+ * @param kernelID forwarded unchanged
+ * @return 0 after the request was forwarded, -1 if `vaddr` is null
+ */
+extern int vt_buf_free(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+) {
+    if (vaddr == nullptr) return -1;
+
+    // Pass a private copy so the callee cannot modify the caller's value.
+    uint64_t fw_vaddr = *vaddr;
+    fw_vt_buf_free(size, &fw_vaddr, taskID, kernelID);
+
+    return 0;
+}
+
+/**
+ * Free a single device buffer by forwarding the request to fw_vt_one_buf_free().
+ *
+ * @param hdevice  unused by this device
+ * @param size     forwarded unchanged
+ * @param vaddr    points to the device address to free; must not be null
+ * @param taskID   forwarded unchanged
+ * @param kernelID forwarded unchanged
+ * @return 0 after the request was forwarded, -1 if `vaddr` is null
+ */
+extern int vt_one_buf_free(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+) {
+    if (vaddr == nullptr) return -1;
+
+    // Pass a private copy so the callee cannot modify the caller's value.
+    uint64_t fw_vaddr = *vaddr;
+    fw_vt_one_buf_free(size, &fw_vaddr, taskID, kernelID);
+
+    return 0;
+}
+
+/**
+ * @brief  Placeholder for creating a root page table: this device has no virtual
+ *         memory support, so nothing is allocated.
+ * @param  hdevice unused
+ * @param  taskID  only task 0 is expected; any other value is reported in the log
+ * @return always 0
+ */
+extern int vt_root_mem_alloc(vt_device_h hdevice, int taskID) {
+    // Complain only about tasks other than 0; task 0 is the supported case.
+    if (taskID != 0) {
+        logger->error("RTLSIM_device does not support VMEM yet, taskID must be 0");
+    }
+    return 0;
+}
+
+/**
+ * Release the root page table of taskID (one taskID corresponds to one context).
+ * @param hdevice device handle (unused while the body below is disabled)
+ * @param taskID  task whose page table would be released (unused for the same reason)
+ * @return always 0
+ */
+extern int vt_root_mem_free(vt_device_h hdevice, int taskID) {
+    // if (hdevice == nullptr) return -1;
+    // auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    // ventus_rtlsim_vmem_destroy(device, ptroots[taskID]);
+    // ptroots.erase(taskID);
+    return 0;
+}
+
+/**
+ * Copy `size` bytes from host memory at `src_addr` to device address `dev_vaddr`.
+ * The copy is first forwarded to fw_vt_copy_to_dev() and then written into the RTL
+ * simulator's physical memory. Returns -1 for a null device handle, 0 otherwise.
+ */
+extern int vt_copy_to_dev(
+    vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    fw_vt_copy_to_dev(dev_vaddr, src_addr, size, taskID, kernelID);
+    logger->debug(
+        "vt_copy_to_dev: dev_addr={:x}, size={}, taskID={}, kernelID={}", dev_vaddr, size, taskID,
+        kernelID
+    );
+    ventus_rtlsim_t *sim = static_cast<ventus_rtlsim_t *>(hdevice);
+    ventus_rtlsim_pmemcpy_h2d(sim, dev_vaddr, src_addr, size);
+    return 0;
+}
+
+/**
+ * Copy `size` bytes from device address `dev_vaddr` in the RTL simulator's physical
+ * memory into host memory at `dst_addr`.
+ *
+ * @return -1 for a null device handle, 0 otherwise
+ */
+extern int vt_copy_from_dev(
+    vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    // Log through the logger object like the other functions in this file: the
+    // SPDLOG_LOGGER_DEBUG macro depends on the compile-time SPDLOG_ACTIVE_LEVEL.
+    logger->debug(
+        "vt_copy_from_dev: dev_addr={:x}, size={}, taskID={}, kernelID={}", dev_vaddr, size, taskID,
+        kernelID
+    );
+    ventus_rtlsim_pmemcpy_d2h(device, dst_addr, dev_vaddr, size);
+    return 0;
+}
+
+/**
+ * Queue one kernel for execution: the raw metadata goes to fw_vt_start() first, then a
+ * simulator-side copy of it is added to the RTL simulator.
+ *
+ * @param hdevice device handle; must not be null
+ * @param mtd_raw points to a driver_metadata_t describing the kernel; must not be null
+ * @param taskID  forwarded to fw_vt_start()
+ * @return 0 once the kernel has been queued, -1 on a null argument
+ */
+extern int vt_start(vt_device_h hdevice, void *mtd_raw, uint64_t taskID) {
+    // mtd_raw is dereferenced for every field below, so reject null up front.
+    if (hdevice == nullptr || mtd_raw == nullptr) return -1;
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    auto mtd_driver = static_cast<driver_metadata_t *>(mtd_raw);
+    ventus_kernel_metadata_t mtd_sim{
+        .name = "UnknownKernelName",
+        .data = nullptr,
+        .startaddr = 0x80000000,
+        .kernel_id = mtd_driver->kernel_id,
+        .kernel_size =
+            {mtd_driver->kernel_size[0], mtd_driver->kernel_size[1], mtd_driver->kernel_size[2]},
+        .wf_size = mtd_driver->wf_size,
+        .wg_size = mtd_driver->wg_size,
+        .metaDataBaseAddr = mtd_driver->metaDataBaseAddr,
+        .ldsSize = mtd_driver->ldsSize,
+        .pdsSize = mtd_driver->pdsSize,
+        .sgprUsage = mtd_driver->sgprUsage,
+        .vgprUsage = mtd_driver->vgprUsage,
+        .pdsBaseAddr = mtd_driver->pdsBaseAddr,
+        .num_buffer = 0,
+        .buffer_base = nullptr,
+        .buffer_size = nullptr,
+        .buffer_allocsize = nullptr,
+    };
+    logger->debug(
+        "kernel metadata: kernel_id={}, kernel_size=[{}, {}, {}], wf_size={}, wg_size={}, "
+        "metaDataBaseAddr={:x}, ldsSize={}, pdsSize={}, sgprUsage={}, vgprUsage={}, "
+        "pdsBaseAddr={:x}",
+        mtd_driver->kernel_id, mtd_driver->kernel_size[0], mtd_driver->kernel_size[1],
+        mtd_driver->kernel_size[2], mtd_driver->wf_size, mtd_driver->wg_size,
+        mtd_driver->metaDataBaseAddr, mtd_driver->ldsSize, mtd_driver->pdsSize,
+        mtd_driver->sgprUsage, mtd_driver->vgprUsage, mtd_driver->pdsBaseAddr
+    );
+    fw_vt_start(mtd_raw, taskID); // initialise spike first, then run sim-verilator
+    ventus_rtlsim_add_kernel(device, &mtd_sim, nullptr);
+    return 0;
+}
+
+/**
+ * Step the RTL simulator until it reports idle or its simulation time reaches the
+ * limit, then run 5000 extra steps.
+ *
+ * The limit is `timeout` scaled by 1,000,000 and compared against the simulator's own
+ * time counter (NOTE(review): this looks like an absolute simulation time rather than
+ * time elapsed since this call - confirm).
+ *
+ * @param hdevice device handle; must not be null
+ * @param timeout time limit before scaling
+ * @return -1 for a null handle, otherwise 0 (also when the limit was hit)
+ */
+extern int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    // Saturate instead of wrapping: a huge "wait forever" timeout must not become a tiny limit.
+    constexpr uint64_t scale = 1000000;
+    const uint64_t timeout_ns = (timeout > UINT64_MAX / scale) ? UINT64_MAX : timeout * scale;
+    while (!ventus_rtlsim_is_idle(device) && ventus_rtlsim_get_time(device) < timeout_ns) {
+        ventus_rtlsim_step(device);
+    }
+    for (int i = 0; i < 5000; i++) {
+        // TODO: RTL does not provide a way to check if L2 cache flush is done
+        ventus_rtlsim_step(device);
+    }
+    // TODO: temp
+    // it seems that vt_dev_close() is not called by POCL
+    // we call it here to make waveform output successful
+    // vt_dev_close(hdevice);
+    return 0;
+}
+
+extern int vt_finish_all_kernel(vt_device_h hdevice, std::queue<int> *finished_kernel_list) {
+    // TODO: purpose of this call and of finished_kernel_list is unclear; stub always returns -1
+    return -1;
+}
+
+/**
+ * Load the loadable blocks of an ELF file into the RTL simulator's physical memory.
+ *
+ * The file name is first forwarded to fw_vt_upload_kernel_file(). Each block is then
+ * written at its ELF address and zero-padded up to its memory size, and the device
+ * instruction cache is invalidated afterwards.
+ *
+ * @param hdevice  device handle; must not be null
+ * @param filename path of the ELF file to parse
+ * @param taskID   forwarded to fw_vt_upload_kernel_file()
+ * @return 0 on success, -1 on a null handle or when no block was found in the file
+ */
+extern int vt_upload_kernel_file(vt_device_h hdevice, const char *filename, int taskID) {
+    if (hdevice == nullptr) return -1;
+    fw_vt_upload_kernel_file(filename, taskID);
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+
+    // parse ELF file, find .text and other data sections
+    const auto blocks = get_data_from_elf(filename, logger);
+    if (blocks.empty()) {
+        return -1; // at least .text section is needed
+    }
+
+    // load/zero-fill each block
+    for (const auto &block : blocks) {
+        const uint64_t vaddr = block.vaddr;
+        const uint64_t size = block.memsz;
+        logger->debug("vt_upload_kernel_file: addr={:x}, size={}", vaddr, size);
+        // Never write past the block's memory size, and keep the zero-fill length from
+        // underflowing should a block carry more file data than its memory size.
+        const uint64_t filesz = (block.data.size() < size) ? block.data.size() : size;
+        ventus_rtlsim_pmemcpy_h2d(device, vaddr, block.data.data(), filesz);
+        std::vector<uint8_t> zeros(size - filesz, 0);
+        ventus_rtlsim_pmemcpy_h2d(device, vaddr + filesz, zeros.data(), zeros.size());
+    }
+    ventus_rtlsim_icache_invalidate(device);
+    return 0;
+}
+int vt_upload_kernel_bytes(vt_device_h device, const void *content, uint64_t size, int taskID) {
+    return 0; // stub: nothing is uploaded, success is reported unconditionally
+}
+int vt_dump_perf(vt_device_h device, FILE *stream) { return 0; } // stub: writes nothing
diff --git a/driver/gvm_device/ventus_gvm.h b/driver/gvm_device/ventus_gvm.h
new file mode 100644
index 0000000..395fc87
--- /dev/null
+++ b/driver/gvm_device/ventus_gvm.h
@@ -0,0 +1,184 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if __GNUC__ >= 4
+#define DLL_PUBLIC __attribute__((visibility("default")))
+#define DLL_LOCAL __attribute__((visibility("hidden")))
+#else
+#define DLL_PUBLIC
+#define DLL_LOCAL
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+typedef struct ventus_rtlsim_t ventus_rtlsim_t;
+typedef uint64_t paddr_t;
+
+typedef struct ventus_kernel_metadata_t { // this metadata is for the driver, not for the hardware
+    // Additional data
+    const char* name; // kernel name
+    void* data;       // use this as you like, such as callback function argument
+
+    // Raw metadata
+    uint64_t startaddr;
+    uint64_t kernel_id;        // Is this useful??? Maybe this should be moved to additional data
+    uint64_t kernel_size[3];   // number of workgroups in each of the kernel's three dimensions
+    uint64_t wf_size;          // number of threads per warp
+    uint64_t wg_size;          // number of warps per workgroup
+    uint64_t metaDataBaseAddr; // value of CSR_KNL
+    uint64_t ldsSize;          // size of the local memory used by each workgroup
+    uint64_t pdsSize;          // size of the private memory used by each thread
+    uint64_t sgprUsage;        // number of scalar registers used by each wavefront (warp)
+    uint64_t vgprUsage;        // number of vector registers used by each wavefront (warp) (also thread)
+    uint64_t pdsBaseAddr;  // private memory base; to be turned into a per-workgroup base, wf_size*wg_size*pdsSize
+    uint64_t num_buffer;   // number of buffers, including pc
+    uint64_t* buffer_base; // base address of each buffer; the first buffer is the metadata for the hardware
+    uint64_t* buffer_size; // size of each buffer in bytes: the size actually used, for initialising .data
+    uint64_t* buffer_allocsize; // size of each buffer in bytes: the allocated size
+} ventus_kernel_metadata_t;
+
+typedef struct {
+    uint64_t sim_time_max; // upper limit on simulation time
+    struct {               // These log sinks can be enabled simultaneously
+        struct {           // Write log to a file (append to its tail)
+            bool enable;
+            const char* level; // "trace", "debug", "info", "warn", "error", "critical"
+            const char* filename;
+        } file;
+        struct { // console log
+            bool enable;
+            const char* level;
+        } console;
+        const char* level;
+    } log;
+    struct {
+        uint64_t pagesize; // physical memory page size
+        uint64_t auto_alloc; // allocate a physical page automatically when an unallocated one is accessed (matches real hardware memory)
+        // note: automatically allocated physical memory is never freed until the whole simulation ends
+    } pmem;
+    struct { // waveform output; applies to the normal simulation flow only, not to output after a snapshot rollback
+        bool enable;         // enabled? switched on automatically after a snapshot rollback
+        uint64_t time_begin; // time at which waveform output starts
+        uint64_t time_end;   // time at which waveform output ends; output only happens when end > begin
+        int levels;          // hierarchy depth of the waveform output
+        const char* filename;
+    } waveform;
+    struct { // simulation snapshots: on error the simulation can roll back to the oldest snapshot and re-run with waveform recording
+        bool enable;
+        uint64_t time_interval; // time between snapshots
+        int num_max;            // maximum number of snapshots; beyond it a new snapshot replaces the oldest
+        const char* filename;   // name of the FST waveform file written for snapshots
+    } snapshot;
+    struct {               // verilator runtime command-line arguments, passed as argc/argv
+        int argc;          // note that argc may be 0
+        const char** argv; // argc strings in total; [0] is the first verilator argument, not the program name
+    } verilator;
+} ventus_rtlsim_config_t;
+
+typedef struct {
+    bool error;       // Simulation hit a fatal error, or RTL called $finish()
+    bool time_exceed; // Simulation time exceeded the limit
+    bool idle;        // All given kernels have finished
+} ventus_rtlsim_step_result_t;
+
+// =
+// API functions:
+// =
+
+//
+// Helper functions
+//
+
+// Give you a recommended default config.
+DLL_PUBLIC void ventus_rtlsim_get_default_config(ventus_rtlsim_config_t* config);
+// Get current simulation time.
+DLL_PUBLIC uint64_t ventus_rtlsim_get_time(const ventus_rtlsim_t* sim);
+// Check if the simulated GPU is idle (no kernel is running).
+DLL_PUBLIC bool ventus_rtlsim_is_idle(const ventus_rtlsim_t* sim);
+// Get RTL parameters (output from *out_value, return 0 on success)
+DLL_PUBLIC int ventus_rtlsim_get_parameter(const char* name, uint32_t* out_value);
+
+//
+// Init, calculate, and finish
+//
+
+// Init the simulation.
+DLL_PUBLIC ventus_rtlsim_t* ventus_rtlsim_init(const ventus_rtlsim_config_t* config);
+
+// Finish the simulation.
+// If error occurred in the simulation, and snapshot feature enabled,
+//   it will rollback to the oldest snapshot to find out what happened.
+// You can force the rollback by passing `snapshot_rollback_forcing = true`
+DLL_PUBLIC void ventus_rtlsim_finish(ventus_rtlsim_t* sim, bool snapshot_rollback_forcing);
+
+// Calculate 1 unit-time of simulation.
+// Return the result of this step: ok, error, time_exceed, or idle.
+// If error occurred, calling this function has no effect, you should consider finish the simulation.
+DLL_PUBLIC const ventus_rtlsim_step_result_t* ventus_rtlsim_step(ventus_rtlsim_t* sim);
+
+// Host request GPGPU device to invalidate its Icache
+// (for example, after loading new kernel code to device memory)
+// This will take effect in the next simulation step()
+DLL_PUBLIC void ventus_rtlsim_icache_invalidate(ventus_rtlsim_t* sim);
+
+//
+// Push new kernels to gpu for execution.
+//
+
+// After a kernel finishing its execution, the finish_callback will be called, with metadata passed,
+//   aka. `finish_callback(metadata)` will be called.
+
+// It's allowed to delay data-loading until the kernel is actually activated on GPU,
+// by using data_load_callback
+// **Temporary api**, May be removed in the future
+DLL_PUBLIC void ventus_rtlsim_add_kernel__delay_data_loading(
+    ventus_rtlsim_t* sim, const ventus_kernel_metadata_t* metadata,
+    void (*load_data_callback)(const ventus_kernel_metadata_t*),
+    void (*finish_callback)(const ventus_kernel_metadata_t*)
+);
+
+// It's recommended to use this ↓. Remember to load data to GPU before calling this.
+DLL_PUBLIC void ventus_rtlsim_add_kernel(
+    ventus_rtlsim_t* sim, const ventus_kernel_metadata_t* metadata,
+    void (*finish_callback)(const ventus_kernel_metadata_t*)
+);
+
+//
+// Physical memory interface
+//
+
+// Physical page alloc & free
+// These functions are not needed by actual hardware memory, only for reducing simulation memory usage.
+// If config.pmem.auto_alloc is set, you don't need to call these functions.
+DLL_PUBLIC bool ventus_rtlsim_pmem_page_alloc(ventus_rtlsim_t* sim, paddr_t base);
+DLL_PUBLIC bool ventus_rtlsim_pmem_page_free(ventus_rtlsim_t* sim, paddr_t base);
+
+// Physical memory read & write
+// copy data from host to device
+DLL_PUBLIC bool ventus_rtlsim_pmemcpy_h2d(ventus_rtlsim_t* sim, paddr_t dst, const void* src, uint64_t size);
+// copy data from device to host
+DLL_PUBLIC bool ventus_rtlsim_pmemcpy_d2h(ventus_rtlsim_t* sim, void* dst, paddr_t src, uint64_t size);
+
+// GVM API begin: fw_* entry points mirror the vt_* driver calls, without the device handle
+
+DLL_PUBLIC int fw_vt_dev_open();
+DLL_PUBLIC int fw_vt_dev_close();
+DLL_PUBLIC int fw_vt_buf_alloc(uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID);
+DLL_PUBLIC int fw_vt_buf_free(uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID);
+DLL_PUBLIC int fw_vt_one_buf_free(uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID);
+DLL_PUBLIC int fw_vt_copy_to_dev(uint64_t dev_vaddr,const void *src_addr, uint64_t size, uint64_t taskID, uint64_t kernelID);
+DLL_PUBLIC int fw_vt_start(void* metaData, uint64_t taskID);
+DLL_PUBLIC int fw_vt_upload_kernel_file(const char* filename, int taskID);
+
+// GVM API end
+
+#undef DLL_PUBLIC
+#undef DLL_LOCAL
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/driver/rtlsim_device/CMakeLists.txt b/driver/rtlsim_device/CMakeLists.txt
new file mode 100644
index 0000000..0e2eaf4
--- /dev/null
+++ b/driver/rtlsim_device/CMakeLists.txt
@@ -0,0 +1,28 @@
+set(PROJECT rtlsim_driver)
+project(${PROJECT})
+
+# Explicit source list: a glob would not notice added/removed files until CMake is re-run.
+add_library(${PROJECT} SHARED ventus.cpp)
+target_include_directories(${PROJECT} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../common
+    ${VENTUS_INSTALL_PREFIX}/include
+)
+target_link_directories(${PROJECT} PRIVATE ${VENTUS_INSTALL_PREFIX}/lib)
+target_link_libraries(${PROJECT} PRIVATE VentusRTL)
+target_link_libraries(${PROJECT} PRIVATE driver_common_utils)
+target_link_libraries(${PROJECT} PRIVATE spdlog)
+target_link_libraries(${PROJECT} PRIVATE fmt)
+target_link_libraries(${PROJECT} PRIVATE SV)
+target_compile_definitions(${PROJECT} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
+
+set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}")
+set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+set_target_properties(${PROJECT} PROPERTIES
+    BUILD_RPATH "${VENTUS_INSTALL_PREFIX}/lib"
+    INSTALL_RPATH "$ORIGIN"
+)
+
+install(TARGETS ${PROJECT}
+    LIBRARY DESTINATION lib COMPONENT "lib"
+)
diff --git a/driver/rtlsim_device/ventus.cpp b/driver/rtlsim_device/ventus.cpp
new file mode 100644
index 0000000..ce3c1df
--- /dev/null
+++ b/driver/rtlsim_device/ventus.cpp
@@ -0,0 +1,309 @@
+/**
+ * @file ventus.cpp
+ * @brief Implementation of the interaction between the device and OpenCL programs
+ *
+ * 1. Functions declared in `/include/ventus.h`
+ */
+
+#include "ventus.h"
+#include "buddy.hpp"
+#include "loadelf.hpp"
+#include "utils.hpp"
+#include "ventus_rtlsim.h"
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+typedef struct driver_metadata_t {
+    uint64_t kernel_id;
+    uint64_t kernel_size[3];   ///< number of workgroups of the kernel in each of the 3 dimensions
+    uint64_t wf_size;          ///< number of threads per warp
+    uint64_t wg_size;          ///< number of warps per workgroup
+    uint64_t metaDataBaseAddr; ///< value of CSR_KNL
+    uint64_t ldsSize;          ///< size of the local memory used by each workgroup
+    uint64_t pdsSize;          ///< size of the private memory used by each thread
+    uint64_t sgprUsage;        ///< number of scalar registers used by each workgroup
+    uint64_t vgprUsage;        ///< number of vector registers used by each thread
+    uint64_t pdsBaseAddr; ///< base address of private memory; to be converted to a per-workgroup
+                          ///< base address, wf_size*wg_size*pdsSize
+    const char *kernel_name;
+} driver_metadata_t;
+
+// static std::map<int, uint64_t> ptroots; // pagetable root physical address
+static std::shared_ptr<spdlog::logger> logger;
+BuddyAllocator<4096> buddy_allocator((0xFFFFFFFF - 0x90000000 + 1) / 4096, 16);
+constexpr paddr_t BUDDY_BASE = 0x90000000 - 4096;
+
+static constexpr unsigned log2Ceil(uint64_t n) { // ceil(log2(n)); 0 for n <= 1
+    if (n <= 1) return 0;
+    return 64 - __builtin_clzll(n - 1); // 64-bit: a size_t page count must not be truncated
+}
+
+/// Open the rtlsim device and return its handle through hdevice; returns 0 on success, else -1
+extern int vt_dev_open(vt_device_h *hdevice) {
+    if (hdevice == nullptr) return -1;
+
+    auto env_waveform = std::getenv("VENTUS_WAVEFORM");
+    auto env_waveform_begin = std::getenv("VENTUS_WAVEFORM_BEGIN");
+    auto env_waveform_end = std::getenv("VENTUS_WAVEFORM_END");
+    uint64_t waveform_begin = UINT64_MAX; // default: not enable
+    uint64_t waveform_end = 0;
+    if (parse_bool(env_waveform).value_or(false)) {
+        waveform_begin = 0; // default: dump waveform all time
+        waveform_end = UINT64_MAX;
+    }
+    waveform_begin = parse_u64(env_waveform_begin).value_or(waveform_begin);
+    waveform_end = parse_u64(env_waveform_end).value_or(waveform_end);
+    const bool waveform_enable = waveform_end > waveform_begin; // empty window: no dump
+
+    ventus_rtlsim_config_t config;
+    ventus_rtlsim_get_default_config(&config);
+    config.sim_time_max = ~0ull;
+    config.pmem.auto_alloc = true;
+    config.waveform.enable = waveform_enable;
+    config.waveform.time_begin = waveform_begin;
+    config.waveform.time_end = waveform_end;
+    config.waveform.filename = "waveform.rtl.fst";
+    config.snapshot.enable = false;
+    config.log.console.enable = true;
+    config.log.console.level = "trace";
+    config.log.file.enable = false;
+    *hdevice = ventus_rtlsim_init(&config);
+    if (*hdevice == nullptr) return -1; // do not report success without a simulator handle
+    // stdout_color_mt() throws if "ventus" is already registered, so reuse it on a re-open
+    logger = spdlog::get("ventus");
+    if (!logger) logger = spdlog::stdout_color_mt("ventus");
+    logger->set_level(spdlog::level::trace);
+    logger->set_pattern("[%l] %v [%s:%#]");
+    SPDLOG_LOGGER_DEBUG(logger, "vt_dev_open : hello world from ventus.cpp (rtlsim device)");
+    return 0;
+}
+
+/// Close the device when all the operations are done
+extern int vt_dev_close(vt_device_h hdevice) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    ventus_rtlsim_finish(device, false);
+    SPDLOG_LOGGER_DEBUG(logger, "vt_dev_close : goodbye from ventus.cpp (rtlsim device)");
+    return 0;
+}
+int vt_dev_caps(vt_device_h *hdevice, uint64_t caps_id, uint64_t *value) {
+    if (value == nullptr) return -1;
+#define GET_PARAM(key)                                                                             \
+    do {                                                                                           \
+        uint32_t val;                                                                              \
+        if (ventus_rtlsim_get_parameter(key, &val) == 0) {                                         \
+            *value = val;                                                                          \
+            return 0;                                                                              \
+        } else {                                                                                   \
+            SPDLOG_LOGGER_ERROR(logger, "vt_dev_caps: get parameter {} failed", key);              \
+            return -1;                                                                             \
+        }                                                                                          \
+    } while (0)
+    switch (caps_id) {
+    case VT_CAPS_MAX_CORES:
+        GET_PARAM("num_sm");
+    case VT_CAPS_MAX_WARPS:
+        GET_PARAM("num_warp");
+    case VT_CAPS_MAX_THREADS:
+        GET_PARAM("num_thread");
+    case VT_CAPS_LOCAL_MEM_SIZE:
+        GET_PARAM("sharemem_size");
+    default:
+        SPDLOG_LOGGER_ERROR(
+            logger, "vt_dev_caps: unknown caps_id {} (or not implemented)", caps_id
+        );
+        return -1;
+    }
+    return -1;
+}
+
+extern int vt_buf_alloc(
+    vt_device_h hdevice, const uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID,
+    uint64_t kernelID
+) {
+    // TODO: RTLSIM does not support Virtual Memory yet
+    if (size <= 0 || hdevice == nullptr || vaddr == nullptr) return -1; // vaddr is read & written
+    // auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    size_t pgcnt = (size + 4095) / 4096;
+    paddr_t addr_allocated = buddy_allocator.allocate(log2Ceil(pgcnt)) + BUDDY_BASE;
+    if (addr_allocated == BUDDY_BASE) {
+        SPDLOG_LOGGER_ERROR(logger, "vt_buf_alloc: buddy allocator failed, size=0x{:x}", size);
+        return -1;
+    }
+    SPDLOG_LOGGER_DEBUG(
+        logger,
+        "vt_buf_alloc: vaddr_recommand=0x{:x}, vaddr_allocated=0x{:x}, size=0x{:x}, taskID={}",
+        *vaddr, addr_allocated, size, taskID
+    );
+    *vaddr = addr_allocated; // This is paddr actually
+    if (*vaddr == 0) return -1;
+    return 0;
+}
+
+extern int vt_buf_free(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+) {
+    if (hdevice == nullptr || vaddr == nullptr) return -1; // *vaddr is dereferenced below
+    // auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    assert(*vaddr % 4096 == 0);
+    size_t pgcnt = (size + 4095) / 4096;
+    // Not really freed in hardware, just in buddy allocator
+    buddy_allocator.free(*vaddr - BUDDY_BASE, log2Ceil(pgcnt));
+    SPDLOG_LOGGER_INFO(logger, "vt_buf_free: vaddr=0x{:x}, size=0x{:x}", *vaddr, size);
+    return 0;
+}
+
+extern int vt_one_buf_free(
+    vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID
+) {
+    return vt_buf_free(hdevice, size, vaddr, taskID, kernelID);
+}
+
+/**
+ * @brief  Would create the root page table of a task; a no-op here (rtlsim has no VMEM yet)
+ * @param  hdevice device handle (unused)
+ * @param  taskID  task (context) ID; only 0 is supported, other values log an error
+ * @return int     always 0
+ */
+extern int vt_root_mem_alloc(vt_device_h hdevice, int taskID) {
+    // if (hdevice == nullptr) return -1;
+    // auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    // auto ptroot = ventus_rtlsim_vmem_create(device);
+    // if (ptroot == 0) return -1;
+    // logger->debug("vt_root_mem_alloc: taskID={}, ptroot={:x}", taskID, ptroot);
+    // ptroots[taskID] = ptroot;
+    if (taskID != 0) {
+        SPDLOG_LOGGER_ERROR(logger, "RTLSIM_device does not support VMEM yet, taskID must be 0");
+    }
+    return 0;
+}
+
+/**
+ * Free the root page table of taskID (the corresponding context)
+ * @param hdevice
+ * @param taskID
+ * @return always 0; the freeing logic below is commented out
+ */
+extern int vt_root_mem_free(vt_device_h hdevice, int taskID) {
+    // if (hdevice == nullptr) return -1;
+    // auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    // ventus_rtlsim_vmem_destroy(device, ptroots[taskID]);
+    // ptroots.erase(taskID);
+    return 0;
+}
+
+extern int vt_copy_to_dev(
+    vt_device_h hdevice, uint64_t dev_vaddr, const void *src_addr, uint64_t size, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    SPDLOG_LOGGER_DEBUG(
+        logger, "vt_copy_to_dev: dev_addr=0x{:x}, size=0x{:x}, taskID={}, kernelID={}", dev_vaddr,
+        size, taskID, kernelID
+    );
+    ventus_rtlsim_pmemcpy_h2d(device, dev_vaddr, src_addr, size);
+    return 0;
+}
+
+extern int vt_copy_from_dev(
+    vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID,
+    uint64_t kernelID
+) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    SPDLOG_LOGGER_DEBUG(
+        logger, "vt_copy_from_dev: dev_addr=0x{:x}, size=0x{:x}, taskID={}, kernelID={}", dev_vaddr,
+        size, taskID, kernelID
+    );
+    ventus_rtlsim_pmemcpy_d2h(device, dst_addr, dev_vaddr, size);
+    return 0;
+}
+
+extern int vt_start(vt_device_h hdevice, void *mtd_raw, uint64_t taskID) {
+    if (hdevice == nullptr || mtd_raw == nullptr) return -1; // mtd_raw is dereferenced below
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    auto mtd_driver = static_cast<driver_metadata_t *>(mtd_raw);
+    ventus_kernel_metadata_t mtd_sim{
+        .name = mtd_driver->kernel_name,
+        .data = nullptr,
+        .startaddr = 0x80000000,
+        .kernel_id = mtd_driver->kernel_id,
+        .kernel_size =
+            {mtd_driver->kernel_size[0], mtd_driver->kernel_size[1], mtd_driver->kernel_size[2]},
+        .wf_size = mtd_driver->wf_size,
+        .wg_size = mtd_driver->wg_size,
+        .metaDataBaseAddr = mtd_driver->metaDataBaseAddr,
+        .ldsSize = mtd_driver->ldsSize,
+        .pdsSize = mtd_driver->pdsSize,
+        .sgprUsage = mtd_driver->sgprUsage,
+        .vgprUsage = mtd_driver->vgprUsage,
+        .pdsBaseAddr = mtd_driver->pdsBaseAddr,
+        .num_buffer = 0,
+    };
+    SPDLOG_LOGGER_DEBUG(
+        logger,
+        "kernel metadata: kernel_id={}, kernel_size=[{}, {}, {}], wf_size={}, wg_size={}, "
+        "metaDataBaseAddr=0x{:x}, ldsSize=0x{:x}, pdsSize=0x{:x}, sgprUsage={}, "
+        "vgprUsage={}, pdsBaseAddr=0x{:x}",
+        mtd_driver->kernel_id, mtd_driver->kernel_size[0], mtd_driver->kernel_size[1],
+        mtd_driver->kernel_size[2], mtd_driver->wf_size, mtd_driver->wg_size,
+        mtd_driver->metaDataBaseAddr, mtd_driver->ldsSize, mtd_driver->pdsSize,
+        mtd_driver->sgprUsage, mtd_driver->vgprUsage, mtd_driver->pdsBaseAddr
+    );
+    ventus_rtlsim_add_kernel(device, &mtd_sim, nullptr);
+    return 0;
+}
+
+/// Step the simulation until the device is idle or the simulated time reaches the timeout
+extern int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) {
+    if (hdevice == nullptr) return -1;
+    auto device = static_cast<ventus_rtlsim_t *>(hdevice);
+    // timeout is scaled by 1e6 (saturating, so a huge timeout cannot wrap to a tiny one)
+    const uint64_t timeout_ns = timeout > UINT64_MAX / 1000000 ? UINT64_MAX : timeout * 1000000;
+    while (!ventus_rtlsim_is_idle(device) && ventus_rtlsim_get_time(device) < timeout_ns) {
+        ventus_rtlsim_step(device);
+    }
+    // TODO: RTL does not provide a way to check if L2 cache flush is done
+    for (int i = 0; i < 5000; i++) ventus_rtlsim_step(device);
+    return 0;
+}
+
+extern int vt_finish_all_kernel(vt_device_h hdevice, std::queue<int> *finished_kernel_list) {
+    // TODO: what is this function for? what is finished_kernel_list?
+    return -1;
+}
+
+extern int vt_upload_kernel_file(vt_device_h hdevice, const char *filename, int taskID) {
+    if (hdevice == nullptr || filename == nullptr) return -1;
+    auto device = (ventus_rtlsim_t *)hdevice;
+    // taskID is unused: blocks are written straight to the addresses given by the ELF
+
+    // parse ELF file, find .text and other data sections
+    const auto blocks = get_data_from_elf(filename, logger);
+    if (blocks.empty()) {
+        return -1; // at least .text section is needed
+    }
+
+    // load each block, then zero-fill up to memsz (clamped: memsz < data size must not underflow)
+    for (auto block = blocks.begin(); block != blocks.end(); block++) {
+        uint64_t vaddr = block->vaddr;
+        uint64_t size = block->memsz;
+        SPDLOG_LOGGER_DEBUG(
+            logger, "vt_upload_kernel_file {}: vaddr=0x{:x}, size=0x{:x}", filename, vaddr, size
+        );
+        ventus_rtlsim_pmemcpy_h2d(device, vaddr, block->data.data(), block->data.size());
+        std::vector<uint8_t> zeros(size > block->data.size() ? size - block->data.size() : 0, 0);
+        ventus_rtlsim_pmemcpy_h2d(device, vaddr + block->data.size(), zeros.data(), zeros.size());
+    }
+    ventus_rtlsim_icache_invalidate(device);
+    return 0;
+}
+int vt_upload_kernel_bytes(vt_device_h device, const void *content, uint64_t size, int taskID) {
+    return 0;
+}
+int vt_dump_perf(vt_device_h device, FILE *stream) { return 0; }
diff --git a/driver/spike_device/CMakeLists.txt b/driver/spike_device/CMakeLists.txt
index 63fe0f3..e3a2e44 100644
--- a/driver/spike_device/CMakeLists.txt
+++ b/driver/spike_device/CMakeLists.txt
@@ -1,34 +1,30 @@
 set(PROJECT spike_driver)
 project(${PROJECT})
 
-
-message("val of env is:$ENV{SPIKE_SRC_DIR}/spike_main")
 set(CMAKE_CXX_FLAGS -lstdc++)
 
-include_directories(${CMAKE_SOURCE_DIR}/../../include)
-include_directories(${CMAKE_SOURCE_DIR}/../../common)
-include_directories($ENV{SPIKE_SRC_DIR})
-include_directories($ENV{SPIKE_SRC_DIR}/spike_main)
-include_directories($ENV{SPIKE_SRC_DIR}/riscv)
-include_directories($ENV{SPIKE_SRC_DIR}/build)
-include_directories($ENV{SPIKE_SRC_DIR}/softfloat)
-include_directories($ENV{SPIKE_SRC_DIR}/fesvr)
-
-#set(CMAKE_POSITION_INDEPENDENT_CODE True)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../include)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../common)
+include_directories(${SPIKE_SRC_DIR})
+include_directories(${SPIKE_SRC_DIR}/spike_main)
+include_directories(${SPIKE_SRC_DIR}/riscv)
+include_directories(${SPIKE_SRC_DIR}/build)
+include_directories(${SPIKE_SRC_DIR}/softfloat)
+include_directories(${SPIKE_SRC_DIR}/fesvr)
 
 file(GLOB_RECURSE SRCS ./ventus.cpp)
 
-
-#message($ENV{SPIKE_TARGET_DIR})
-
 add_library(${PROJECT} SHARED ${SRCS})
-target_link_directories(${PROJECT} PUBLIC $ENV{SPIKE_TARGET_DIR}/lib)
+target_link_directories(${PROJECT} PUBLIC ${VENTUS_INSTALL_PREFIX}/lib)
 target_link_libraries(${PROJECT} PUBLIC spike_main)
 
 set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}")
 set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
-
-if(ENABLE_INSTALL)
-    message(STATUS "DRIVER_LIB_INSTALL_DIR:" ${DRIVER_LIB_INSTALL_DIR})
-    install(TARGETS ${PROJECT} LIBRARY DESTINATION ${DRIVER_LIB_INSTALL_DIR} COMPONENT "lib")
-    endif()
\ No newline at end of file
+set_target_properties(${PROJECT} PROPERTIES
+    BUILD_RPATH "${VENTUS_INSTALL_PREFIX}/lib"
+    INSTALL_RPATH "$ORIGIN"
+)
+
+install(TARGETS ${PROJECT}
+    LIBRARY DESTINATION lib COMPONENT "lib"
+)
diff --git a/driver/spike_device/ventus.cpp b/driver/spike_device/ventus.cpp
index 4f046ce..a0af49d 100644
--- a/driver/spike_device/ventus.cpp
+++ b/driver/spike_device/ventus.cpp
@@ -9,11 +9,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <iostream>
-#include <fstream>
-#include <future>
-#include <list>
-#include <chrono>
-// driver/page_table
 #include "ventus.h"
 #include "spike_main.h"
 
@@ -41,6 +36,12 @@ extern int vt_dev_caps(vt_device_h* hdevice, host_port_t* input_sig){
     // //set spike_device id to 1
     return 0;
 }
+
+int vt_dev_caps(vt_device_h* hdevice, uint64_t caps_id, uint64_t *value) {
+    // TODO: Not implemented yet
+    return -1;
+}
+
 extern int vt_buf_alloc(vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID) {
     if(size <= 0 || hdevice == nullptr)
         return -1;
@@ -211,3 +212,6 @@ extern int vt_upload_kernel_file(vt_device_h device, const char* filename, int t
   return err;
 }
 */
+
+int vt_upload_kernel_bytes(vt_device_h device, const void* content, uint64_t size, int taskID) { return 0; }
+int vt_dump_perf(vt_device_h device, FILE* stream) { return 0; }
diff --git a/driver/verilating_device/CMakeLists.txt b/driver/verilating_device/CMakeLists.txt
deleted file mode 100644
index 357e34e..0000000
--- a/driver/verilating_device/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-set(PROJECT rtldriver)
-project(${PROJECT})
-
-set(CMAKE_CXX_STANDARD 11)
-
-include_directories(${CMAKE_SOURCE_DIR}/include)
-include_directories(${CMAKE_SOURCE_DIR}/common)
-include_directories(${CMAKE_SOURCE_DIR}/devices/verilating_device)
-include_directories(${CMAKE_SOURCE_DIR}/devices/verilating_device/page_table)
-
-#set(CMAKE_POSITION_INDEPENDENT_CODE True)
-
-file(GLOB_RECURSE SRCS "./*.cpp" "./*.h")
-
-
-link_directories(${CMAKE_BINARY_DIR}/devices/verilating_device)
-
-add_library(${PROJECT} SHARED ${SRCS})
-add_dependencies(${PROJECT} rtlsim)
-
-target_link_libraries(${PROJECT} PUBLIC rtlsim)
-
-
-set_target_properties(${PROJECT} PROPERTIES OUTPUT_NAME "${PROJECT}")
-set_target_properties(${PROJECT} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
-
-if(ENABLE_INSTALL)
-    install(TARGETS ${PROJECT} LIBRARY DESTINATION ${DRIVER_LIB_INSTALL_DIR} COMPONENT "lib")
-endif()
\ No newline at end of file
diff --git a/driver/verilating_device/README.md b/driver/verilating_device/README.md
deleted file mode 100644
index e5673e9..0000000
--- a/driver/verilating_device/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-这个目录为GPGPU的源代码转为verilog后，执行OpenCL程序时，相应的driver需要实现的功能，包括：
-
-1. `/include/ventus.h`中声明的函数
-   
-2. `vt_device`类，表示GPGPU设备，成员变量包括设备类（不包含ram），设备ram类
-3. `vt_buffer`类，主机和设备之间交换数据的缓冲区，成员变量包括`vt_device`，数据，缓冲区大小
-4. 其他 TBD
\ No newline at end of file
diff --git a/driver/verilating_device/ventus.cpp b/driver/verilating_device/ventus.cpp
deleted file mode 100644
index 50d6df8..0000000
--- a/driver/verilating_device/ventus.cpp
+++ /dev/null
@@ -1,269 +0,0 @@
-/**
- * @file ventus.cpp
- * @brief 设备和OpenCL程序的交互功能的实现
- * 
- * 1. `/include/ventu.h`中声明的函数
- * 2. `vt_device`类，表示GPGPU设备，成员变量包括设备类（不包含ram），设备ram类
- * 3. `vt_buffer`类，主机和设备之间交换数据的缓冲，成员变量包括`vt_device`，数据，缓冲区大小
-
- * @author yangzexia (yang-zx17\@qq.com)
- * @version 1.0
- * @date 2022-11-16
- * 
- * @copyright Copyright (c) 2022  yangzexia@THU-DSPLAB
- * 
- * @par 修改日志:
- * <table>
- * <tr><th>Date       <th>Version <th>Author  <th>Description
- * <tr><td>2022-11-16 <td>1.0     <td>yangzexia     <td>创建
- * </table>
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <cstring>
-#include <stdlib.h>
-#include <assert.h>
-#include <iostream>
-#include <fstream>
-#include <future>
-#include <list>
-#include <chrono>
-// driver/page_table
-#include "ventus.h"
-
-#include "vt_device.h"
-
-///@todo MMU，内存分配
-// #include <vt_malloc.h> 
-#include "vt_utils.h"
-#include "vt_config.h"
-
-// devices/page_table
-#include "verilating_device/page_table/vt_memory.h"
-#include "verilating_device/page_table/MemConfig.h"
-// #include <util.h>
-#include "processor.h"
-
-#define RAM_PAGE_SIZE 4096
-
-using namespace ventus;
-
-
-/// open the device and connect to it
-extern int vt_dev_open(vt_device_h* hdevice){
-    if(hdevice == nullptr)
-        return -1;
-    PCOUT_INFO << "vt_dev_open : hello world from ventus.cpp" << endl;
-    *hdevice = new vt_device();
-	vt_root_mem_alloc(*hdevice, 0);
-    return 0;
-}
-/// Close the device when all the operations are done
-extern int vt_dev_close(vt_device_h hdevice){
-    if(hdevice == nullptr)
-        return -1;
-    auto* device = (vt_device*) hdevice;
-    delete device;
-    return 0;
-}
-extern int vt_dev_caps(vt_device_h* hdevice,  uint64_t caps_id, uint64_t *value){
-     if(hdevice == nullptr)
-         return -1;
-    switch (caps_id) {
-        case VT_CAPS_VERSION:
-            *value = IMPLEMENTATION_ID;
-            break;
-        case VT_CAPS_MAX_CORES:
-            *value = NUM_CTA;
-            break;
-        case VT_CAPS_MAX_WARPS:
-            *value = NUM_WARP;
-            break;
-        case VT_CAPS_MAX_THREADS:
-            *value = NUM_THREAD;
-            break;
-        default:
-            std::cout << "invalid caps id: " << caps_id << std::endl;
-            std::abort();
-            return -1;
-    }
-
-    return 0;
-}
-extern int vt_buf_alloc(vt_device_h hdevice, uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID) {
-    if(size <= 0 || hdevice == nullptr)
-        return -1;
-    auto device = ((vt_device*) hdevice);
-    return device->alloc_local_mem( size, vaddr, BUF_TYPE, taskID, kernelID);
-
-}
-extern int vt_buf_free(vt_device_h hdevice, uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID) {
-    if(size <= 0 || hdevice == nullptr)
-        return -1;
-    auto device = ((vt_device*) hdevice);
-
-    return device->free_local_mem( size, vaddr, taskID, kernelID);
-
-}
-
-/**
- * @brief  为设备分配内存，返回根页表的地址
- * @param  hdevice           
- * @param  size              
- * @param  dev_vaddr    申请物理地址时的虚拟地址         
- * @return int 
- */
-extern int vt_root_mem_alloc(vt_device_h hdevice, int taskID) {
-    if( hdevice == nullptr)
-        return -1;
-    vt_device* device = (vt_device*) hdevice;
-    return device->create_device_mem(taskID);
-}
-
-/**
- * 释放taskID（对应context）的根页表
- * @param hdevice
- * @param taskID
- * @return
- */
-extern int vt_root_mem_free(vt_device_h hdevice, int taskID) {
-    if(hdevice == nullptr) 
-        return -1;
-    auto device = (vt_device*) hdevice;
-    return device->delete_device_mem(taskID);
-}
-
-//extern int vt_create_kernel(vt_device_h hdevice, int taskID, int kernelID) {
-//    if(hdevice == nullptr)
-//        return -1;
-//    auto device = (vt_device*) hdevice;
-//    return device->push_kernel(taskID, kernelID);
-//}
-
-extern int vt_copy_to_dev(vt_device_h hdevice, uint64_t dev_vaddr,const void *src_addr, uint64_t size, uint64_t taskID, uint64_t kernelID) {
-    if(size <= 0)
-        return -1;
-    auto device = (vt_device*) hdevice;
-    return device->upload(dev_vaddr, src_addr, size, taskID, kernelID);
-}
-
-extern int vt_copy_from_dev(vt_device_h hdevice, uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, uint64_t kernelID) {
-    if(size <= 0)
-        return -1;
-    auto device = (vt_device*) hdevice;
-    return device->download(dev_vaddr, dst_addr, size, taskID, kernelID);
-}
-
-extern int vt_start(vt_device_h hdevice, void* metaData, uint64_t taskID) {
-    if(hdevice == nullptr)
-        return -1;
-    auto device = (vt_device *) hdevice;
-    device->start(taskID, metaData);
-    return 0;
-}
-extern int vt_ready_wait(vt_device_h hdevice, uint64_t timeout) {
-    if(hdevice == nullptr)
-        return -1;
-    auto* device = (vt_device*) hdevice;
-#ifdef DEBUG_VIRTUAL_ADDR
-	device->execute_all_kernel();
-	return 0;
-#endif
-    return device->wait(timeout);
-
-}
-
-extern int vt_finish_all_kernel(vt_device_h hdevice, queue<int> *finished_kernel_list) {
-    if(hdevice == nullptr)
-        return -1;
-    auto device = (vt_device*) hdevice;
-    *finished_kernel_list = device->execute_all_kernel();
-    return 0;
-}
-
-extern int vt_upload_kernel_bytes(vt_device_h device, const void* content, uint64_t size, int taskID) {
-  int err = 0;
-
-  if (NULL == content || 0 == size)
-    return -1;
-
-  uint32_t buffer_transfer_size = 65536; ///< 64 KB
-  uint64_t kernel_base_addr = BUF_PARA_BASE;
-
-  // allocate device buffer
-  uint64_t dev_mem_addr;
-
-  uint64_t offset = 0;
-	// 确定字符串可以被4整除
-
-	int numValues = size / 8; // 每个uint32_t值占据8个字符
-
-	// 创建uint32_t数组
-	uint32_t values[numValues];
-
-	// 将字符串转换为uint32_t数组
-	for (int i = 0; i < numValues; i++) {
-		std::string substring = (*(string*)content).substr(i * 8, 8); // 每次提取8个字符
-		unsigned int value = std::stoul(substring, nullptr, 16); // 转换为无符号整数
-		std::memcpy(values + i, &value, sizeof(uint32_t)); // 复制到数组中
-	}
-  void * const buffer = malloc(buffer_transfer_size);
-  while (offset < size) {
-    auto chunk_size = std::min<uint64_t>(buffer_transfer_size, size - offset);
-    std::memcpy(buffer, values + offset, chunk_size);
-
-	err = vt_buf_alloc(device, buffer_transfer_size, &dev_mem_addr, KERNEL_MEM, taskID, 0);
-	if (err != 0)
-	  return -1;
-
-	printf("***  Upload Kernel to 0x%0x: data=", kernel_base_addr + offset);
-    for (int i = 0; i < chunk_size; ++i) {
-      printf("%08x", ((values + offset))[i]);
-    }
-    printf("\n");
-
-
-    err = vt_copy_to_dev(device, dev_mem_addr, buffer, chunk_size, taskID, 0);
-    if (err != 0) {
-//      vt_buf_free(device, buffer_transfer_size, &dev_mem_addr, taskID, 0);
-      return err;
-    }
-    offset += chunk_size;
-  }
-	free(buffer);
-  return 0;
-}
-
-extern int vt_upload_kernel_file(vt_device_h device, const char* filename, int taskID) {
-
-//	return 0;
-
-	const char *pos = std::strchr(filename, '.');
-	char newname[100];
-	if (pos != nullptr) {
-		std::strncpy(newname, filename, pos - filename);
-		std::strcat(newname,".vmem");
-		std::size_t len = std::strlen(newname);
-		newname[len] = '\0';
-	}
-  std::ifstream ifs(newname, std::ios::binary);
-  if (!ifs) {
-    std::cout << "error: " << newname << " not found" << std::endl;
-    return -1;
-  }
-
-  // read file content
-  ifs.seekg(0, ifs.end);
-  auto size = ifs.tellg();
-  std::string content;
-  content.resize(size);
-  ifs.seekg(0, ifs.beg);
-  ifs.read(&content[0], size);
-  content.erase(std::remove(content.begin(), content.end(), '\n'), content.end());
-  // upload
-  int err = vt_upload_kernel_bytes(device, &content, content.length(), taskID);
-
-
-  return err;
-}
diff --git a/driver/verilating_device/vt_device.cpp b/driver/verilating_device/vt_device.cpp
deleted file mode 100644
index 7020bbb..0000000
--- a/driver/verilating_device/vt_device.cpp
+++ /dev/null
@@ -1,601 +0,0 @@
-#include "vt_device.h"
-#include <cstdlib>
-#include <cmath>
-#include <algorithm>
-#include "vt_utils.h"
-#include "MemConfig.h"
-//#include "processor.h"
-
-int vt_device::create_device_mem(uint64_t taskID) {
-    if(contextList_.find(taskID) != contextList_.end()) {
-        PCOUT_ERROR << "the taskID of " << taskID <<"has been created, check your input!" <<endl;
-        return -1;
-    }
-    int ret0 = addrManager_.createNewContext(taskID);
-    context_info tmp = context_info(taskID);
-    contextList_.emplace(taskID,context_info(taskID));
-    auto it = contextList_.find(taskID);
-    uint64_t ret1 = it->second.ram.createRootPageTable();
-    it->second.root = ret1;
-    return ret0 || !ret1;
-}
-
-int vt_device::delete_device_mem(int taskID){
-    if(contextList_.find(taskID) != contextList_.end()) {
-        PCOUT_ERROR << "the taskID of " << taskID <<"has not been created, check your input!" <<endl;
-        return -1;
-    }
-    contextList_.erase(taskID);
-    return 0;
-}
-
-int vt_device::alloc_local_mem(uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID) {
-    if(size <= 0 || vaddr == nullptr || contextList_.find(taskID) == contextList_.end())
-        return -1;
-#ifdef DEBUG_VIRTUAL_ADDR
-    int ret0 = addrManager_.allocMemory(taskID, kernelID, vaddr, size, BUF_TYPE);
-    auto it = contextList_.find(taskID);
-    int ret1 = it->second.ram.allocateMemory(it->second.root, *vaddr, size);
-	PCOUT_INFO << "allocating memory at vaddr of 0x" <<hex << *vaddr << ", associated paddr of 0x" << ret1
-			   <<", size of "<<dec<<size << "bytes"<< endl;
-    return ret0 || !ret1;
-#else
-	/// 这里addrManager会分配一个虚拟地址addr,ram根据这个虚拟地址分配一个物理地址vaddr，
-	/// 这个物理地址作为buf_alloc的返回值
-
-    uint64_t  *addr = new uint64_t;
-    int ret0 = addrManager_.allocMemory(taskID, kernelID, addr, size, BUF_TYPE);
-    auto it = contextList_.find(taskID);
-    *vaddr = it->second.ram.allocateMemory(it->second.root, *addr, size);
-	/// 将ram分配的物理地址和addrManager分配的物理地址关联起来
-	addrManager_.attachPaddr(taskID, kernelID, addr, vaddr);
-	PCOUT_INFO << "allocating memory at vaddr of 0x" <<hex << *addr << ", associated paddr of 0x" << *vaddr
-	<<", size of "<<dec<<size << "bytes"<< endl;
-    delete addr;
-    return ret0 || !*vaddr;
-#endif
-}
-
-
-int vt_device::free_local_mem(uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID){
-    if(size <= 0 || vaddr == nullptr || !addrManager_.findContextID(taskID))
-        return -1;
-    auto it = contextList_.find(taskID);
-	uint64_t *paddr = new uint64_t;
-	*paddr = *vaddr;
-#ifndef DEBUG_VIRTUAL_ADDR
-	addrManager_.findVaByPa(kernelID,taskID,paddr,vaddr);
-#endif
-    int ret1 = it->second.ram.releaseMemory(it->second.root, *paddr);
-	delete paddr;
-#ifdef DEBUG_VIRTUAL_ADDR
-    int ret0 = addrManager_.releaseMemory(taskID, kernelID, vaddr, size);
-    return ret0 || ret1;
-#else
-    return ret1;
-#endif
-}
-
-int vt_device::upload(uint64_t dev_vaddr,const void *src_addr, uint64_t size, uint64_t taskID, uint64_t kernelID){
-    if(size <= 0 || src_addr == nullptr || contextList_.find(taskID) == contextList_.end())
-        return -1;
-    auto it = contextList_.find(taskID);
-#ifdef DEBUG_VIRTUAL_ADDR
-    return it->second.ram.writeDataVirtual(it->second.root, dev_vaddr, size, src_addr);
-#else
-    return it->second.ram.writeDataPhysical(dev_vaddr, size, src_addr);
-#endif
-
-
-}
-
-int vt_device::download(uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, uint64_t kernelID){
-    if(size <= 0 || dst_addr == nullptr || contextList_.find(taskID) == contextList_.end())
-        return -1;
-    auto it = contextList_.find(taskID);
-#ifdef DEBUG_VIRTUAL_ADDR
-    return it->second.ram.readDataVirtual(it->second.root, dev_vaddr, size, dst_addr);
-#else
-    return it->second.ram.readDataPhysical(dev_vaddr, size, dst_addr);
-#endif
-}
-/**
- * @brief   发送任务，每个任务由多个block组成，每次调用start发送一个任务
- * 传入到硬件的wg_id由processor.run()决定，函数执行完成后会返回实际的wg_id,
- * 任务队列的数据结构为一个元素为unordered_map的list，list的每个元素代表一个任务，
- * 每次发送一个任务会在list中增加一个元素
- * unordered_map的每个key代表block ID，value表示该block是否执行完成。
- * @param input_sig 输入到GPGPU的信号，与硬件接口对应
- * @param num_block 这个任务由多少个block组成
- * @return int 0
- *
- * @todo start中调用parse_metaData, 然后push_kernel， 然后为硬件接口赋值，启动GPU
- */
-int vt_device::start(int taskID, void* metaData){
-    //parse metaData
-
-    host_port_t *devicePort = new host_port_t;
-    auto inputData = (meta_data *)metaData;
-#ifdef DEBUG_VIRTUAL_ADDR
-    uint64_t wgNum = inputData->kernel_size[0] * inputData->kernel_size[1]*inputData->kernel_size[2];
-    uint64_t pdsParam = inputData->pdsSize * inputData->wf_size * inputData->wg_size;
-    devicePort->host_req_num_wf = inputData->wg_size;
-    devicePort->host_req_wf_size = inputData->wf_size;
-    devicePort->host_req_kernel_size_3d_0 = inputData->kernel_size[0];
-    devicePort->host_req_kernel_size_3d_1 = inputData->kernel_size[1];
-    devicePort->host_req_kernel_size_3d_2 = inputData->kernel_size[2];
-    devicePort->host_req_vgpr_size_total = inputData->wg_size * inputData->vgprUsage;
-    devicePort->host_req_sgpr_size_total = inputData->wg_size * inputData->sgprUsage;
-    devicePort->host_req_gds_size_total = 0;
-    devicePort->host_req_vgpr_size_per_wf = inputData->vgprUsage;
-    devicePort->host_req_sgpr_size_per_wf = inputData->sgprUsage;
-    devicePort->host_req_start_pc = 0x80000000;
-    devicePort->host_req_pds_baseaddr = inputData->pdsBaseAddr;
-    devicePort->host_req_csr_knl = inputData->metaDataBaseAddr;
-    devicePort->host_req_lds_size_total = inputData->ldsSize;
-    devicePort->host_req_gds_baseaddr = 0;
-#endif
-
-    if(contextList_.find(taskID) == contextList_.end()) {
-        PCOUT_ERROR << "the context of ID "<< taskID << " not exists, check your input!" << endl;
-        return -1;
-    }
-    processor_.attach_ram(&contextList_.find(taskID)->second.ram);
-    //each function call send one block of a kernel
-    for (int i = 0; i < wgNum; ++i) {
-	#ifdef DEBUG_VIRTUAL_ADDR
-			uint64_t kernelID = inputData->kernel_id;
-			devicePort->host_req_pds_baseaddr = inputData->pdsBaseAddr + i * pdsParam;
-	#else
-			uint64_t kernelID = 0;
-	#endif
-        devicePort->host_req_wg_id = (inst_len)(((
-                    kernelID<<(int)ceil(log2(MAX_CONTEXT)) | taskID)
-                    <<((int)ceil(log2(MAX_KERNEL)) | kernelID))
-                    <<((int)ceil(log2(NUM_SM*MAX_BLOCK_PER_SM)) | i))
-                    <<((int)ceil(log2(NUM_SM)));
-		#ifdef DEBUG_VERIFY_HW
-				devicePort->host_req_wg_id = 0;
-				devicePort->host_req_num_wf = 2;
-				devicePort->host_req_wf_size = 0x8;
-				devicePort->host_req_kernel_size_3d_0 = 0;
-				devicePort->host_req_kernel_size_3d_1 = 0;
-				devicePort->host_req_kernel_size_3d_2 = 0;
-				devicePort->host_req_vgpr_size_total = 0x040;
-				devicePort->host_req_sgpr_size_total = 0x040;
-				devicePort->host_req_gds_size_total = 0;
-				devicePort->host_req_vgpr_size_per_wf = 0x020;
-				devicePort->host_req_sgpr_size_per_wf = 0x020;
-				devicePort->host_req_start_pc = 0x80000000;
-				devicePort->host_req_pds_baseaddr = 0x80001000;
-				devicePort->host_req_csr_knl = 0x80023000;
-				devicePort->host_req_lds_size_total = 0x80;
-				devicePort->host_req_gds_baseaddr = 0x00000000;
-		#endif
-
-		processor_.run(contextList_.find(taskID)->second.root, devicePort);
-        //更新contextList_
-        map<int, _state>firedBlk;
-        firedBlk.emplace((int)(devicePort->host_req_wg_id), UNFINISH);
-        contextList_.find(taskID)->second.kernelList.emplace(kernelID, kernel_info(firedBlk, UNFINISH));
-
-    }
-    return 0;
-}
-/**
- * @brief 等待一定时间，更新kernel执行完成的信息
- * @param  time              
- * @return int 
- */
-int vt_device::wait(uint64_t time){
-    // 如果所有已经启动的run()任务都完成
-    if(!last_task_.valid());
-    else {
-        // 如果没有则等待到所有run()都完成
-        uint64_t timeout = time / 1000;
-        std::chrono::seconds wait_time(1);
-        for(;;){
-            auto status = last_task_.wait_for(wait_time);
-            if (status == std::future_status::ready || timeout-- == 0)
-                break;
-        }
-    }            // 如果正在遍历的任务的所有block都完成，则将该任务记录下来并删除，
-    // 根据GPGPU返回的block完成情况更新任务队列，将已完成的block ID与保存的list中的block ID比较
-
-    std::queue<int> finished_block = processor_.wait(time);
-
-    while(!finished_block.empty()) {
-        bool block_legal = true;
-        //根据硬件返回的已完成blkID，解码出所属的context, kernel和原本的block
-        uint64_t blkID = (finished_block.front() >> (int)ceil(log2(NUM_SM))) & (1 << (int)ceil(log2(NUM_SM*MAX_BLOCK_PER_SM)));
-        uint64_t kernelID = (finished_block.front() >> (int)ceil(log2(NUM_SM*MAX_BLOCK_PER_SM*NUM_SM))) & (1 << (int)ceil(log2(MAX_KERNEL)));
-        uint64_t contextID = (finished_block.front() >> (int)ceil(log2(NUM_SM*MAX_BLOCK_PER_SM*NUM_SM*MAX_KERNEL))) & (1 << (int)ceil(log2(MAX_CONTEXT)));
-        auto contextItem = contextList_.find(contextID);
-        //判断contextID是否存在
-        if(contextItem == contextList_.end())
-            block_legal = false;
-        else{
-            auto it = contextItem->second.kernelList.find(kernelID);
-                //判断kernelID是否存在
-                if(it == contextItem->second.kernelList.end()) {
-                    block_legal = false;
-                }
-                else {
-                    //判断blkID是否存在
-                    if(it->second.blk_list.find(blkID) == it->second.blk_list.end())
-                        block_legal = false;
-                    else {
-                        //将相应kernel的block设置为已完成
-                        it->second.blk_list[blkID] = FINISH;
-                        finished_block.pop();
-
-                        //当某一个kernel的block完成之后，判断是否该block的所有kernel都完成，
-                        // 判断该kernel所属的context的所有kernel是否都完成
-                        bool kernel_all_block_finished = true;
-                        for(auto& it_map : it->second.blk_list) {
-                            if(!it_map.second) {
-                                kernel_all_block_finished = false;
-                                break;
-                            }
-                        }
-                        if(kernel_all_block_finished) {
-                            finished_kernel_l.push(contextID << (int)ceil(log2(MAX_CONTEXT)) | kernelID);
-                            it->second.state = FINISH;
-                        }
-                    }
-                }
-        }
-        if(!block_legal) {
-            cout << "return Wrong finished block ID, something error" << endl;
-            return -1;
-        }
-    }
-    return 0;
-}
-/**
- * @brief 返回已经完成的kernel
- * @return queue<int> 
- */
-queue<int> vt_device::get_finished_kernel() {
-    queue<int> tmp;
-    while(!finished_kernel_l.empty()) {
-        tmp.push(finished_kernel_l.front());
-        finished_kernel_l.pop();
-    }
-    return tmp;
-}
-/**
- * 执行所有context下的所有kernel，并返回已完成kernel ID的队列
- * @return
- */
-queue<int> vt_device::execute_all_kernel() {
-    queue<int> tmp;
-	int cnt = 0;
-    while(!all_context_finished()) {
-        while(!finished_kernel_l.empty()) {
-            tmp.push(finished_kernel_l.front());
-            finished_kernel_l.pop();
-        }
-        wait(RUN_DELAY);
-		cnt++;
-		if(cnt > 30) break;
-    }
-    return tmp;
-}
-
-/**
- * 返回已经完成的contextID,如果没有执行完成，硬件时钟并不会前进
- * @return <queue<int>> contextID的队列
- */
-queue<int> vt_device::get_finished_context() {
-    queue<int> tmp;
-    auto it = contextList_.begin();
-    while(it != contextList_.end()) {
-        if(it->second.context_finished()){ ///< 这个context里的所有kernel都执行完成了
-            tmp.push(it->second.contextID);
-            it = contextList_.erase(it);
-        }
-        else ++it;
-    }
-    return tmp;
-}
-
-bool vt_device::all_context_finished() {
-	auto it = contextList_.begin();
-	while(it != contextList_.end()) {
-		if(!it->second.context_finished())
-			return false;
-	}
-	return true;
-}
-
-
-addr_manager::~addr_manager() {
-    for(auto it : contextMemory_) {
-        addrItem *curItem = it.second;
-        while(curItem != nullptr) {
-            auto tmp = curItem;
-            curItem = curItem->succContextItem;
-            delete tmp;
-        }
-    }
-}
-
-
-int addr_manager::allocMemory(uint64_t contextID, uint64_t kernelID, uint64_t *vaddr, uint64_t size, int BUF_TYPE) {
-    if(size == 0 || vaddr == nullptr) {
-        PCOUT_ERROR << "vaddr needs to allocate memory is nullptr! error!" << endl;
-        return -1;
-    }
-
-    size = aligned_size(size, BLOCK_SIZE);
-    addrItem* currentItem = nullptr;
-//    auto curContextIt = contextList_.begin();
-//    while(curContextIt != contextList_.end()) {
-//        if(*curContextIt == contextID) {
-    if(contextMemory_.find(contextID) == contextMemory_.end()) {/// 检查这个context是否存在
-        PCOUT_ERROR << "Context of ID" << contextID <<" has not created, can't allocate memory!" << endl;
-        return -1;
-    }
-
-	switch (BUF_TYPE) {///
-		case READ_ONLY:
-			if(size < RWDATA_BASE - RODATA_BASE) {
-				*vaddr = RODATA_BASE;
-				break;
-			} else {
-				PCOUT_ERROR << "buffer size too large, error!" << endl;
-				return -1;
-			}
-		case READ_WRITE: if(size < RWDATA_BASE - RODATA_BASE) {
-				*vaddr = RWDATA_BASE;
-				break;
-			} else {
-				PCOUT_ERROR << "buffer size too large, error!" << endl;
-				return -1;
-			}
-		case KERNEL_MEM: if(size < GLOBALMEM_SIZE/2) {
-				*vaddr = BUF_PARA_BASE;
-				break;
-			} else {
-				PCOUT_ERROR << "buffer size too large, error!" << endl;
-				return -1;
-			}
-		default: break;
-	}
-    if(contextMemory_.at(contextID) == nullptr) {
-                currentItem = new addrItem(kernelID, contextID, *vaddr, size);
-                contextMemory_.at(contextID) = currentItem;
-            }
-            else {
-                currentItem = contextMemory_.at(contextID);
-                if(!allocVaddr(&currentItem, vaddr, size, BUF_TYPE))
-                	insertNewItem(currentItem, contextID, kernelID, vaddr, size);
-				else {
-					PCOUT_ERROR << "allocating virtual addr failed !" << endl;
-				}
-            }
-    return 0;
-}
-
-int addr_manager::createNewContext(uint64_t contextID) {
-
-    for(auto it : contextMemory_) {
-        if(it.first == contextID) {
-            PCOUT_ERROR << "A context of ID" << contextID <<" exists, error!" << endl;
-            return -1;
-        }
-    }
-//    contextList_.emplace_back(contextID);
-    auto t = contextMemory_.emplace(contextID, nullptr);
-    auto p = t.first;
-    return 0;
-}
-/// 插入一个地址元素,如果contextMemory_中已经存在读写类型的地址，并且需要插入只读类型的地址，则要插入的地址为开头,
-/// 同时修改该context的地址链表的开头元素为要插入的元素
-/// \param currentItem 在该元素的后面插入
-/// \param contextID
-/// \param kernelID
-/// \param vaddr
-/// \param size
-void addr_manager::insertNewItem(addrItem *currentItem, uint64_t contextID, uint64_t kernelID, uint64_t *vaddr,
-                                 uint64_t size) {
-    auto tmp = new addrItem(kernelID, contextID, *vaddr, size);
-	if(tmp->vaddr == RODATA_BASE && currentItem->vaddr == RWDATA_BASE) {
-		tmp->succContextItem = currentItem;
-		currentItem->prevContextItem = tmp;
-		contextMemory_.at(contextID) = tmp;
-		return;
-	}
-    tmp->succContextItem = (currentItem)->succContextItem;
-    tmp->prevContextItem = currentItem;
-    if(currentItem->succContextItem != nullptr)
-        currentItem->succContextItem->prevContextItem = tmp;
-    (currentItem)->succContextItem = tmp;
-
-}
-/// https://raw.githubusercontent.com/yangzexia/md-image/image/202305171429512.svg
-int addr_manager::allocVaddr(addrItem **rootItem, uint64_t *vaddr, uint64_t size, int BUF_TYPE) {
-
-	uint64_t curAddr;
-	switch (BUF_TYPE) {/// 寻找下一个还没有分配的地址
-		case READ_ONLY:
-			if((*rootItem)->vaddr==RODATA_BASE) {
-				*vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE);
-				while ((*rootItem)->vaddr < RWDATA_BASE && (*rootItem)->succContextItem != nullptr) {
-//					*vaddr = (*rootItem)->vaddr + (*rootItem)->size;
-					if (*vaddr + size <= (*rootItem)->succContextItem->vaddr) {
-						break;/// 该地址符合条件，跳出循环
-					}
-					*rootItem = (*rootItem)->succContextItem;
-					*vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE);
-				}
-
-				if ((*rootItem)->succContextItem == nullptr || (*rootItem)->succContextItem->vaddr >= RWDATA_BASE) {
-					if (*vaddr + size <= RWDATA_BASE)
-						break;
-					else {
-						PCOUT_ERROR << "memory needs to allocate of size of 0x" << hex << size << dec
-									<< "failed! No enough space!" << endl;
-						return -1;
-					}
-				}
-			} else {
-				*vaddr = RODATA_BASE;
-			}
-			break;
-
-
-		case READ_WRITE:
-			if((*rootItem)->vaddr == RODATA_BASE) {/// 如果第一个元素是只读类型的地址，则遍历到RW_BASE,如果没有RW的地址，则要分配的地址为RW_BASE
-				while((*rootItem)->vaddr < RWDATA_BASE ) {
-					if ((*rootItem)->succContextItem == nullptr) {
-						*vaddr = RWDATA_BASE;
-						return 0;
-					}
-					*rootItem = (*rootItem)->succContextItem;
-				}
-			}
-			*vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE);
-			while ((*rootItem)->vaddr < BUF_PARA_BASE && (*rootItem)->succContextItem != nullptr) {
-//				*vaddr = (*rootItem)->vaddr + (*rootItem)->size;
-				if (*vaddr + size <= (*rootItem)->succContextItem->vaddr) {
-					break;/// 该地址符合条件，跳出循环
-				}
-				*rootItem = (*rootItem)->succContextItem;
-//				if((*rootItem)->succContextItem == nullptr)
-					*vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE);
-			}
-			if ((*rootItem)->succContextItem == nullptr && (*vaddr + size > BUF_PARA_BASE)) {
-				PCOUT_ERROR << "memory needs to allocate of size of 0x" << hex << size << dec
-							<< "failed! No enough space!" << endl;
-				return -1;
-			}
-			break;
-		case KERNEL_MEM:
-			if((*rootItem)->vaddr == RODATA_BASE || (*rootItem)->vaddr == RWDATA_BASE) {/// 如果第一个元素是只读类型的地址，则遍历到RW_BASE,如果没有RW的地址，则要分配的地址为RW_BASE
-				while((*rootItem)->vaddr < BUF_PARA_BASE ) {
-					if ((*rootItem)->succContextItem == nullptr) {
-						*vaddr = BUF_PARA_BASE;
-						return 0;
-					}
-					*rootItem = (*rootItem)->succContextItem;
-				}
-			}
-			*vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE);
-			while ((*rootItem)->vaddr < BUF_PARA_BASE+GLOBALMEM_SIZE/2 && (*rootItem)->succContextItem != nullptr) {
-//				*vaddr = (*rootItem)->vaddr + (*rootItem)->size;
-				if (*vaddr + size <= (*rootItem)->succContextItem->vaddr) {
-					break;/// 该地址符合条件，跳出循环
-				}
-				*rootItem = (*rootItem)->succContextItem;
-//				if((*rootItem)->succContextItem == nullptr)
-				*vaddr = aligned_size((*rootItem)->vaddr + (*rootItem)->size, PAGESIZE);
-			}
-			if ((*rootItem)->succContextItem == nullptr && (*vaddr + size > BUF_PARA_BASE+GLOBALMEM_SIZE/2)) {
-				PCOUT_ERROR << "memory needs to allocate of size of 0x" << hex << size << dec
-							<< "failed! No enough space!" << endl;
-				return -1;
-			}
-			break;
-	}
-
-	return 0;
-}
-
-
-int addr_manager::attachPaddr(uint64_t kernelID, uint64_t contextID, uint64_t *vaddr, uint64_t *paddr) {
-	bool b_contextExist = false;
-	bool b_vaddrExist = false;
-//    for(auto it : contextList_) {
-//        if(it == contextID) {
-	b_contextExist = true;
-	auto tmp = contextMemory_.at(contextID);
-	while(tmp != nullptr) {
-		if(tmp->vaddr == *vaddr) {
-			tmp->paddr = *paddr;
-			break;
-		}
-		tmp = tmp->succContextItem;
-//            }
-//            break;
-//        }
-	}
-	if(!tmp) {
-		PCOUT_ERROR << "Attaching paddr created by ram and vaddr created by addrManager_ failed, vaddr not exists!" << endl;
-		return -1;
-	}
-	return 0;
-}
-
-int addr_manager::findVaByPa(uint64_t kernelID, uint64_t contextID, uint64_t *vaddr, uint64_t *paddr) {
-	if(contextMemory_.find(contextID) == contextMemory_.end()) {
-		PCOUT_ERROR << "Context of ID" << contextID <<" has not created,check parameters!" << endl;
-		return -1;
-	}
-	auto tmp = contextMemory_.at(contextID);
-	while(tmp != nullptr) {
-		if(tmp->paddr == *paddr) {
-			*vaddr = tmp->vaddr;
-			break;
-		}
-		tmp = tmp->succContextItem;
-	}
-	return 0;
-}
-
-int addr_manager::releaseMemory(uint64_t contextID, uint64_t kernelID, uint64_t *vaddr, uint64_t size) {
-//    auto tmp = new addrItem(kernelID, contextID, *vaddr, size);
-    bool b_contextExist = false;
-    bool b_vaddrExist = false;
-//    for(auto it : contextList_) {
-//        if(it == contextID) {
-            b_contextExist = true;
-            auto tmp = contextMemory_.at(contextID);
-            while(tmp != nullptr) {
-                if(tmp->vaddr == *vaddr) {
-                    if(tmp->prevContextItem == nullptr && tmp->succContextItem == nullptr);
-                    else if(tmp->prevContextItem == nullptr)
-                        tmp->succContextItem->prevContextItem = nullptr;
-                    else if(tmp->succContextItem == nullptr)
-                        tmp->prevContextItem->succContextItem = nullptr;
-                    else {
-                        tmp->prevContextItem->succContextItem = tmp->succContextItem;
-                        tmp->succContextItem->prevContextItem = tmp->prevContextItem;
-                    }
-                    delete tmp;
-                    b_vaddrExist = true;
-                    break;
-                }
-                tmp = tmp->succContextItem;
-//            }
-//            break;
-//        }
-    }
-    if(!b_contextExist) {
-        PCOUT_ERROR << "context ID of " << contextID << " check your input! " << endl;
-        return -1;
-    }
-    if(!b_vaddrExist) {
-        PCOUT_ERROR << "invalid vaddr of " << *vaddr << " check your input! " << endl;
-        return -1;
-    }
-    return 0;
-}
-
-bool addr_manager::findContextID(uint64_t contextID) {
-    for(auto it : contextMemory_) {
-        if(it.first == contextID)
-            return true;
-    }
-    return false;
-}
-
-bool addr_manager::findKernelID(uint64_t kernelID) {
-    for(auto it : kernelList_) {
-        if(it == kernelID)
-            return true;
-    }
-    return false;
-}
-
diff --git a/driver/verilating_device/vt_device.h b/driver/verilating_device/vt_device.h
deleted file mode 100644
index 45597f6..0000000
--- a/driver/verilating_device/vt_device.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/**
- * @file vt_device.h
- * @brief 与驱动提供的API对接的类的声明
- * @author YangZexia (yang-zx17\@qq.com)
- * @version 1.2
- * @date 2022-11-24
- * 
- * @copyright Copyright (c) {2022}  DSPLAB@Tsinghua University
- * 
- * @par 修改日志:
- * <table>
- * <tr><th>Date       <th>Version <th>Author  <th>Description
- * <tr><td>2022-11-24 <td>1.0     <td>YangZexia     <td>首次创建
- * <tr><td>2022-12-14 <td>1.1
- * <tr><td>2022-11-24 <td>1.2     <td>YangZexia     <td>增加了多任务（多个根页表）机制
- * </table>
- */
-#include "processor.h"
-#include "vt_utils.h"
-#include "vt_config.h"
-#include <future>
-#include <list>
-#include <map>
-#include <utility>
-#include <vector>
-#include <unordered_map>
-
-using namespace ventus;
-using namespace std;
-//These macro is defined as test
-
-enum _state{UNFINISH, FINISH};
-
-
-struct meta_data{
-    uint64_t kernel_id;
-    uint64_t kernel_size[3];///< 每个kernel的workgroup三维数目
-    uint64_t wf_size; ///< 每个warp的thread数目
-    uint64_t wg_size; ///< 每个workgroup的warp数目
-    uint64_t metaDataBaseAddr;///< CSR_KNL的值，
-    uint64_t ldsSize;///< 每个workgroup使用的local memory的大小
-    uint64_t pdsSize;///< 每个thread用到的private memory大小
-    uint64_t sgprUsage;///< 每个workgroup使用的标量寄存器数目
-    uint64_t vgprUsage;///< 每个thread使用的向量寄存器数目
-    uint64_t pdsBaseAddr;///< private memory的基址，要转成每个workgroup的基地址， wf_size*wg_size*pdsSize
-};
-
-
-struct kernel_info{ ///< 一个kernel由多个NDrange组成，一个NDrange由多个workgroup组成，每个workgroup在硬件上执行时映射到一个block.
-    map<int, _state> blk_list; ///< 该kernel总共包含的block，以及每个block的执行状态
-    _state state; ///< 该kernel的执行状态
-    kernel_info(map<int, _state> input_blk_list, _state stateIn):
-                blk_list(std::move(input_blk_list)),
-                state(stateIn){}
-};
-
-struct context_info{
-    uint64_t contextID;
-    map<uint64_t, kernel_info> kernelList; ///< 该context已经发送给硬件执行的kernel（只有发送给硬件的kernel从会被记录）及其状态：执行完成，未完成
-    uint64_t root;
-    Memory ram = Memory(RAM_RANGE);
-    context_info(uint64_t taskID) : ram(RAM_RANGE){
-        contextID = taskID;
-        cout << RAM_RANGE <<endl;
-        root = 0;
-    }
-    context_info(const context_info &c): ram(c.ram) {
-        contextID = c.contextID;
-        root = c.root;
-        kernelList = c.kernelList;
-    }
-
-    bool context_finished(){
-        for(auto const &it : kernelList) {
-            if(it.second.state ==  UNFINISH)
-                return false;
-        }
-        return true;
-    }
-};
-
-struct addrItem{
-    addrItem *prevContextItem;
-    addrItem *succContextItem;
-    addrItem *prevKernelItem;
-    addrItem *succKernelItem;
-    uint64_t kernelID;
-    uint64_t taskID;
-    uint64_t vaddr;
-	uint64_t paddr;
-    uint64_t size;
-    addrItem(uint64_t in_kernelID,uint64_t in_taskID,uint64_t in_vaddr,uint64_t in_size)
-            :prevContextItem(nullptr),
-             succContextItem(nullptr),
-             prevKernelItem(nullptr),
-             succKernelItem(nullptr),
-             kernelID(in_kernelID),
-             taskID(in_taskID),
-             vaddr(in_vaddr),
-             size(in_size)
-    {}
-	addrItem& operator[](uint64_t vaddr) {
-
-	}
-};
-
-/**
- * 地址管理，能够以device为单位管理内存地址空间，可能包含多个context的根页表，以及每个kernel使用的
- * 地址，
- * 分配地址时判断地址段是否可用，
- * 管理分配的内存空间的类型（只读段，读写段），
- * 地址段的释放。
- */
-class addr_manager{
-public:
-    addr_manager(){};
-    ~addr_manager();
-
-//    void attatch_ram(Memory* ram);
-    int createNewContext(uint64_t contextID);
-    int allocMemory(uint64_t contextID, uint64_t kernelID, uint64_t *vaddr, uint64_t size, int BUF_TYPE);
-    int releaseMemory(uint64_t contextID, uint64_t kernelID, uint64_t *vaddr, uint64_t size);
-	int attachPaddr(uint64_t kernelID, uint64_t contextID, uint64_t *vaddr, uint64_t *paddr);
-	int findVaByPa(uint64_t kernelID, uint64_t contextID, uint64_t *vaddr, uint64_t *paddr);
-	bool findContextID(uint64_t contextID);
-    bool findKernelID(uint64_t kernelID);
-private:
-
-    int allocVaddr(addrItem** rootItem, uint64_t *vaddr, uint64_t size, int BUF_TYPE);
-    /**
-     * 插入一个addrItem到currentItem的后面
-     * @param currentItem
-     * @param contextID
-     * @param kernelID
-     * @param vaddr
-     * @param size
-     */
-    void insertNewItem(addrItem* currentItem, uint64_t contextID, uint64_t kernelID,uint64_t *vaddr, uint64_t size);
-
-//    list<uint64_t> contextList_;
-    map<uint64_t, addrItem*> contextMemory_;
-    list<uint64_t> kernelList_;
-
-
-};
-
-class vt_device {
-public:
-    vt_device() {
-//            addrManager_.attatch_ram(&ram_);
-            test_proc();
-            // list<unordered_map<int, bool>> task_by_block_l;
-            // vector<uint64_t> roots;
-        }
-    ~vt_device(){
-        if(last_task_.valid())
-            last_task_.wait();
-    }
-
-    int create_device_mem(uint64_t taskID);
-
-    /**
-     * @brief 释放分配的空间，释放根页表所指向的空间
-     * @param  taskID    要释放的内存空间对应的任务ID
-     * @return int
-     */
-    int delete_device_mem(int taskID);
-
-
-    /**
-     * @brief 为GPU分配按照虚拟地址分配内存空间，返回指向根页表的指针
-     * @param  size         要分配的空间大小
-     * @param  dev_maddr    要分配的空间起始虚拟地址
-     * @param  root         指向根页表的指针        
-     * @return int 
-     */
-    int alloc_local_mem(uint64_t size, uint64_t *vaddr, int BUF_TYPE, uint64_t taskID, uint64_t kernelID);
-
-    int free_local_mem(uint64_t size, uint64_t *vaddr, uint64_t taskID, uint64_t kernelID);
-
-    /**
-     * @brief 将buffer写入到分配给GPU的memory中，只读区间
-     * @param  taskID            任务ID    
-     * @param  dest_addr         GPU的memory，虚拟地址
-     * @param  size              大小
-     * @return int 
-     */
-    int upload(uint64_t dev_vaddr,const void *src_addr, uint64_t size, uint64_t taskID, uint64_t kernelID);
-    /**
-     * @brief 
-     * @param  root              根页表
-     * @param  dest_data_addr    要读取的数据地址，虚拟地址
-     * @param  src_addr          读出后要放置的位置
-     * @param  size              大小
-     * @return int 
-     */
-    int download(uint64_t dev_vaddr, void *dst_addr, uint64_t size, uint64_t taskID, uint64_t kernelID);
-    int start(int taskID, void* metaData);
-    int wait(uint64_t time);
-    queue<int> get_finished_kernel();
-    queue<int> get_finished_context();
-    queue<int> execute_all_kernel();
-	bool all_context_finished();
-
-
-
-private:
-
-    int push_kernel(uint64_t taskID, uint64_t kernelID, map<int, bool>input_blk_list);
-//    uint64_t parse_metaData(uint64_t taskID, void *metaData, host_port_t* devicePort);
-
-    Processor processor_;
-    future<int> last_task_;
-    queue<int> finished_kernel_l; ///< 已经执行完成的任务ID
-    addr_manager addrManager_;
-    map<uint64_t, context_info> contextList_;
-};
-
-
-
-
diff --git a/include/ventus.h b/include/ventus.h
index 47752ab..6917c7d 100644
--- a/include/ventus.h
+++ b/include/ventus.h
@@ -55,7 +55,7 @@ int vt_dev_open(vt_device_h* hdevice);
 int vt_dev_close(vt_device_h hdevice);
 
 /// return device configurations
- int vt_dev_caps(vt_device_h* hdevice, uint64_t caps_id, uint64_t *value);
+int vt_dev_caps(vt_device_h* hdevice, uint64_t caps_id, uint64_t *value);
 
 /// @brief 【已实现】以任务为单位，在GPGPU设备上分配虚拟内存空间（创建根页表）
 /// @param hdevice 指向设备的指针