diff --git a/.github/workflows/build-cuda.yml b/.github/workflows/build-cuda.yml index fe5baad9..54a1eec9 100644 --- a/.github/workflows/build-cuda.yml +++ b/.github/workflows/build-cuda.yml @@ -1,10 +1,8 @@ name: Build and Test CUDA Backend on: - push: - branches: [ main ] pull_request: - branches: [ main ] + branches: [main, spatter-devel] schedule: - cron: '30 8 * * *' @@ -15,4 +13,4 @@ jobs: - uses: actions/checkout@v4 - name: Run batch file run: cd tests/misc && chmod +x run-crnch-cuda.sh && sbatch run-crnch-cuda.sh - \ No newline at end of file + diff --git a/.github/workflows/build-mpi.yml b/.github/workflows/build-mpi.yml index 40e299c8..6860d9c9 100644 --- a/.github/workflows/build-mpi.yml +++ b/.github/workflows/build-mpi.yml @@ -1,13 +1,9 @@ name: Build and Test MPI Backend on: - push: - branches: [ main ] pull_request: - branches: [ main ] - schedule: - - cron: '30 8 * * *' - + branches: [main, spatter-devel] + jobs: build-and-run-mpi: runs-on: ubuntu-latest @@ -21,5 +17,4 @@ jobs: run: cmake -DUSE_MPI=1 -B build_mpi -S . && make -C build_mpi - name: Test-MPI - run: make test -C build_mpi - \ No newline at end of file + run: make test -C build_mpi \ No newline at end of file diff --git a/.github/workflows/build-serial-omp.yml b/.github/workflows/build-serial-omp.yml index ab9d0600..606966fe 100644 --- a/.github/workflows/build-serial-omp.yml +++ b/.github/workflows/build-serial-omp.yml @@ -1,12 +1,8 @@ name: Build and Test Serial and OpenMP backends -on: - push: - branches: [ main ] +on: pull_request: - branches: [ main ] - schedule: - - cron: '30 8 * * *' + branches: [main, spatter-devel] jobs: build-and-run-serial: @@ -27,4 +23,3 @@ jobs: cmake -DUSE_OPENMP=1 -B build_omp_gnu -S . 
&& make -C build_omp_gnu - name: Test-OMP run: make test -C build_omp_gnu - \ No newline at end of file diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index 27f2f4ed..5cd4be71 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -1,9 +1,8 @@ name: Notebook Build on: - push: - branches: - - 'main' + pull_request: + branches: [main, spatter-devel] jobs: notebook-tests: diff --git a/AUTHORS b/AUTHORS index 36104e13..c67dcbf4 100644 --- a/AUTHORS +++ b/AUTHORS @@ -4,17 +4,19 @@ Patrick Lavin General code development, key gather/scatter kernels, maintainer Jeffrey Young - OpenCL backend and general updates + General code development, notebooks and CI/CD, maintainer Jered Dominguez-Trujillo - Support for combined gather/scatter kernel + Development of refactor for Spatter 2.0, MPI features, improved CUDA backend + +Connor Radelja + General bugfixes and analysis for Spatter 2.0 release; development of Spatter SST element Agustin Vaca Valverde - Unit testing and scripts + Unit testing and scripts, notebooks Vincent Huang Scripting and OneAPI backend support James Wood OneAPI backend support - diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b8fc7f6..cada5365 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,28 @@ -# Spatter 1.0 Release Notes +# Spatter Release Notes + +## Spatter 2.0 Release Notes + +The 2.0 release of Spatter brings several important changes to the codebase, some of which are summarized in the MEMSYS 2024 paper, by Sheridan, et al., ["A Workflow for the Synthesis of Irregular Memory Access Microbenchmarks"](https://www.memsys.io/wp-content/uploads/ninja-forms/5/sheridan_et_al_workflow_irregular_patterns_paper_32_MEMSYS_2024-1.pdf). 
Specifically, this release includes these major changes: + +- Switch from C to C++ for the codebase, enabling more robust configuration options +- Refactor of the CMake infrastructure +- Different parsing mechanisms and deprecation of older parsing code requiring third-party submodules +- Improvements to the CUDA backend including support for longer patterns and support for GPU throughput testing +- MPI support for weak and strong scaling test scenarios +- Removal of integrated support for PAPI +- Addition of contributors guide, improved GH actions for testing, and templates for PRs and new issues. +- Update to the Spatter wiki to describe [changes in Spatter 2.0 that may affect benchmark performance](https://github.com/hpcgarage/spatter/wiki/Spatter-2.0-Validation) and a [new guide for developing new backends](https://github.com/hpcgarage/spatter/wiki/Adding-New-Backends-to-Spatter). + +This release includes commits from Patrick Lavin, Jeff Young, Julio Agustin Vaca Valverde, Jered Dominguez-Trujillo, and Connor Radelja. We are greatly appreciative of the support of contributors from Los Alamos National Labs through [their work with Spatter](https://github.com/lanl/spatter) as well as new work on [generating input patterns with GS Patterns](https://github.com/lanl/gs_patterns/). + + +## Spatter 1.0 Release Notes After 6 years of development, we are ready to release Spatter version 1.0! Over the years we have gone through a number of input and output formats, but they have not changed significantly in several years now, so we are confident that they are stable enough for our first major release. Building Spatter has been a collaborative effort. This release includes commits from Patrick Lavin, Jeff Young, Julio Agustin Vaca Valverde, Jered Dominguez-Trujillo, James Wood, Vincent Huang, Sudhanshu Agarwal, Jeff Inman, and Jeff Hammond. Thank you to all of you for your effort and to those who submitted issues and advised us on the project as well. 
-## New Features +### New Features Since version 0.6, Spatter has added several major new features: - multi-gather, multi-scatter: the multi- kernels perform two levels of indirection, e.g. multi-gather is dest[i] = src[idx1[idx2[i]] and multi-scatter is analogous. This greatly expands the class of patterns that Spatter can represent. @@ -14,6 +32,6 @@ MPI support allows Spatter to run the same pattern on many ranks and allows for - Testing and CI/CD has been greatly improved. The GPU backend is now included in the test suite, and we have expanded the set of automated tests used for the CPU backend as well. - Documentation now includes a Getting Started Jupyter notebook to demonstrate how to use the benchmark and how to plot outputs. -## Ongoing Work +### Ongoing Work We are still implementing a CI solution to automatically test commits to the GPU backend. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..ce317ef6 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,48 @@ +# Contributing to Spatter + +Thank you for your interest in contributing to the Spatter open source project. We appreciate your interest and contributions. Please review these guidelines for the project so that you can most effectively engage with the developers and maintainers of the project. + +The license for the project is BSD-3 with some slight language additions from the main contributors at Los Alamos National Labs and Georgia Institute of Technology. + +## Types of Contributions Accepted + +Spatter is a growing project and ecosystem, and there are several opportunities for contributions in the areas of new features, training and education, and general improvements to the codebase. 
+ +### Contribution Process +In general, to contribute fixes or new code to Spatter, you would do the following: +1) Fork the Spatter codebase +2) Make your changes and run unit tests locally to see that changes do not break anything +3) Rebase on spatter-devel branch in the main repo, as needed. This is our "test" branch whereas main is the "stable" branch. +4) Open a PR and fill in the PR template with relevant information on your changes. +5) Request reviewers and discuss/update the PR until it is ready to be accepted. + +### Reporting Bugs + +**NOTE**: If you find a security vulnerability in the codebase, please do not open an issue. Email the primary maintainers or post a Discussion message noting you have something you'd like to report. This provides the development team an opportunity to respond to your findings and prepare a patch that can be released to mitigate any vulnerabilities. + +Other general bugs can be reported using our [Bug Report Issue Template](https://github.com/hpcgarage/spatter/issues/new?assignees=&labels=bug&projects=&template=00-bug-report.yml&title=%F0%9F%90%9B+%5BBUG%5D+-+%3Ctitle%3E). + +### Suggesting Features or Enhancements + +If there is a feature that you would like to see in Spatter that doesn't currently exist, we also have a [feature request issue template](https://github.com/hpcgarage/spatter/issues/new?assignees=&labels=feature+request&projects=&template=01-feature-request.yml&title=%E2%9C%A8+%5BFEATURE+REQUEST%5D+-+%3Ctitle%3E) that you can use. This will create a new issue for further discussion with the development team. + +### New Spatter Backends + +We have created a guide to developing new backends for Spatter [on our wiki here](https://github.com/hpcgarage/spatter/wiki/Adding-New-Backends-to-Spatter). 
If you would like to create a new backend for Spatter, you can create a new feature request, and the maintainers can link any related issues or pull requests to it. + +## Code review process + +We will aim to review your PR in 1-2 weeks with a goal of getting it merged within that time frame. If added changes are needed, we will follow up via the PR request and may make some suggestions to help your PR pass the CI unit tests and be able to be merged into the main line of the codebase. + +### Code style conventions + +Spatter does not currently have strong checks for code formatting but please try to be consistent with the existing codebase with your changes. The `.clang-format` file in the top-level directory specifies the desired format for any commits. + +## Community Channels for Discussion + +The primary discussion channels for this project are via [GitHub Issues](https://github.com/hpcgarage/spatter/issues) and the [GitHub Discussions space of this project](https://github.com/hpcgarage/spatter/discussions). You are welcome to post any general questions about Spatter to the Discussions channel or email the maintainers. + +### Code of Conduct for Discussions +The maintainers of the project aim to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
For more details on our standard of conduct, please see the [Contributor Covenant Guidelines](https://www.contributor-covenant.org/version/2/1/code_of_conduct/), which we follow to ensure constructive and welcoming discussions and engagement. diff --git a/README.md b/README.md index 1bf37093..864a60d8 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Spatter is a microbenchmark for timing Gather/Scatter kernels on CPUs and GPUs. View the [source](https://github.com/hpcgarage/spatter), and please submit an issue on Github if you run into any issues. -![Build Status](https://github.com/hpcgarage/spatter/actions/workflows/build.yml/badge.svg) +![Build Status](https://github.com/hpcgarage/spatter/actions/workflows/build-serial-omp.yml/badge.svg) ## Purpose diff --git a/cmake/pkgs/MPISupport.cmake b/cmake/pkgs/MPISupport.cmake index f219cd55..d9411ff9 100644 --- a/cmake/pkgs/MPISupport.cmake +++ b/cmake/pkgs/MPISupport.cmake @@ -3,6 +3,8 @@ option(USE_MPI "Enable support for MPI") if (USE_MPI) find_package(MPI) include_directories(${MPI_INCLUDE_PATH}) + #Explicitly add directory for Ubuntu 22 to search + include_directories(/usr/lib/x86_64-linux-gnu/openmpi/include) set(COMMON_LINK_LIBRARIES ${COMMON_LINK_LIBRARIES} MPI::MPI_CXX) add_definitions(-DUSE_MPI) endif() diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt index dd018a71..2b3e6fb4 100644 --- a/notebooks/requirements.txt +++ b/notebooks/requirements.txt @@ -41,7 +41,7 @@ jupyter_client==8.6.2 jupyter_core==5.7.2 jupyter_server==2.14.1 jupyter_server_terminals==0.5.3 -jupyterlab==4.2.2 +jupyterlab==4.2.5 jupyterlab_pygments==0.3.0 jupyterlab_server==2.27.2 kiwisolver==1.4.5 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e4ae5a5e..28774c26 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory(Spatter) add_executable(spatter main.cc) #target_compile_options(spatter PUBLIC "-fnew-alignment 32") target_link_libraries(spatter 
${COMMON_LINK_LIBRARIES} Spatter) +target_link_libraries(spatter z) set_target_properties(spatter PROPERTIES COMPILE_DEFINITIONS "${COMMON_COMPILE_DEFINITIONS}" COMPILE_OPTIONS "${WARNING_FLAGS}" diff --git a/src/Spatter/CMakeLists.txt b/src/Spatter/CMakeLists.txt index 15300460..34be97ba 100644 --- a/src/Spatter/CMakeLists.txt +++ b/src/Spatter/CMakeLists.txt @@ -64,11 +64,15 @@ endif() target_link_libraries(Spatter PUBLIC ${COMMON_LINK_LIBRARIES} + z + stdc++fs ) target_link_libraries(Spatter_shared PUBLIC ${COMMON_LINK_LIBRARIES} + z + stdc++fs ) target_compile_options(Spatter diff --git a/src/Spatter/Configuration.cc b/src/Spatter/Configuration.cc index 20768496..373a1069 100644 --- a/src/Spatter/Configuration.cc +++ b/src/Spatter/Configuration.cc @@ -4,6 +4,7 @@ #include #include +#include #include "Configuration.hh" @@ -20,7 +21,8 @@ ConfigurationBase::ConfigurationBase(const size_t id, const std::string name, aligned_vector &dense, aligned_vector> &dense_perthread, double *&dev_dense, size_t &dense_size, const size_t delta, const size_t delta_gather, - const size_t delta_scatter, const long int seed, const size_t wrap, + const size_t delta_scatter, aligned_vector &trace_rw, + const long int seed, const size_t wrap, const size_t count, const size_t shared_mem, const size_t local_work_size, const int nthreads, const unsigned long nruns, const bool aggregate, const bool atomic, const unsigned long verbosity) @@ -33,7 +35,7 @@ ConfigurationBase::ConfigurationBase(const size_t id, const std::string name, sparse_scatter_size(sparse_scatter_size), dense(dense), dense_perthread(dense_perthread), dev_dense(dev_dense), dense_size(dense_size), delta(delta), delta_gather(delta_gather), - delta_scatter(delta_scatter), seed(seed), wrap(wrap), count(count), + delta_scatter(delta_scatter), trace_rw(trace_rw), seed(seed), wrap(wrap), count(count), shmem(shared_mem), local_work_size(local_work_size), omp_threads(nthreads), nruns(nruns), aggregate(aggregate), atomic(atomic), 
verbosity(verbosity), time_seconds(nruns, 0) { @@ -54,6 +56,8 @@ int ConfigurationBase::run(bool timed, unsigned long run_id) { multi_gather(timed, run_id); else if (kernel.compare("multiscatter") == 0) multi_scatter(timed, run_id); + else if (kernel.compare("trace") == 0) + trace_replay(timed, run_id); else { std::cerr << "Invalid Kernel Type" << std::endl; return -1; @@ -246,6 +250,10 @@ void ConfigurationBase::setup() { } } + if (kernel.compare("trace") == 0) { + //TOD: Anything needed here? + } + if (verbosity >= 3) { std::cout << "Pattern Array Size: " << pattern.size() << "\tDelta: " << delta << "\tCount: " << count @@ -386,7 +394,8 @@ Configuration::Configuration(const size_t id, aligned_vector &dense, aligned_vector> &dense_perthread, double *&dev_dense, size_t &dense_size,const size_t delta, - const size_t delta_gather, const size_t delta_scatter, const long int seed, + const size_t delta_gather, const size_t delta_scatter, aligned_vector &trace_rw, + const long int seed, const size_t wrap, const size_t count, const unsigned long nruns, const bool aggregate, const unsigned long verbosity) : ConfigurationBase(id, name, kernel, pattern, pattern_gather, @@ -394,7 +403,7 @@ Configuration::Configuration(const size_t id, dev_sparse_gather, sparse_gather_size, sparse_scatter, dev_sparse_scatter, sparse_scatter_size, dense, dense_perthread, dev_dense, dense_size, delta, delta_gather, - delta_scatter, seed, wrap, count, 0, 1024, 1, nruns, aggregate, false, + delta_scatter, trace_rw, seed, wrap, count, 0, 1024, 1, nruns, aggregate, false, verbosity) { ConfigurationBase::setup(); } @@ -511,6 +520,41 @@ void Configuration::multi_scatter( } } +void Configuration::trace_replay( + bool timed, unsigned long run_id) { + size_t pattern_length = pattern.size(); + + if (pattern_length != trace_rw.size()) { + std::cout << "Error: Pattern length does not match read/write trace length\n"; + exit(1); + } + +#ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + + if (timed) + 
timer.start(); + + double x0 = 1.0; + //double x1 = 2.0; + for (size_t i = 0; i < count; ++i) { + for (size_t j = 0; j < pattern_length; ++j) { + if (trace_rw[j] == 0) { + x0 = sparse[pattern[j]]; // Read from address pattern[j] + } else { + sparse[pattern[j]] = x0; // Write to address pattern[j] + } + } + } + + if (timed) { + timer.stop(); + time_seconds[run_id] = timer.seconds(); + timer.clear(); + } +} + #ifdef USE_OPENMP Configuration::Configuration(const size_t id, const std::string name, const std::string kernel, @@ -524,7 +568,8 @@ Configuration::Configuration(const size_t id, aligned_vector &dense, aligned_vector> &dense_perthread, double *&dev_dense, size_t &dense_size,const size_t delta, - const size_t delta_gather, const size_t delta_scatter, const long int seed, + const size_t delta_gather, const size_t delta_scatter, aligned_vector &trace_rw, + const long int seed, const size_t wrap, const size_t count, const int nthreads, const unsigned long nruns, const bool aggregate, const bool atomic, const unsigned long verbosity) @@ -532,7 +577,7 @@ Configuration::Configuration(const size_t id, pattern_scatter, sparse, dev_sparse, sparse_size, sparse_gather, dev_sparse_gather, sparse_gather_size, sparse_scatter, dev_sparse_scatter, sparse_scatter_size, dense, dense_perthread, - dev_dense, dense_size, delta, delta_gather, delta_scatter, seed, wrap, + dev_dense, dense_size, delta, delta_gather, delta_scatter, trace_rw, seed, wrap, count, 0, 1024, nthreads, nruns, aggregate, atomic, verbosity) { ConfigurationBase::setup(); } @@ -707,8 +752,45 @@ void Configuration::multi_scatter( timer.clear(); } } + +void Configuration::trace_replay( + bool timed, unsigned long run_id) { + size_t pattern_length = pattern.size(); + + if (pattern_length != trace_rw.size()) { + std::cout << "Error: Pattern length does not match read/write trace length\n"; + exit(1); + } + +#ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); #endif + if (timed) + timer.start(); + + double x0 = 1.0; + double 
x1 = 2.0; + for (size_t i = 0; i < count; ++i) { +#pragma omp parallel for private(x0, x1) + for (size_t j = 0; j < pattern_length; ++j) { + if (trace_rw[j] == 0) { + x0 = sparse[pattern[j]]; // Read from address pattern[j] + } else { + sparse[pattern[j]] = x1; // Write to address pattern[j] + } + } + } + + if (timed) { + timer.stop(); + time_seconds[run_id] = timer.seconds(); + timer.clear(); + } +} + +#endif //USE_OPENMP + #ifdef USE_CUDA Configuration::Configuration(const size_t id, const std::string name, const std::string kernel, @@ -722,7 +804,8 @@ Configuration::Configuration(const size_t id, aligned_vector &dense, aligned_vector> &dense_perthread, double *&dev_dense, size_t &dense_size, const size_t delta, const size_t delta_gather, - const size_t delta_scatter, const long int seed, const size_t wrap, + const size_t delta_scatter, aligned_vector &trace_rw, + const long int seed, const size_t wrap, const size_t count, const size_t shared_mem, const size_t local_work_size, const unsigned long nruns, const bool aggregate, const bool atomic, const unsigned long verbosity) @@ -730,10 +813,10 @@ Configuration::Configuration(const size_t id, pattern_scatter, sparse, dev_sparse, sparse_size, sparse_gather, dev_sparse_gather, sparse_gather_size, sparse_scatter, dev_sparse_scatter, sparse_scatter_size, dense, dense_perthread, - dev_dense, dense_size, delta, delta_gather, delta_scatter, seed, + dev_dense, dense_size, delta, delta_gather, delta_scatter, trace_rw, seed, wrap, count, shared_mem, local_work_size, 1, nruns, aggregate, atomic, verbosity) { - + setup(); } diff --git a/src/Spatter/Configuration.hh b/src/Spatter/Configuration.hh index 77d12fc9..4e4bdf79 100644 --- a/src/Spatter/Configuration.hh +++ b/src/Spatter/Configuration.hh @@ -46,6 +46,8 @@ inline void gpuAssert( #include "SpatterTypes.hh" #include "Timer.hh" +typedef uintptr_t addr_t; + #define ALIGN 64 template using aligned_vector = std::vector>; @@ -65,7 +67,7 @@ public: aligned_vector &dense, 
aligned_vector> &dense_perthread, double *&dev_dense, size_t &dense_size, const size_t delta, - const size_t delta_gather, const size_t delta_scatter, + const size_t delta_gather, const size_t delta_scatter, aligned_vector &trace_rw, const long int seed, const size_t wrap, const size_t count, const size_t shared_mem, const size_t local_work_size, const int nthreads, const unsigned long nruns, const bool aggregate, const bool atomic, @@ -80,6 +82,7 @@ public: virtual void scatter_gather(bool timed, unsigned long run_id) = 0; virtual void multi_gather(bool timed, unsigned long run_id) = 0; virtual void multi_scatter(bool timed, unsigned long run_id) = 0; + virtual void trace_replay(bool timed, unsigned long run_id) = 0; virtual void report(); @@ -103,6 +106,7 @@ public: const aligned_vector pattern; const aligned_vector pattern_gather; const aligned_vector pattern_scatter; + const aligned_vector trace_rw; aligned_vector &sparse; double *&dev_sparse; @@ -160,7 +164,7 @@ public: aligned_vector &dense, aligned_vector> &dense_perthread, double *&dev_dense, size_t &dense_size, const size_t delta, - const size_t delta_gather, const size_t delta_scatter, + const size_t delta_gather, const size_t delta_scatter, aligned_vector &trace_rw, const long int seed, const size_t wrap, const size_t count, const unsigned long nruns, const bool aggregate, const unsigned long verbosity); @@ -170,6 +174,7 @@ public: void scatter_gather(bool timed, unsigned long run_id); void multi_gather(bool timed, unsigned long run_id); void multi_scatter(bool timed, unsigned long run_id); + void trace_replay(bool timed, unsigned long run_id); }; #ifdef USE_OPENMP @@ -186,7 +191,7 @@ public: aligned_vector &dense, aligned_vector> &dense_perthread, double *&dev_dense, size_t &dense_size, const size_t delta, - const size_t delta_gather, const size_t delta_scatter, + const size_t delta_gather, const size_t delta_scatter, aligned_vector &trace_rw, const long int seed, const size_t wrap, const size_t count, 
const int nthreads, const unsigned long nruns, const bool aggregate, const bool atomic, const unsigned long verbosity); @@ -198,6 +203,7 @@ public: void scatter_gather(bool timed, unsigned long run_id); void multi_gather(bool timed, unsigned long run_id); void multi_scatter(bool timed, unsigned long run_id); + void trace_replay(bool timed, unsigned long run_id); }; #endif @@ -215,7 +221,7 @@ public: aligned_vector &dense, aligned_vector> &dense_perthread, double *&dev_dense, size_t &dense_size, const size_t delta, - const size_t delta_gather, const size_t delta_scatter, + const size_t delta_gather, const size_t delta_scatter, aligned_vector &trace_rw, const long int seed, const size_t wrap, const size_t count, const size_t shared_mem, const size_t local_work_size, const unsigned long nruns, const bool aggregate, const bool atomic, @@ -229,6 +235,7 @@ public: void scatter_gather(bool timed, unsigned long run_id); void multi_gather(bool timed, unsigned long run_id); void multi_scatter(bool timed, unsigned long run_id); + void trace_replay(bool timed, unsigned long run_id); void setup(); public: diff --git a/src/Spatter/Input.hh b/src/Spatter/Input.hh index 91d69efe..d4935514 100644 --- a/src/Spatter/Input.hh +++ b/src/Spatter/Input.hh @@ -5,6 +5,7 @@ #ifndef SPATTER_INPUT_HH #define SPATTER_INPUT_HH +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #ifdef USE_OPENMP #include @@ -24,6 +26,19 @@ #include "PatternParser.hh" #include "SpatterTypes.hh" +struct _trace_entry_t { + unsigned short type; // 2 bytes: trace_type_t + unsigned short size; + union { + addr_t addr; + unsigned char length[sizeof(addr_t)]; + }; +} __attribute__((packed)); +typedef struct _trace_entry_t trace_entry_t; + +#define N_TRACE 1000 +trace_entry_t tracebuf[N_TRACE]; + namespace Spatter { static char *shortargs = (char *)"ab:cd:e:f:g:hj:k:l:m:n:o:p:r:s::t:u:v:w:x:y:z:"; @@ -38,6 +53,7 @@ const option longargs[] = {{"aggregate", no_argument, nullptr, 
'a'}, {"help", no_argument, nullptr, 'h'}, {"pattern-size", required_argument, nullptr, 'j'}, {"kernel", required_argument, nullptr, 'k'}, + {"tracefile", required_argument, nullptr, 0}, {"count", required_argument, nullptr, 'l'}, {"shared-memory", required_argument, nullptr, 'm'}, {"name", required_argument, nullptr, 'n'}, @@ -70,10 +86,12 @@ struct ClArgs { aligned_vector dense; aligned_vector> dense_perthread; + aligned_vector trace_rw; double *dev_dense; size_t dense_size; std::string backend; + std::string tracefilename; bool aggregate; bool atomic; bool compress; @@ -187,6 +205,8 @@ void help(char *progname) { std::cout << std::left << std::setw(10) << "-z (--local-work-size)" << std::setw(40) << "Set Local Work Size (default 1024)" << std::left << "\n"; + std::cout << std::left << std::setw(10) << "--tracefile" << std::setw(40) + << "Trace file" << std::left << "\n"; } void usage(char *progname) { @@ -199,7 +219,7 @@ void usage(char *progname) { "shared-memory] [-n name] [-o op]" "[-p pattern] [-r runs] [-s random] [-t nthreads] [-u inner " "scatter pattern] [-v " - "verbosity] [-w wrap] [-z local-work-size]" + "verbosity] [-w wrap] [-z local-work-size] [--tracefile]" << std::endl; } @@ -261,6 +281,7 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { cl.atomic = false; cl.compress = false; cl.verbosity = 1; + cl.tracefilename = ""; // In flag alphabetical order bool aggregate = cl.aggregate; @@ -285,6 +306,7 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { std::stringstream pattern_string; aligned_vector pattern; + aligned_vector trace_rw; unsigned long nruns = 10; long int seed = -1; @@ -320,6 +342,13 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { return -1; atomic = (atomic_val > 0) ? 
true : false; } + if (strcmp(longargs[option_index].name, "tracefile") == 0) { + if (!std::filesystem::exists(optarg)) { + std::cerr << "Error: tracefile (" << optarg << ") does not exist\n"; + return -1; + } + cl.tracefilename = optarg; + } break; case 'a': @@ -393,9 +422,9 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { if ((kernel.compare("gather") != 0) && (kernel.compare("scatter") != 0) && (kernel.compare("sg") != 0) && (kernel.compare("multigather") != 0) && - (kernel.compare("multiscatter") != 0)) { + (kernel.compare("multiscatter") != 0) && (kernel.compare("trace") != 0)) { std::cerr << "Valid Kernels are: gather, scatter, sg, multigather, " - "multiscatter" + "multiscatter, and trace" << std::endl; return -1; } @@ -505,6 +534,47 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { #endif } + if (kernel.compare("trace") == 0) { + + // Open trace file + if (cl.tracefilename.compare("") == 0) { + std::cerr << "Error: trace kernel selected but no tracefile given with \"--tracefile\"\n"; + exit(1); + } + gzFile tracefile = gzopen(cl.tracefilename.c_str(), "hrb"); + if (!tracefile) { + std::cerr << "Could not open tracefile: " << cl.tracefilename << std::endl; + exit(1); + } + + int total_read = 0; + trace_entry_t tr[N_TRACE];; + int bytes_read = 0; + int nreads = 0; + int nwrites = 0; + + // Read N_TRACE entries at a time from tracefile + while ((bytes_read = gzread(tracefile, tr, sizeof(trace_entry_t)*N_TRACE))) { + int entries_read = bytes_read / sizeof(trace_entry_t); + total_read += entries_read; + for (int i = 0; i < entries_read; i++) { + switch (tr[i].type) { + case 0: // reads + nreads +=1; + trace_rw.push_back(0); + pattern.push_back(tr[i].addr); + break; + case 1: //writes + nwrites += 1; + trace_rw.push_back(1); + pattern.push_back(tr[i].addr); + break; + } + } + } + gzclose(tracefile); + } + cl.backend = backend; cl.aggregate = aggregate; cl.compress = compress; @@ -619,7 +689,7 @@ int parse_input(const int argc, char **argv, 
ClArgs &cl) { cl.dev_sparse_gather, cl.sparse_gather_size, cl.sparse_scatter, cl.dev_sparse_scatter, cl.sparse_scatter_size, cl.dense, cl.dense_perthread, cl.dev_dense, cl.dense_size, delta, delta_gather, - delta_scatter, seed, wrap, count, nruns, aggregate, verbosity); + delta_scatter, trace_rw, seed, wrap, count, nruns, aggregate, verbosity); #ifdef USE_OPENMP else if (backend.compare("openmp") == 0) c = std::make_unique>(0, @@ -628,7 +698,7 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { cl.dev_sparse_gather, cl.sparse_gather_size, cl.sparse_scatter, cl.dev_sparse_scatter, cl.sparse_scatter_size, cl.dense, cl.dense_perthread, cl.dev_dense, cl.dense_size, delta, delta_gather, - delta_scatter, seed, wrap, count, nthreads, nruns, aggregate, atomic, + delta_scatter, trace_rw, seed, wrap, count, nthreads, nruns, aggregate, atomic, verbosity); #endif #ifdef USE_CUDA @@ -639,7 +709,7 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { cl.dev_sparse_gather, cl.sparse_gather_size, cl.sparse_scatter, cl.dev_sparse_scatter, cl.sparse_scatter_size, cl.dense, cl.dense_perthread, cl.dev_dense, cl.dense_size, delta, delta_gather, - delta_scatter, seed, wrap, count, shared_mem, local_work_size, nruns, + delta_scatter, trace_rw, seed, wrap, count, shared_mem, local_work_size, nruns, aggregate, atomic, verbosity); #endif else { @@ -652,7 +722,7 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { Spatter::JSONParser json_file = Spatter::JSONParser(json_fname, cl.sparse, cl.dev_sparse, cl.sparse_size, cl.sparse_gather, cl.dev_sparse_gather, cl.sparse_gather_size, cl.sparse_scatter, cl.dev_sparse_scatter, - cl.sparse_scatter_size, cl.dense, cl.dense_perthread, cl.dev_dense, + cl.sparse_scatter_size, cl.dense, cl.dense_perthread, cl.trace_rw, cl.dev_dense, cl.dense_size, backend, aggregate, atomic, compress, shared_mem, nthreads, verbosity); diff --git a/src/Spatter/JSONParser.cc b/src/Spatter/JSONParser.cc index d7164419..8b1f09ae 100644 --- 
a/src/Spatter/JSONParser.cc +++ b/src/Spatter/JSONParser.cc @@ -14,7 +14,8 @@ JSONParser::JSONParser(std::string filename, aligned_vector &sparse, size_t &sparse_gather_size, aligned_vector &sparse_scatter, double *&dev_sparse_scatter, size_t &sparse_scatter_size, aligned_vector &dense, - aligned_vector> &dense_perthread, double *&dev_dense, + aligned_vector> &dense_perthread, + aligned_vector &trace_rw, double *&dev_dense, size_t &dense_size, const std::string backend, const bool aggregate, const bool atomic, const bool compress, size_t shared_mem, const int nthreads, const unsigned long verbosity, const std::string name, @@ -27,13 +28,14 @@ JSONParser::JSONParser(std::string filename, aligned_vector &sparse, sparse_gather_size(sparse_gather_size), sparse_scatter(sparse_scatter), dev_sparse_scatter(dev_sparse_scatter), sparse_scatter_size(sparse_scatter_size), dense(dense), - dense_perthread(dense_perthread), dev_dense(dev_dense), + dense_perthread(dense_perthread), trace_rw(trace_rw), dev_dense(dev_dense), dense_size(dense_size), backend_(backend), aggregate_(aggregate), atomic_(atomic), compress_(compress), shared_mem_(shared_mem), omp_threads_(nthreads), verbosity_(verbosity), default_name_(name), default_kernel_(kernel), default_pattern_size_(pattern_size), default_delta_(delta), default_delta_gather_(delta_gather), - default_delta_scatter_(delta_scatter), default_boundary_(boundary), + default_delta_scatter_(delta_scatter), + default_boundary_(boundary), default_seed_(seed), default_wrap_(wrap), default_count_(count), default_local_work_size_(local_work_size), default_nruns_(nruns) { if (!file_exists_(filename)) { @@ -128,6 +130,7 @@ std::unique_ptr JSONParser::operator[]( aligned_vector pattern; aligned_vector pattern_gather; aligned_vector pattern_scatter; + aligned_vector trace_rw; size_t pattern_size = data_[index]["pattern-size"]; size_t delta = data_[index]["delta"]; @@ -189,7 +192,7 @@ std::unique_ptr JSONParser::operator[]( pattern_scatter, sparse, 
dev_sparse, sparse_size, sparse_gather, dev_sparse_gather, sparse_gather_size, sparse_scatter, dev_sparse_scatter, sparse_scatter_size, dense, dense_perthread, - dev_dense, dense_size, delta, delta_gather, delta_scatter, + dev_dense, dense_size, delta, delta_gather, delta_scatter, trace_rw, data_[index]["seed"], data_[index]["wrap"], data_[index]["count"], data_[index]["nruns"], aggregate_, verbosity_); #ifdef USE_OPENMP @@ -199,7 +202,7 @@ std::unique_ptr JSONParser::operator[]( pattern_scatter, sparse, dev_sparse, sparse_size, sparse_gather, dev_sparse_gather, sparse_gather_size, sparse_scatter, dev_sparse_scatter, sparse_scatter_size, dense, dense_perthread, - dev_dense, dense_size, delta, delta_gather, delta_scatter, + dev_dense, dense_size, delta, delta_gather, delta_scatter, trace_rw, data_[index]["seed"], data_[index]["wrap"], data_[index]["count"], omp_threads_, data_[index]["nruns"], aggregate_, atomic_, verbosity_); #endif @@ -210,7 +213,7 @@ std::unique_ptr JSONParser::operator[]( pattern_scatter, sparse, dev_sparse, sparse_size, sparse_gather, dev_sparse_gather, sparse_gather_size, sparse_scatter, dev_sparse_scatter, sparse_scatter_size, dense, dense_perthread, - dev_dense, dense_size,delta, delta_gather, delta_scatter, + dev_dense, dense_size,delta, delta_gather, delta_scatter, trace_rw, data_[index]["seed"], data_[index]["wrap"], data_[index]["count"], shared_mem_, data_[index]["local-work-size"], data_[index]["nruns"], aggregate_, atomic_, verbosity_); diff --git a/src/Spatter/JSONParser.hh b/src/Spatter/JSONParser.hh index ebc9266e..9dbb8115 100644 --- a/src/Spatter/JSONParser.hh +++ b/src/Spatter/JSONParser.hh @@ -33,6 +33,7 @@ public: double *&dev_sparse_scatter, size_t &sparse_scatter_size, aligned_vector &dense, aligned_vector> &dense_perthread, + aligned_vector &trace_rw, double *&dev_dense, size_t &dense_size, const std::string backend, const bool aggregate, const bool atomic, const bool compress, const size_t shared_mem, const int nthreads, 
@@ -71,9 +72,11 @@ private: aligned_vector<double> &dense; aligned_vector<aligned_vector<double>> &dense_perthread; + aligned_vector<size_t> &trace_rw; double *&dev_dense; size_t &dense_size; + std::string backend_; const bool aggregate_; const bool atomic_; @@ -97,6 +100,7 @@ private: const size_t default_local_work_size_; const unsigned long default_nruns_; + }; } // namespace Spatter diff --git a/trace-replay/.gitignore b/trace-replay/.gitignore new file mode 100644 index 00000000..9ab535bf --- /dev/null +++ b/trace-replay/.gitignore @@ -0,0 +1,3 @@ +main +stream +trace-replay-kernel.o diff --git a/trace-replay/Makefile b/trace-replay/Makefile new file mode 100644 index 00000000..1793c131 --- /dev/null +++ b/trace-replay/Makefile @@ -0,0 +1,10 @@ +all: main stream +trace-replay-kernel.o: trace-replay-kernel.cpp trace-replay-kernel.h + g++ -fopenmp -c -O3 -o trace-replay-kernel.o trace-replay-kernel.cpp +main: trace-replay-kernel.o main.cpp + g++ -fopenmp -O3 -o main trace-replay-kernel.o main.cpp +stream: trace-replay-kernel.o stream.cpp + g++ -fopenmp -O3 -o stream trace-replay-kernel.o stream.cpp +clean: + rm -rf main +.PHONY: all clean diff --git a/trace-replay/main.cpp b/trace-replay/main.cpp new file mode 100644 index 00000000..040ea98b --- /dev/null +++ b/trace-replay/main.cpp @@ -0,0 +1,38 @@ +#include <cassert> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <omp.h> +#include "trace-replay-kernel.h" + +// Helpers for creating the trace +long to_rwsz(long long a) { + return a << ADDR_BITS; +} + +long to_addr(long long a) { + return a & ADDR_MASK; +} + +int main() { + assert(sizeof(long) == 8); + + // Create a trace with 8 entries + trace_entry *tr = (trace_entry*)malloc(sizeof(trace_entry) * 8); + + tr[0] = to_rwsz(0) | to_addr(0x0); // Read 4 + tr[1] = to_rwsz(1) | to_addr(0x8); // Read 8 + tr[2] = to_rwsz(2) | to_addr(0xC); // Write 4 + tr[3] = to_rwsz(3) | to_addr(0x18); // Write 8 + + tr[4] = to_rwsz(3) | to_addr(0x30); // Write 8 + tr[5] = to_rwsz(2) | to_addr(0x34); // Write 4 + tr[6] = to_rwsz(1) | 
to_addr(0x40); // Read 8 + tr[7] = to_rwsz(0) | to_addr(0x44); // Read 4 + + double *local = (double*)malloc(sizeof(double) * GAP_SIZE * omp_get_max_threads()); + void *mem = (void*)malloc(sizeof(char) * 0x64); + + trace_replay_kernel(tr, 8, mem, local); + printf("All done!\n"); +} diff --git a/trace-replay/stream.cpp b/trace-replay/stream.cpp new file mode 100644 index 00000000..5901dd8a --- /dev/null +++ b/trace-replay/stream.cpp @@ -0,0 +1,47 @@ +#include <cassert> +#include <chrono> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <omp.h> +#include "trace-replay-kernel.h" + +// Helpers for creating the trace +long to_rwsz(long long a) { + return a << ADDR_BITS; +} + +long to_addr(long long a) { + return a & ADDR_MASK; +} + +int main() { + assert(sizeof(long long) == 8); + + // Create a trace with 8 entries + long long target_size = 16LL * 1024 * 1024 * 1024; // 16 GiB, read from first half, write to second + + // Each trace entry will read 8 bytes then write 8 bytes + // Need a trace entry for each of the 1024*1024*1024 locations, and one for read, one for write + trace_entry *tr = (trace_entry*)malloc(sizeof(trace_entry) * 1024LL*1024*1024 * 2); + + for (long i = 0; i < 1024LL*1024*1024; i++) { + tr[2*i ] = to_rwsz(1) | to_addr(8*i); + tr[2*i+1] = to_rwsz(3) | to_addr(8*i + 8LL*1024*1024*1024); + } + + double *local = (double*)malloc(sizeof(double) * GAP_SIZE * omp_get_max_threads()); + void *mem = (void*)malloc(sizeof(char) * target_size); + + auto start = std::chrono::high_resolution_clock::now(); + trace_replay_kernel(tr, 1024LL*1024*1024*2, mem, local); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> elapsed = end - start; + + long long size_mb = target_size / (1024*1024); + double time_seconds = elapsed.count(); + double bw_mbps = size_mb / elapsed.count(); + printf("threads size_mb time_s bandwidth_mb\n"); + printf("%d %lld %.2lf %.2lf\n", omp_get_max_threads(), size_mb, time_seconds, bw_mbps); +} diff --git a/trace-replay/trace-replay-kernel.cpp 
b/trace-replay/trace-replay-kernel.cpp new file mode 100644 index 00000000..ab64dbff --- /dev/null +++ b/trace-replay/trace-replay-kernel.cpp @@ -0,0 +1,85 @@ +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <omp.h> + +#include "trace-replay-kernel.h" + +// The first byte of trace_entry is an enum +// 0 -> read, size 4 +// 1 -> read, size 8 +// 2 -> write, size 4 +// 3 -> write, size 8 +// +// The rest of the trace_entry is the address +// 63-56 : rw/sz +// 55-0 : addr + +// We use two memory buffers in the kernel, `local` and `mem` +// Each thread has some extra space (GAP_SIZE) in `local` to avoid false sharing. +// The `mem` buffer is large enough for the largest access in the trace. + +// Issues: +// 1. Does each thread need only a single 8-byte read-write location? +// - It probably makes sense to do a small circular buffer so that each +// thread can immediately issue a write if the preceding access +// was a read, to avoid stalls +// - The right amount of space is tricky. Too small and we risk stalls, +// too large and we risk polluting the cache +// 2. Should we make all writes atomic to deal with write-conflicts? +// 3. Need to ensure that rw_sz() and addr() are inlined +// 4. 
Need to consider non-temporal stores for the trace + +#define DEBUG 0 +#define GAP_SIZE 32 // 32 doubles == 256 bytes + +typedef uint64_t trace_entry; + +int rw_sz(trace_entry t) { + return t >> ADDR_BITS; +} + +long long addr(trace_entry t) { + return t & ADDR_MASK; +} + +void trace_replay_kernel(trace_entry *tr, long len, void *mem, double *local) { + +#pragma omp parallel +{ + int tid = omp_get_thread_num(); + +// nowait ignores all dependencies +#pragma omp for nowait + for (long i = 0; i < len; i++) { + switch (rw_sz(tr[i])) { + case 0: +#if DEBUG + printf("rd 4 0x%llx\n", addr(tr[i])); +#endif + local[tid*GAP_SIZE] = ((float*)mem)[addr(tr[i])/4]; // Adjust from address to array-index by dividing by 4 + break; + case 1: +#if DEBUG + printf("rd 8 0x%llx\n", addr(tr[i])); +#endif + local[tid*GAP_SIZE] = ((double*)mem)[addr(tr[i])/8]; + break; + case 2: +#if DEBUG + printf("wr 4 0x%llx\n", addr(tr[i])); +#endif + ((float*)mem)[addr(tr[i])/4] = local[tid*GAP_SIZE]; + break; + case 3: +#if DEBUG + printf("wr 8 0x%llx\n", addr(tr[i])); +#endif + ((double*)mem)[addr(tr[i])/8] = local[tid*GAP_SIZE]; + break; + } + } +} + +} diff --git a/trace-replay/trace-replay-kernel.h b/trace-replay/trace-replay-kernel.h new file mode 100644 index 00000000..a7a35059 --- /dev/null +++ b/trace-replay/trace-replay-kernel.h @@ -0,0 +1,11 @@ +#ifndef TRACE_REPLAY_KERNEL_H +#define TRACE_REPLAY_KERNEL_H +#include <cstdint> + +#define ADDR_BITS 56 +#define ADDR_MASK 0xFF'FF'FF'FF'FF'FF'FFLL +#define GAP_SIZE 32 // 32 doubles == 256 bytes +typedef uint64_t trace_entry; +void trace_replay_kernel(trace_entry *tr, long len, void *mem, double *local); + +#endif