Rust and cuda clang support #1873

SirAlienTheGreat · 2025-07-21T23:04:33Z

Adds Rust QFunction support

Allen rust jit

examples/ceed/bruhh.rs

Makefile

backends/cuda/ceed-cuda-compile.cpp

jeremylt · 2025-07-30T17:23:41Z

As mentioned in person, here's my diff to make stdout quiet

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 08f29af2..457e2380 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -35,6 +35,25 @@
     CeedChk_Nvrtc(ceed, ierr_q_); \
   } while (0)
 
+//------------------------------------------------------------------------------
+// Call system command and capture stdout + stderr
+//------------------------------------------------------------------------------
+static int CeedCallSystem(Ceed ceed, const char *command, const char *message) {
+  CeedDebug(ceed, "Running command:\n$ %s\n", command);
+  FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
+
+  CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s with command: %s", message, command);
+
+  // Accumulate every line; both buffers start empty so a silent command never prints uninitialized memory
+  char        line[CEED_MAX_RESOURCE_LEN] = "";
+  std::string output;
+
+  while (fgets(line, sizeof(line), output_stream) != nullptr) output += line;
+  CeedDebug(ceed, "Command output:\n%s\n", output.c_str());
+  CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s with error: %s", message, output.c_str());
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Compile CUDA kernel
 //------------------------------------------------------------------------------
@@ -61,9 +80,9 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
   CeedCallBackend(CeedGetIsClang(ceed, &using_clang));
 
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS,
-               using_clang
-                   ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
-                   : "Compiling CUDA with NVRTC backend (without Rust QFunction support). To use Clang, set the environmental variable GPU_CLANG=1");
+               using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
+                           : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment "
+                             "variable GPU_CLANG=1");
 
   // Get kernel specific options, such as kernel constants
   if (num_defines > 0) {
@@ -198,12 +217,10 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     CeedCallBackend(CeedFree(&ptx));
     return CEED_ERROR_SUCCESS;
   } else {
-    const char *full_filename = "temp-jit.cu";
+    const char *full_filename = "temp_kernel_source.cu";
     FILE       *file          = fopen(full_filename, "w");
-    if (!file) {
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "Failed to create file. Write access is required for cuda-clang\n");
-      return 1;
-    }
+
+    CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang\n");
     fputs(code.str().c_str(), file);
     fclose(file);
 
@@ -226,26 +243,22 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
 
     CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));
 
-    // Compile with rust
-    int         err;
-    std::string cmd;
+    // Compile Rust crate(s) needed
+    std::string command;
 
     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
-      cmd = "cargo +nightly build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] + "/.cargo/config.toml --manifest-path " +
-            rust_dirs[i] + "/Cargo.toml";
-      err = system(cmd.c_str());
-      CeedCheck(!err, ceed, CEED_ERROR_BACKEND, "Failed to build Rust crates for GPU JiT.\nFailed to build Rust crate %d with command: %s", i,
-                cmd.c_str());
+      command = "cargo +nightly build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] + "/.cargo/config.toml --manifest-path " +
+                rust_dirs[i] + "/Cargo.toml";
+      CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "build Rust crate"));
     }
 
-    cmd = "clang++ -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) +
-          " --cuda-device-only -emit-llvm -S temp-jit.cu -o kern.ll ";
-    cmd += opts[4];
-    err = system(cmd.c_str());
-    CeedCheck(!err, ceed, CEED_ERROR_BACKEND, "Failed to compile QFunction source to LLVM IR");
-
-    cmd = "llvm-link-20 kern.ll --ignore-non-bitcode --internalize --only-needed -S -o kern2.ll  ";
+    // Compile wrapper kernel
+    command = "clang++ -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) +
+              " --cuda-device-only -emit-llvm -S temp_kernel_source.cu -o temp_kernel.ll ";
+    command += opts[4];
+    CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "JiT kernel source"));
 
+    command = "llvm-link temp_kernel.ll --ignore-non-bitcode --internalize --only-needed -S -o temp_kernel_linked.ll  ";
     // Searches for .a files in rust directoy
     // Note: this is necessary because rust crate names may not match the folder they are in
     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
@@ -260,24 +273,23 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
         std::string filename(entry->d_name);
 
         if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") {
-          cmd += dir + "/" + filename + " ";
+          command += dir + "/" + filename + " ";
         }
       }
       closedir(dp);
-      // Todo: when libceed switches to c++17, switch to std::filesystem for the loop above
+      // TODO: when libCEED switches to c++17, switch to std::filesystem for the loop above
     }
 
-    CeedDebug(ceed, "Running llvm-link: %s\n", cmd.c_str());
-    err = system(cmd.c_str());
-    CeedCheck(!err, ceed, CEED_ERROR_BACKEND, "Failed to link C and Rust sources with LLVM\nllvm-link command: %s", cmd.c_str());
+    // Link, optimize, and compile final CUDA kernel
+    CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "link C and Rust source"));
+    CeedCallBackend(
+        CeedCallSystem(ceed, "opt --passes internalize,inline temp_kernel_linked.ll -o temp_kernel_opt.bc", "optimize linked C and Rust source"));
+    CeedCallBackend(CeedCallSystem(
+        ceed,
+        ("llc -O3 -mcpu=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) + " temp_kernel_opt.bc -o temp_kernel_final.ptx").c_str(),
+        "compile final CUDA kernel"));
 
-    err = system("opt --passes internalize,inline kern2.ll -o kern3.bc");
-    CeedCheck(!err, ceed, CEED_ERROR_BACKEND, "Failed  to Optimize QFunction LLVM IR");
-
-    err = system(("llc -O3 -mcpu=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) + " kern3.bc -o kern.ptx").c_str());
-    CeedCheck(!err, ceed, CEED_ERROR_BACKEND, "Failed to compile QFunction LLVM IR)\n");
-
-    ifstream      ptxfile("kern.ptx");
+    ifstream      ptxfile("temp_kernel_final.ptx");
     ostringstream sstr;
 
     sstr << ptxfile.rdbuf();
@@ -285,8 +297,21 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     auto ptx_data = sstr.str();
     ptx_size      = ptx_data.length();
 
-    CeedCallCuda(ceed, cuModuleLoadData(module, ptx_data.c_str()));
-    CeedCallBackend(CeedFree(&ptx_data));
+    int result = cuModuleLoadData(module, ptx_data.c_str());
+
+    *is_compile_good = result == 0;
+    if (!*is_compile_good) {
+      if (throw_error) {
+        return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load module data");
+      } else {
+        // LCOV_EXCL_START
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
+        CeedDebug(ceed, "Error: Failed to load module data");
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+        return CEED_ERROR_SUCCESS;
+        // LCOV_EXCL_STOP
+      }
+    }
   }
   return CEED_ERROR_SUCCESS;
 }

This reverts commit 96e762f.

jeremylt · 2025-07-30T19:16:19Z

This makes it work like the other CeedCall* functions

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 99ec8dc7..7b78a4e1 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -35,10 +35,12 @@
     CeedChk_Nvrtc(ceed, ierr_q_); \
   } while (0)
 
+#define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message))
+
 //------------------------------------------------------------------------------
 // Call system command and capture stdout + stderr
 //------------------------------------------------------------------------------
-static int CeedCallSystem(Ceed ceed, const char *command, const char *message) {
+static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
   CeedDebug(ceed, "Running command:\n$ %s\n", command);
   FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
 
@@ -249,14 +251,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
       command = "cargo +nightly build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] + "/.cargo/config.toml --manifest-path " +
                 rust_dirs[i] + "/Cargo.toml";
-      CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "build Rust crate"));
+      CeedCallSystem(ceed, command.c_str(), "build Rust crate");
     }
 
     // Compile wrapper kernel
     command = "clang++ -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) +
               " --cuda-device-only -emit-llvm -S temp_kernel_source.cu -o temp_kernel.ll ";
     command += opts[4];
-    CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "JiT kernel source"));
+    CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
 
     command = "llvm-link-20 temp_kernel.ll --ignore-non-bitcode --internalize --only-needed -S -o temp_kernel_linked.ll  ";
     // Searches for .a files in rust directoy
@@ -281,13 +283,12 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     }
 
     // Link, optimize, and compile final CUDA kernel
-    CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "link C and Rust source"));
-    CeedCallBackend(
-        CeedCallSystem(ceed, "opt --passes internalize,inline temp_kernel_linked.ll -o temp_kernel_opt.bc", "optimize linked C and Rust source"));
-    CeedCallBackend(CeedCallSystem(
+    CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
+    CeedCallSystem(ceed, "opt --passes internalize,inline temp_kernel_linked.ll -o temp_kernel_opt.bc", "optimize linked C and Rust source");
+    CeedCallSystem(
         ceed,
         ("llc -O3 -mcpu=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) + " temp_kernel_opt.bc -o temp_kernel_final.ptx").c_str(),
-        "compile final CUDA kernel"));
+        "compile final CUDA kernel");
 
     ifstream      ptxfile("temp_kernel_final.ptx");
     ostringstream sstr;

examples/ceed/Makefile

examples/rust-qfunctions/ex1-volume-rust.c

examples/rust-qfunctions/ex1-volume.c

jeremylt · 2025-07-31T16:32:56Z

Ok, here is the CI and some small tidying

diff --git a/.gitignore b/.gitignore
index e7b100a0..dc446bd9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,6 +51,9 @@ doc/sphinx/build/
 # Example docs automatically copied from source tree
 doc/sphinx/source/examples/
 
+# Clang GPU temp files
+temp_*
+
 # Output files, videos, and compressed archives should not be added accidentally
 *.avi
 *.bin
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5389460d..9a7b3adf 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -171,6 +171,57 @@ noether-cpu:
 #    - touch .SUCCESS
 
 
+# ----------------------------------------------------------------------------------------
+# Rust + CUDA
+# ----------------------------------------------------------------------------------------
+noether-rust-qfunctions:
+  stage: test:gpu-and-float
+  tags:
+    - cuda
+  interruptible: true
+  before_script:
+    # Environment
+    - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran NVCC=nvcc GPU_CLANG=1
+    - export NPROC_POOL=4
+    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
+    - echo "-------------- CC ------------------" && $CC --version
+    - echo "-------------- CXX -----------------" && $CXX --version
+    - echo "-------------- FC ------------------" && $FC --version
+    - echo "-------------- NVCC ----------------" && $NVCC --version
+    - echo "-------------- Rustc ---------------" && rustc --version
+    - echo "-------------- Clang++ -------------" && clang++ --version
+    - echo "-------------- GCOV ----------------" && gcov --version
+  script:
+    - rm -f .SUCCESS
+    # libCEED
+    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
+    - echo "-------------- libCEED -------------" && make info
+    - make clean
+    - make -k -j$NPROC_CPU -l$NPROC_CPU
+    # -- libCEED only tests
+    - echo "-------------- Rust QFunction tests -"
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    - export PETSC_DIR= PETSC_ARCH=
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="rust-qfunction" junit realsearch=rust-qfunction
+    # Report status
+    - touch .SUCCESS
+  after_script:
+    - |
+      if [ -f .SUCCESS ]; then
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples;
+      fi
+  artifacts:
+    paths:
+      - build/*.junit
+    reports:
+      junit: build/*.junit
+
+
 # ----------------------------------------------------------------------------------------
 # CUDA backends
 # ----------------------------------------------------------------------------------------
diff --git a/Makefile b/Makefile
index 5331ca3d..3a03310e 100644
--- a/Makefile
+++ b/Makefile
@@ -349,6 +349,10 @@ mfemexamples     := $(mfemexamples.cpp:examples/mfem/%.cpp=$(OBJDIR)/mfem-%)
 # Nek5K Examples
 nekexamples := $(OBJDIR)/nek-bps
 
+# Rust QFunction Examples
+rustqfunctions.c       := $(sort $(wildcard examples/rust-qfunctions/*.c))
+rustqfunctionsexamples := $(rustqfunctions.c:examples/rust-qfunctions/%.c=$(OBJDIR)/rustqfunctions-%)
+
 # PETSc Examples
 petscexamples.c := $(wildcard examples/petsc/*.c)
 petscexamples   := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%)
@@ -733,6 +737,11 @@ $(OBJDIR)/nek-bps : examples/nek/bps/bps.usr examples/nek/nek-examples.sh $(libc
 	mv examples/nek/build/bps $(OBJDIR)/bps
 	cp examples/nek/nek-examples.sh $(OBJDIR)/nek-bps
 
+# Rust QFunctions
+$(OBJDIR)/rustqfunctions-% : examples/rust-qfunctions/%.c $(libceed) | $$(@D)/.DIR
+	+$(MAKE) -C examples/rust-qfunctions CEED_DIR=`pwd`
+	cp examples/rust-qfunctions/$* $@
+
 # PETSc
 # Several executables have common utilities, but we can't build the utilities
 # from separate submake invocations because they'll compete with each
@@ -763,19 +772,22 @@ $(OBJDIR)/solids-% : examples/solids/%.c examples/solids/%.h \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/solids/$* $@
 
-examples : $(allexamples)
-ceedexamples : $(examples)
-nekexamples : $(nekexamples)
-mfemexamples : $(mfemexamples)
+examples      : $(allexamples)
+ceedexamples  : $(examples)
+nekexamples   : $(nekexamples)
+mfemexamples  : $(mfemexamples)
 petscexamples : $(petscexamples)
 
+rustqfunctionsexamples : $(rustqfunctionsexamples)
+
 external_examples := \
 	$(if $(MFEM_DIR),$(mfemexamples)) \
 	$(if $(PETSC_DIR),$(petscexamples)) \
 	$(if $(NEK5K_DIR),$(nekexamples)) \
 	$(if $(DEAL_II_DIR),$(dealiiexamples)) \
 	$(if $(PETSC_DIR),$(fluidsexamples)) \
-	$(if $(PETSC_DIR),$(solidsexamples))
+	$(if $(PETSC_DIR),$(solidsexamples)) \
+	$(rustqfunctionsexamples)
 
 allexamples = $(examples) $(external_examples)
 
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 0399168a..c73a988a 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -37,10 +37,12 @@
     CeedChk_Nvrtc(ceed, ierr_q_); \
   } while (0)
 
+#define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message))
+
 //------------------------------------------------------------------------------
 // Call system command and capture stdout + stderr
 //------------------------------------------------------------------------------
-static int CeedCallSystem(Ceed ceed, const char *command, const char *message) {
+static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
   CeedDebug(ceed, "Running command:\n$ %s\n", command);
   FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
 
@@ -245,32 +247,32 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
 
     CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));
 
-    char* rust_toolchain = std::getenv("RUST_TOOLCHAIN");
+    char *rust_toolchain = std::getenv("RUST_TOOLCHAIN");
 
-    if(rust_toolchain == nullptr){
-        rust_toolchain = "nightly";
-        setenv("RUST_TOOLCHAIN", "nightly", 0);
+    if (rust_toolchain == nullptr) {
+      rust_toolchain = (char *)"nightly";
+      setenv("RUST_TOOLCHAIN", "nightly", 0);
     }
 
     // Compile Rust crate(s) needed
     std::string command;
 
     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
-      command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] + "/.cargo/config.toml --manifest-path " +
-                rust_dirs[i] + "/Cargo.toml";
-      CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "build Rust crate"));
+      command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] +
+                "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml";
+      CeedCallSystem(ceed, command.c_str(), "build Rust crate");
     }
 
     // Compile wrapper kernel
     command = "clang++ -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) +
               " --cuda-device-only -emit-llvm -S temp_kernel_source.cu -o temp_kernel.ll ";
     command += opts[4];
-    CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "JiT kernel source"));
+    CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
 
     // the find command finds the rust-installed llvm-link tool and runs it
-    command =
-        "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp_kernel.ll --ignore-non-bitcode --internalize --only-needed -S -o "
-        "temp_kernel_linked.ll  ";
+    command = "$(find $(rustup run " + std::string(rust_toolchain) +
+              " rustc --print sysroot) -name llvm-link) temp_kernel.ll --ignore-non-bitcode --internalize --only-needed -S -o "
+              "temp_kernel_linked.ll  ";
 
     // Searches for .a files in rust directoy
     // Note: this is necessary because rust crate names may not match the folder they are in
@@ -295,15 +297,17 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
 
     // Link, optimize, and compile final CUDA kernel
     // note that the find command is used to find the rust-installed llvm tool
-    CeedCallBackend(CeedCallSystem(ceed, command.c_str(), "link C and Rust source"));
-    CeedCallBackend(CeedCallSystem(
-        ceed, ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name opt) --passes internalize,inline temp_kernel_linked.ll -o temp_kernel_opt.bc").c_str(),
-        "optimize linked C and Rust source"));
-    CeedCallBackend(CeedCallSystem(ceed,
-                                   ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" + std::to_string(prop.major) +
-                                    std::to_string(prop.minor) + " temp_kernel_opt.bc -o temp_kernel_final.ptx")
-                                       .c_str(),
-                                   "compile final CUDA kernel"));
+    CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
+    CeedCallSystem(ceed,
+                   ("$(find $(rustup run " + std::string(rust_toolchain) +
+                    " rustc --print sysroot) -name opt) --passes internalize,inline temp_kernel_linked.ll -o temp_kernel_opt.bc")
+                       .c_str(),
+                   "optimize linked C and Rust source");
+    CeedCallSystem(ceed,
+                   ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
+                    std::to_string(prop.major) + std::to_string(prop.minor) + " temp_kernel_opt.bc -o temp_kernel_final.ptx")
+                       .c_str(),
+                   "compile final CUDA kernel");
 
     ifstream      ptxfile("temp_kernel_final.ptx");
     ostringstream sstr;
diff --git a/examples/rust-qfunctions/ex1-volume b/examples/rust-qfunctions/ex1-volume
index 96d7079d..a9de7df2 100755
Binary files a/examples/rust-qfunctions/ex1-volume and b/examples/rust-qfunctions/ex1-volume differ
* Unmerged path examples/rust-qfunctions/ex1-volume.c
diff --git a/examples/rust-qfunctions/ex1-volume.c b/examples/rust-qfunctions/ex1-volume.c
index ce9d5cd4..f93d29ef 100644
--- a/examples/rust-qfunctions/ex1-volume.c
+++ b/examples/rust-qfunctions/ex1-volume.c
@@ -122,12 +122,12 @@ int main(int argc, const char *argv[]) {
   CeedInit(ceed_spec, &ceed);
 
   // Add the path to the Rust crate to the ceed object.
-    {
-      char  root[2048] = __FILE__;
-      char *last_slash = strrchr(root, '/');
-      strncpy(last_slash + 1, "ex1-volume-rs", 14);
-      CeedAddRustSourceRoot(ceed, root);
-    }
+  {
+    char  root[2048] = __FILE__;
+    char *last_slash = strrchr(root, '/');
+    strncpy(last_slash + 1, "ex1-volume-rs", 14);
+    CeedAddRustSourceRoot(ceed, root);
+  }
 
   // Construct the mesh and solution bases.
   CeedBasis mesh_basis, sol_basis;
* Unmerged path examples/rust-qfunctions/ex1-volume.h
diff --git a/temp_kernel.ll b/temp_kernel.ll
deleted file mode 100644
index 917ef198..00000000
--- a/temp_kernel.ll
+++ /dev/null
@@ -1,1639 +0,0 @@
-; ModuleID = 'temp_kernel_source.cu'
-source_filename = "temp_kernel_source.cu"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.__cuda_builtin_threadIdx_t = type { i8 }
-%struct.__cuda_builtin_blockDim_t = type { i8 }
-%struct.__cuda_builtin_blockIdx_t = type { i8 }
-%struct.__cuda_builtin_gridDim_t = type { i8 }
-%struct.FieldsInt_Cuda = type { [16 x ptr], [16 x ptr] }
-%struct.Fields_Cuda = type { [16 x ptr], [16 x ptr] }
-%struct.Points_Cuda = type { i32, ptr, ptr, ptr }
-%struct.SharedData_Cuda = type { i32, i32, i32, i32, ptr }
-
-$_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd = comdat any
-
-$_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd = comdat any
-
-$_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd = comdat any
-
-$_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd = comdat any
-
-$_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
-@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
-@slice = external dso_local addrspace(3) global [0 x double], align 8
-@_ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 = internal addrspace(3) global [30 x double] undef, align 8
-@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
-@gridDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1
-
-; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
-define dso_local ptx_kernel void @CeedKernelCudaGenOperator_apply_mass(i32 noundef %0, ptr noundef %1, ptr noundef byval(%struct.FieldsInt_Cuda) align 8 %2, ptr noundef byval(%struct.Fields_Cuda) align 8 %3, ptr noundef byval(%struct.Fields_Cuda) align 8 %4, ptr noundef byval(%struct.Fields_Cuda) align 8 %5, ptr noundef %6, ptr noundef byval(%struct.Points_Cuda) align 8 %7) #0 {
-  %9 = alloca i32, align 4
-  %10 = alloca ptr, align 8
-  %11 = alloca ptr, align 8
-  %12 = alloca ptr, align 8
-  %13 = alloca ptr, align 8
-  %14 = alloca ptr, align 8
-  %15 = alloca i32, align 4
-  %16 = alloca i32, align 4
-  %17 = alloca %struct.SharedData_Cuda, align 8
-  %18 = alloca i32, align 4
-  %19 = alloca i32, align 4
-  %20 = alloca i32, align 4
-  %21 = alloca i32, align 4
-  %22 = alloca i32, align 4
-  %23 = alloca i32, align 4
-  %24 = alloca i32, align 4
-  %25 = alloca i32, align 4
-  %26 = alloca i32, align 4
-  %27 = alloca ptr, align 8
-  %28 = alloca i32, align 4
-  %29 = alloca [6 x double], align 8
-  %30 = alloca ptr, align 8
-  %31 = alloca i32, align 4
-  %32 = alloca i32, align 4
-  %33 = alloca [6 x double], align 8
-  %34 = alloca [6 x double], align 8
-  %35 = alloca i32, align 4
-  %36 = alloca i32, align 4
-  %37 = alloca i32, align 4
-  %38 = alloca ptr, align 8
-  %39 = alloca [6 x double], align 8
-  %40 = alloca ptr, align 8
-  %41 = alloca ptr, align 8
-  %42 = alloca ptr, align 8
-  %43 = alloca [2 x ptr], align 8
-  %44 = alloca [1 x ptr], align 8
-  %45 = alloca ptr, align 8
-  %46 = alloca i32, align 4
-  %47 = alloca i32, align 4
-  store i32 %0, ptr %9, align 4
-  store ptr %1, ptr %10, align 8
-  store ptr %6, ptr %11, align 8
-  %48 = getelementptr inbounds nuw %struct.Fields_Cuda, ptr %3, i32 0, i32 0
-  %49 = getelementptr inbounds [16 x ptr], ptr %48, i64 0, i64 0
-  %50 = load ptr, ptr %49, align 8
-  store ptr %50, ptr %12, align 8
-  %51 = getelementptr inbounds nuw %struct.Fields_Cuda, ptr %3, i32 0, i32 0
-  %52 = getelementptr inbounds [16 x ptr], ptr %51, i64 0, i64 1
-  %53 = load ptr, ptr %52, align 8
-  store ptr %53, ptr %13, align 8
-  %54 = getelementptr inbounds nuw %struct.Fields_Cuda, ptr %3, i32 0, i32 1
-  %55 = getelementptr inbounds [16 x ptr], ptr %54, i64 0, i64 0
-  %56 = load ptr, ptr %55, align 8
-  store ptr %56, ptr %14, align 8
-  store i32 3, ptr %15, align 4
-  store i32 6, ptr %16, align 4
-  %57 = call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %58 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 0
-  store i32 %57, ptr %58, align 8
-  %59 = call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  %60 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 1
-  store i32 %59, ptr %60, align 4
-  %61 = call noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  %62 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 2
-  store i32 %61, ptr %62, align 8
-  %63 = call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %64 = call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  %65 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %66 = mul i32 %64, %65
-  %67 = add i32 %63, %66
-  %68 = call noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  %69 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  %70 = mul i32 %68, %69
-  %71 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %72 = mul i32 %70, %71
-  %73 = add i32 %67, %72
-  %74 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 3
-  store i32 %73, ptr %74, align 4
-  %75 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 2
-  %76 = load i32, ptr %75, align 8
-  %77 = mul nsw i32 %76, 6
-  %78 = mul nsw i32 %77, 6
-  %79 = sext i32 %78 to i64
-  %80 = getelementptr inbounds double, ptr addrspacecast (ptr addrspace(3) @slice to ptr), i64 %79
-  %81 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 4
-  store ptr %80, ptr %81, align 8
-  store i32 3, ptr %18, align 4
-  store i32 5, ptr %19, align 4
-  store i32 1, ptr %20, align 4
-  %82 = getelementptr inbounds nuw %struct.Fields_Cuda, ptr %4, i32 0, i32 0
-  %83 = getelementptr inbounds [16 x ptr], ptr %82, i64 0, i64 0
-  %84 = load ptr, ptr %83, align 8
-  call void @_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %17, ptr noundef %84, ptr noundef addrspacecast (ptr addrspace(3) @_ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 to ptr)) #5
-  store i32 3, ptr %21, align 4
-  store i32 6, ptr %22, align 4
-  store i32 1, ptr %23, align 4
-  store i32 3, ptr %24, align 4
-  store i32 5, ptr %25, align 4
-  store i32 1, ptr %26, align 4
-  store ptr addrspacecast (ptr addrspace(3) @_ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 to ptr), ptr %27, align 8
-  call void @llvm.nvvm.barrier0()
-  %85 = call noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  %86 = call noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  %87 = mul i32 %85, %86
-  %88 = call noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  %89 = add i32 %87, %88
-  store i32 %89, ptr %28, align 4
-  br label %90
-
-90:                                               ; preds = %131, %8
-  %91 = load i32, ptr %28, align 4
-  %92 = load i32, ptr %9, align 4
-  %93 = icmp slt i32 %91, %92
-  br i1 %93, label %94, label %137
-
-94:                                               ; preds = %90
-  %95 = getelementptr inbounds [6 x double], ptr %29, i64 0, i64 0
-  store ptr %95, ptr %30, align 8
-  store i32 274625, ptr %31, align 4
-  store i32 274625, ptr %32, align 4
-  %96 = load i32, ptr %28, align 4
-  %97 = getelementptr inbounds nuw %struct.FieldsInt_Cuda, ptr %2, i32 0, i32 0
-  %98 = getelementptr inbounds [16 x ptr], ptr %97, i64 0, i64 0
-  %99 = load ptr, ptr %98, align 8
-  %100 = load ptr, ptr %12, align 8
-  %101 = load ptr, ptr %30, align 8
-  call void @_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %17, i32 noundef 274625, i32 noundef %96, ptr noundef %99, ptr noundef %100, ptr noundef %101) #5
-  %102 = load ptr, ptr %30, align 8
-  %103 = getelementptr inbounds [6 x double], ptr %33, i64 0, i64 0
-  call void @_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %17, ptr noundef %102, ptr noundef addrspacecast (ptr addrspace(3) @_ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 to ptr), ptr noundef %103) #5
-  store i32 1, ptr %35, align 4
-  store i32 884736, ptr %36, align 4
-  store i32 216, ptr %37, align 4
-  %104 = load i32, ptr %28, align 4
-  %105 = load ptr, ptr %13, align 8
-  %106 = getelementptr inbounds [6 x double], ptr %34, i64 0, i64 0
-  call void @_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %17, i32 noundef %104, ptr noundef %105, ptr noundef %106) #5
-  %107 = getelementptr inbounds [6 x double], ptr %34, i64 0, i64 0
-  store ptr %107, ptr %38, align 8
-  %108 = getelementptr inbounds [6 x double], ptr %33, i64 0, i64 0
-  store ptr %108, ptr %40, align 8
-  %109 = load ptr, ptr %38, align 8
-  store ptr %109, ptr %41, align 8
-  %110 = getelementptr inbounds [6 x double], ptr %39, i64 0, i64 0
-  store ptr %110, ptr %42, align 8
-  %111 = load ptr, ptr %40, align 8
-  %112 = getelementptr inbounds [2 x ptr], ptr %43, i64 0, i64 0
-  store ptr %111, ptr %112, align 8
-  %113 = load ptr, ptr %41, align 8
-  %114 = getelementptr inbounds [2 x ptr], ptr %43, i64 0, i64 1
-  store ptr %113, ptr %114, align 8
-  %115 = load ptr, ptr %42, align 8
-  %116 = getelementptr inbounds [1 x ptr], ptr %44, i64 0, i64 0
-  store ptr %115, ptr %116, align 8
-  %117 = load ptr, ptr %10, align 8
-  %118 = getelementptr inbounds [2 x ptr], ptr %43, i64 0, i64 0
-  %119 = getelementptr inbounds [1 x ptr], ptr %44, i64 0, i64 0
-  %120 = call noundef i32 @_ZL10apply_massPviPKPKdPKPd(ptr noundef %117, i32 noundef 6, ptr noundef %118, ptr noundef %119) #5
-  %121 = getelementptr inbounds [6 x double], ptr %29, i64 0, i64 0
-  store ptr %121, ptr %45, align 8
-  %122 = getelementptr inbounds [6 x double], ptr %39, i64 0, i64 0
-  %123 = load ptr, ptr %27, align 8
-  %124 = load ptr, ptr %45, align 8
-  call void @_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %17, ptr noundef %122, ptr noundef %123, ptr noundef %124) #5
-  store i32 274625, ptr %46, align 4
-  store i32 274625, ptr %47, align 4
-  %125 = load i32, ptr %28, align 4
-  %126 = getelementptr inbounds nuw %struct.FieldsInt_Cuda, ptr %2, i32 0, i32 1
-  %127 = getelementptr inbounds [16 x ptr], ptr %126, i64 0, i64 0
-  %128 = load ptr, ptr %127, align 8
-  %129 = load ptr, ptr %45, align 8
-  %130 = load ptr, ptr %14, align 8
-  call void @_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %17, i32 noundef 274625, i32 noundef %125, ptr noundef %128, ptr noundef %129, ptr noundef %130) #5
-  br label %131
-
-131:                                              ; preds = %94
-  %132 = call noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-  %133 = call noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  %134 = mul i32 %132, %133
-  %135 = load i32, ptr %28, align 4
-  %136 = add i32 %135, %134
-  store i32 %136, ptr %28, align 4
-  br label %90, !llvm.loop !8
-
-137:                                              ; preds = %90
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noalias noundef %1, ptr noundef %2) #1 comdat {
-  %4 = alloca ptr, align 8
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca i32, align 4
-  store ptr %0, ptr %4, align 8
-  store ptr %1, ptr %5, align 8
-  store ptr %2, ptr %6, align 8
-  %8 = load ptr, ptr %4, align 8
-  %9 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %8, i32 0, i32 3
-  %10 = load i32, ptr %9, align 4
-  store i32 %10, ptr %7, align 4
-  br label %11
-
-11:                                               ; preds = %24, %3
-  %12 = load i32, ptr %7, align 4
-  %13 = icmp slt i32 %12, 30
-  br i1 %13, label %14, label %32
-
-14:                                               ; preds = %11
-  %15 = load ptr, ptr %5, align 8
-  %16 = load i32, ptr %7, align 4
-  %17 = sext i32 %16 to i64
-  %18 = getelementptr inbounds double, ptr %15, i64 %17
-  %19 = load double, ptr %18, align 8
-  %20 = load ptr, ptr %6, align 8
-  %21 = load i32, ptr %7, align 4
-  %22 = sext i32 %21 to i64
-  %23 = getelementptr inbounds double, ptr %20, i64 %22
-  store double %19, ptr %23, align 8
-  br label %24
-
-24:                                               ; preds = %14
-  %25 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %26 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  %27 = mul i32 %25, %26
-  %28 = call noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  %29 = mul i32 %27, %28
-  %30 = load i32, ptr %7, align 4
-  %31 = add i32 %30, %29
-  store i32 %31, ptr %7, align 4
-  br label %11, !llvm.loop !10
-
-32:                                               ; preds = %11
-  ret void
-}
-
-; Function Attrs: convergent nocallback nounwind
-declare void @llvm.nvvm.barrier0() #2
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %0, i32 noundef %1, i32 noundef %2, ptr noalias noundef %3, ptr noalias noundef %4, ptr noalias noundef %5) #1 comdat {
-  %7 = alloca ptr, align 8
-  %8 = alloca i32, align 4
-  %9 = alloca i32, align 4
-  %10 = alloca ptr, align 8
-  %11 = alloca ptr, align 8
-  %12 = alloca ptr, align 8
-  %13 = alloca i32, align 4
-  %14 = alloca i32, align 4
-  %15 = alloca i32, align 4
-  %16 = alloca i32, align 4
-  store ptr %0, ptr %7, align 8
-  store i32 %1, ptr %8, align 4
-  store i32 %2, ptr %9, align 4
-  store ptr %3, ptr %10, align 8
-  store ptr %4, ptr %11, align 8
-  store ptr %5, ptr %12, align 8
-  %17 = load ptr, ptr %7, align 8
-  %18 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 0
-  %19 = load i32, ptr %18, align 8
-  %20 = icmp slt i32 %19, 5
-  br i1 %20, label %21, label %80
-
-21:                                               ; preds = %6
-  %22 = load ptr, ptr %7, align 8
-  %23 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %22, i32 0, i32 1
-  %24 = load i32, ptr %23, align 4
-  %25 = icmp slt i32 %24, 5
-  br i1 %25, label %26, label %80
-
-26:                                               ; preds = %21
-  store i32 0, ptr %13, align 4
-  br label %27
-
-27:                                               ; preds = %76, %26
-  %28 = load i32, ptr %13, align 4
-  %29 = icmp slt i32 %28, 5
-  br i1 %29, label %30, label %79
-
-30:                                               ; preds = %27
-  %31 = load ptr, ptr %7, align 8
-  %32 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %31, i32 0, i32 0
-  %33 = load i32, ptr %32, align 8
-  %34 = load ptr, ptr %7, align 8
-  %35 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %34, i32 0, i32 1
-  %36 = load i32, ptr %35, align 4
-  %37 = mul nsw i32 %36, 5
-  %38 = add nsw i32 %33, %37
-  %39 = load i32, ptr %13, align 4
-  %40 = mul nsw i32 %39, 5
-  %41 = mul nsw i32 %40, 5
-  %42 = add nsw i32 %38, %41
-  store i32 %42, ptr %14, align 4
-  %43 = load ptr, ptr %10, align 8
-  %44 = load i32, ptr %14, align 4
-  %45 = load i32, ptr %9, align 4
-  %46 = mul nsw i32 %45, 5
-  %47 = mul nsw i32 %46, 5
-  %48 = mul nsw i32 %47, 5
-  %49 = add nsw i32 %44, %48
-  %50 = sext i32 %49 to i64
-  %51 = getelementptr inbounds i32, ptr %43, i64 %50
-  %52 = load i32, ptr %51, align 4
-  store i32 %52, ptr %15, align 4
-  store i32 0, ptr %16, align 4
-  br label %53
-
-53:                                               ; preds = %72, %30
-  %54 = load i32, ptr %16, align 4
-  %55 = icmp slt i32 %54, 1
-  br i1 %55, label %56, label %75
-
-56:                                               ; preds = %53
-  %57 = load ptr, ptr %11, align 8
-  %58 = load i32, ptr %15, align 4
-  %59 = load i32, ptr %16, align 4
-  %60 = mul nsw i32 274625, %59
-  %61 = add nsw i32 %58, %60
-  %62 = sext i32 %61 to i64
-  %63 = getelementptr inbounds double, ptr %57, i64 %62
-  %64 = load double, ptr %63, align 8
-  %65 = load ptr, ptr %12, align 8
-  %66 = load i32, ptr %13, align 4
-  %67 = load i32, ptr %16, align 4
-  %68 = mul nsw i32 %67, 5
-  %69 = add nsw i32 %66, %68
-  %70 = sext i32 %69 to i64
-  %71 = getelementptr inbounds double, ptr %65, i64 %70
-  store double %64, ptr %71, align 8
-  br label %72
-
-72:                                               ; preds = %56
-  %73 = load i32, ptr %16, align 4
-  %74 = add nsw i32 %73, 1
-  store i32 %74, ptr %16, align 4
-  br label %53, !llvm.loop !11
-
-75:                                               ; preds = %53
-  br label %76
-
-76:                                               ; preds = %75
-  %77 = load i32, ptr %13, align 4
-  %78 = add nsw i32 %77, 1
-  store i32 %78, ptr %13, align 4
-  br label %27, !llvm.loop !12
-
-79:                                               ; preds = %27
-  br label %80
-
-80:                                               ; preds = %79, %21, %6
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noalias noundef %1, ptr noundef %2, ptr noalias noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [6 x double], align 8
-  %10 = alloca [6 x double], align 8
-  %11 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %11, align 4
-  br label %12
-
-12:                                               ; preds = %36, %4
-  %13 = load i32, ptr %11, align 4
-  %14 = icmp slt i32 %13, 1
-  br i1 %14, label %15, label %39
-
-15:                                               ; preds = %12
-  %16 = load ptr, ptr %5, align 8
-  %17 = load ptr, ptr %6, align 8
-  %18 = load i32, ptr %11, align 4
-  %19 = mul nsw i32 %18, 5
-  %20 = sext i32 %19 to i64
-  %21 = getelementptr inbounds double, ptr %17, i64 %20
-  %22 = load ptr, ptr %7, align 8
-  %23 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 0
-  call void @_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %16, ptr noundef %21, ptr noundef %22, ptr noundef %23) #5
-  %24 = load ptr, ptr %5, align 8
-  %25 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 0
-  %26 = load ptr, ptr %7, align 8
-  %27 = getelementptr inbounds [6 x double], ptr %10, i64 0, i64 0
-  call void @_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %24, ptr noundef %25, ptr noundef %26, ptr noundef %27) #5
-  %28 = load ptr, ptr %5, align 8
-  %29 = getelementptr inbounds [6 x double], ptr %10, i64 0, i64 0
-  %30 = load ptr, ptr %7, align 8
-  %31 = load ptr, ptr %8, align 8
-  %32 = load i32, ptr %11, align 4
-  %33 = mul nsw i32 %32, 6
-  %34 = sext i32 %33 to i64
-  %35 = getelementptr inbounds double, ptr %31, i64 %34
-  call void @_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %28, ptr noundef %29, ptr noundef %30, ptr noundef %35) #5
-  br label %36
-
-36:                                               ; preds = %15
-  %37 = load i32, ptr %11, align 4
-  %38 = add nsw i32 %37, 1
-  store i32 %38, ptr %11, align 4
-  br label %12, !llvm.loop !13
-
-39:                                               ; preds = %12
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %0, i32 noundef %1, ptr noalias noundef %2, ptr noalias noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca i32, align 4
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca i32, align 4
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store i32 %1, ptr %6, align 4
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  %13 = load ptr, ptr %5, align 8
-  %14 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %13, i32 0, i32 0
-  %15 = load i32, ptr %14, align 8
-  %16 = icmp slt i32 %15, 6
-  br i1 %16, label %17, label %71
-
-17:                                               ; preds = %4
-  %18 = load ptr, ptr %5, align 8
-  %19 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %18, i32 0, i32 1
-  %20 = load i32, ptr %19, align 4
-  %21 = icmp slt i32 %20, 6
-  br i1 %21, label %22, label %71
-
-22:                                               ; preds = %17
-  store i32 0, ptr %9, align 4
-  br label %23
-
-23:                                               ; preds = %67, %22
-  %24 = load i32, ptr %9, align 4
-  %25 = icmp slt i32 %24, 6
-  br i1 %25, label %26, label %70
-
-26:                                               ; preds = %23
-  %27 = load ptr, ptr %5, align 8
-  %28 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %27, i32 0, i32 0
-  %29 = load i32, ptr %28, align 8
-  %30 = load ptr, ptr %5, align 8
-  %31 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %30, i32 0, i32 1
-  %32 = load i32, ptr %31, align 4
-  %33 = mul nsw i32 %32, 6
-  %34 = add nsw i32 %29, %33
-  %35 = load i32, ptr %9, align 4
-  %36 = mul nsw i32 %35, 6
-  %37 = mul nsw i32 %36, 6
-  %38 = add nsw i32 %34, %37
-  store i32 %38, ptr %10, align 4
-  %39 = load i32, ptr %10, align 4
-  %40 = mul nsw i32 %39, 1
-  %41 = load i32, ptr %6, align 4
-  %42 = mul nsw i32 %41, 216
-  %43 = add nsw i32 %40, %42
-  store i32 %43, ptr %11, align 4
-  store i32 0, ptr %12, align 4
-  br label %44
-
-44:                                               ; preds = %63, %26
-  %45 = load i32, ptr %12, align 4
-  %46 = icmp slt i32 %45, 1
-  br i1 %46, label %47, label %66
-
-47:                                               ; preds = %44
-  %48 = load ptr, ptr %7, align 8
-  %49 = load i32, ptr %11, align 4
-  %50 = load i32, ptr %12, align 4
-  %51 = mul nsw i32 %50, 884736
-  %52 = add nsw i32 %49, %51
-  %53 = sext i32 %52 to i64
-  %54 = getelementptr inbounds double, ptr %48, i64 %53
-  %55 = load double, ptr %54, align 8
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %9, align 4
-  %58 = load i32, ptr %12, align 4
-  %59 = mul nsw i32 %58, 6
-  %60 = add nsw i32 %57, %59
-  %61 = sext i32 %60 to i64
-  %62 = getelementptr inbounds double, ptr %56, i64 %61
-  store double %55, ptr %62, align 8
-  br label %63
-
-63:                                               ; preds = %47
-  %64 = load i32, ptr %12, align 4
-  %65 = add nsw i32 %64, 1
-  store i32 %65, ptr %12, align 4
-  br label %44, !llvm.loop !14
-
-66:                                               ; preds = %44
-  br label %67
-
-67:                                               ; preds = %66
-  %68 = load i32, ptr %9, align 4
-  %69 = add nsw i32 %68, 1
-  store i32 %69, ptr %9, align 4
-  br label %23, !llvm.loop !15
-
-70:                                               ; preds = %23
-  br label %71
-
-71:                                               ; preds = %70, %17, %4
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define internal noundef i32 @_ZL10apply_massPviPKPKdPKPd(ptr noundef %0, i32 noundef %1, ptr noundef %2, ptr noundef %3) #1 {
-  %5 = alloca ptr, align 8
-  %6 = alloca i32, align 4
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  store ptr %0, ptr %5, align 8
-  store i32 %1, ptr %6, align 4
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  %9 = load ptr, ptr %5, align 8
-  %10 = load i32, ptr %6, align 4
-  %11 = load ptr, ptr %7, align 8
-  %12 = load ptr, ptr %8, align 8
-  %13 = call i32 @apply_mass_rs(ptr noundef %9, i32 noundef %10, ptr noundef %11, ptr noundef %12) #5
-  ret i32 %13
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noalias noundef %1, ptr noundef %2, ptr noalias noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [6 x double], align 8
-  %10 = alloca [6 x double], align 8
-  %11 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %11, align 4
-  br label %12
-
-12:                                               ; preds = %36, %4
-  %13 = load i32, ptr %11, align 4
-  %14 = icmp slt i32 %13, 1
-  br i1 %14, label %15, label %39
-
-15:                                               ; preds = %12
-  %16 = load ptr, ptr %5, align 8
-  %17 = load ptr, ptr %6, align 8
-  %18 = load i32, ptr %11, align 4
-  %19 = mul nsw i32 %18, 6
-  %20 = sext i32 %19 to i64
-  %21 = getelementptr inbounds double, ptr %17, i64 %20
-  %22 = load ptr, ptr %7, align 8
-  %23 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 0
-  call void @_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %16, ptr noundef %21, ptr noundef %22, ptr noundef %23) #5
-  %24 = load ptr, ptr %5, align 8
-  %25 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 0
-  %26 = load ptr, ptr %7, align 8
-  %27 = getelementptr inbounds [6 x double], ptr %10, i64 0, i64 0
-  call void @_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %24, ptr noundef %25, ptr noundef %26, ptr noundef %27) #5
-  %28 = load ptr, ptr %5, align 8
-  %29 = getelementptr inbounds [6 x double], ptr %10, i64 0, i64 0
-  %30 = load ptr, ptr %7, align 8
-  %31 = load ptr, ptr %8, align 8
-  %32 = load i32, ptr %11, align 4
-  %33 = mul nsw i32 %32, 5
-  %34 = sext i32 %33 to i64
-  %35 = getelementptr inbounds double, ptr %31, i64 %34
-  call void @_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %28, ptr noundef %29, ptr noundef %30, ptr noundef %35) #5
-  br label %36
-
-36:                                               ; preds = %15
-  %37 = load i32, ptr %11, align 4
-  %38 = add nsw i32 %37, 1
-  store i32 %38, ptr %11, align 4
-  br label %12, !llvm.loop !16
-
-39:                                               ; preds = %12
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %0, i32 noundef %1, i32 noundef %2, ptr noalias noundef %3, ptr noalias noundef %4, ptr noalias noundef %5) #1 comdat {
-  %7 = alloca ptr, align 8
-  %8 = alloca i32, align 4
-  %9 = alloca i32, align 4
-  %10 = alloca ptr, align 8
-  %11 = alloca ptr, align 8
-  %12 = alloca ptr, align 8
-  %13 = alloca i32, align 4
-  %14 = alloca i32, align 4
-  %15 = alloca i32, align 4
-  %16 = alloca i32, align 4
-  store ptr %0, ptr %7, align 8
-  store i32 %1, ptr %8, align 4
-  store i32 %2, ptr %9, align 4
-  store ptr %3, ptr %10, align 8
-  store ptr %4, ptr %11, align 8
-  store ptr %5, ptr %12, align 8
-  %17 = load ptr, ptr %7, align 8
-  %18 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 0
-  %19 = load i32, ptr %18, align 8
-  %20 = icmp slt i32 %19, 5
-  br i1 %20, label %21, label %81
-
-21:                                               ; preds = %6
-  %22 = load ptr, ptr %7, align 8
-  %23 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %22, i32 0, i32 1
-  %24 = load i32, ptr %23, align 4
-  %25 = icmp slt i32 %24, 5
-  br i1 %25, label %26, label %81
-
-26:                                               ; preds = %21
-  store i32 0, ptr %13, align 4
-  br label %27
-
-27:                                               ; preds = %77, %26
-  %28 = load i32, ptr %13, align 4
-  %29 = icmp slt i32 %28, 5
-  br i1 %29, label %30, label %80
-
-30:                                               ; preds = %27
-  %31 = load ptr, ptr %7, align 8
-  %32 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %31, i32 0, i32 0
-  %33 = load i32, ptr %32, align 8
-  %34 = load ptr, ptr %7, align 8
-  %35 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %34, i32 0, i32 1
-  %36 = load i32, ptr %35, align 4
-  %37 = mul nsw i32 %36, 5
-  %38 = add nsw i32 %33, %37
-  %39 = load i32, ptr %13, align 4
-  %40 = mul nsw i32 %39, 5
-  %41 = mul nsw i32 %40, 5
-  %42 = add nsw i32 %38, %41
-  store i32 %42, ptr %14, align 4
-  %43 = load ptr, ptr %10, align 8
-  %44 = load i32, ptr %14, align 4
-  %45 = load i32, ptr %9, align 4
-  %46 = mul nsw i32 %45, 5
-  %47 = mul nsw i32 %46, 5
-  %48 = mul nsw i32 %47, 5
-  %49 = add nsw i32 %44, %48
-  %50 = sext i32 %49 to i64
-  %51 = getelementptr inbounds i32, ptr %43, i64 %50
-  %52 = load i32, ptr %51, align 4
-  store i32 %52, ptr %15, align 4
-  store i32 0, ptr %16, align 4
-  br label %53
-
-53:                                               ; preds = %73, %30
-  %54 = load i32, ptr %16, align 4
-  %55 = icmp slt i32 %54, 1
-  br i1 %55, label %56, label %76
-
-56:                                               ; preds = %53
-  %57 = load ptr, ptr %12, align 8
-  %58 = load i32, ptr %15, align 4
-  %59 = load i32, ptr %16, align 4
-  %60 = mul nsw i32 274625, %59
-  %61 = add nsw i32 %58, %60
-  %62 = sext i32 %61 to i64
-  %63 = getelementptr inbounds double, ptr %57, i64 %62
-  %64 = load ptr, ptr %11, align 8
-  %65 = load i32, ptr %13, align 4
-  %66 = load i32, ptr %16, align 4
-  %67 = mul nsw i32 %66, 5
-  %68 = add nsw i32 %65, %67
-  %69 = sext i32 %68 to i64
-  %70 = getelementptr inbounds double, ptr %64, i64 %69
-  %71 = load double, ptr %70, align 8
-  %72 = call contract noundef double @_ZL9atomicAddPdd(ptr noundef %63, double noundef %71) #5
-  br label %73
-
-73:                                               ; preds = %56
-  %74 = load i32, ptr %16, align 4
-  %75 = add nsw i32 %74, 1
-  store i32 %75, ptr %16, align 4
-  br label %53, !llvm.loop !17
-
-76:                                               ; preds = %53
-  br label %77
-
-77:                                               ; preds = %76
-  %78 = load i32, ptr %13, align 4
-  %79 = add nsw i32 %78, 1
-  store i32 %79, ptr %13, align 4
-  br label %27, !llvm.loop !18
-
-80:                                               ; preds = %27
-  br label %81
-
-81:                                               ; preds = %80, %21, %6
-  ret void
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.z() #3
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #3
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.z() #3
-
-; Function Attrs: convergent nounwind
-declare dso_local i32 @apply_mass_rs(ptr noundef, i32 noundef, ptr noundef, ptr noundef) #4
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #3
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [5 x double], align 8
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %10, align 4
-  br label %13
-
-13:                                               ; preds = %30, %4
-  %14 = load i32, ptr %10, align 4
-  %15 = icmp slt i32 %14, 5
-  br i1 %15, label %16, label %33
-
-16:                                               ; preds = %13
-  %17 = load ptr, ptr %7, align 8
-  %18 = load i32, ptr %10, align 4
-  %19 = load ptr, ptr %5, align 8
-  %20 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %19, i32 0, i32 0
-  %21 = load i32, ptr %20, align 8
-  %22 = mul nsw i32 %21, 5
-  %23 = add nsw i32 %18, %22
-  %24 = sext i32 %23 to i64
-  %25 = getelementptr inbounds double, ptr %17, i64 %24
-  %26 = load double, ptr %25, align 8
-  %27 = load i32, ptr %10, align 4
-  %28 = sext i32 %27 to i64
-  %29 = getelementptr inbounds [5 x double], ptr %9, i64 0, i64 %28
-  store double %26, ptr %29, align 8
-  br label %30
-
-30:                                               ; preds = %16
-  %31 = load i32, ptr %10, align 4
-  %32 = add nsw i32 %31, 1
-  store i32 %32, ptr %10, align 4
-  br label %13, !llvm.loop !19
-
-33:                                               ; preds = %13
-  store i32 0, ptr %11, align 4
-  br label %34
-
-34:                                               ; preds = %102, %33
-  %35 = load i32, ptr %11, align 4
-  %36 = icmp slt i32 %35, 5
-  br i1 %36, label %37, label %105
-
-37:                                               ; preds = %34
-  call void @llvm.nvvm.barrier0()
-  %38 = load ptr, ptr %6, align 8
-  %39 = load i32, ptr %11, align 4
-  %40 = sext i32 %39 to i64
-  %41 = getelementptr inbounds double, ptr %38, i64 %40
-  %42 = load double, ptr %41, align 8
-  %43 = load ptr, ptr %5, align 8
-  %44 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %43, i32 0, i32 4
-  %45 = load ptr, ptr %44, align 8
-  %46 = load ptr, ptr %5, align 8
-  %47 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %46, i32 0, i32 0
-  %48 = load i32, ptr %47, align 8
-  %49 = load ptr, ptr %5, align 8
-  %50 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %49, i32 0, i32 1
-  %51 = load i32, ptr %50, align 4
-  %52 = mul nsw i32 %51, 6
-  %53 = add nsw i32 %48, %52
-  %54 = sext i32 %53 to i64
-  %55 = getelementptr inbounds double, ptr %45, i64 %54
-  store double %42, ptr %55, align 8
-  call void @llvm.nvvm.barrier0()
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %11, align 4
-  %58 = sext i32 %57 to i64
-  %59 = getelementptr inbounds double, ptr %56, i64 %58
-  store double 0.000000e+00, ptr %59, align 8
-  %60 = load ptr, ptr %5, align 8
-  %61 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %60, i32 0, i32 0
-  %62 = load i32, ptr %61, align 8
-  %63 = icmp slt i32 %62, 6
-  br i1 %63, label %64, label %101
-
-64:                                               ; preds = %37
-  %65 = load ptr, ptr %5, align 8
-  %66 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %65, i32 0, i32 1
-  %67 = load i32, ptr %66, align 4
-  %68 = icmp slt i32 %67, 5
-  br i1 %68, label %69, label %101
-
-69:                                               ; preds = %64
-  store i32 0, ptr %12, align 4
-  br label %70
-
-70:                                               ; preds = %97, %69
-  %71 = load i32, ptr %12, align 4
-  %72 = icmp slt i32 %71, 5
-  br i1 %72, label %73, label %100
-
-73:                                               ; preds = %70
-  %74 = load i32, ptr %12, align 4
-  %75 = sext i32 %74 to i64
-  %76 = getelementptr inbounds [5 x double], ptr %9, i64 0, i64 %75
-  %77 = load double, ptr %76, align 8
-  %78 = load ptr, ptr %5, align 8
-  %79 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %78, i32 0, i32 4
-  %80 = load ptr, ptr %79, align 8
-  %81 = load i32, ptr %12, align 4
-  %82 = load ptr, ptr %5, align 8
-  %83 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %82, i32 0, i32 1
-  %84 = load i32, ptr %83, align 4
-  %85 = mul nsw i32 %84, 6
-  %86 = add nsw i32 %81, %85
-  %87 = sext i32 %86 to i64
-  %88 = getelementptr inbounds double, ptr %80, i64 %87
-  %89 = load double, ptr %88, align 8
-  %90 = fmul contract double %77, %89
-  %91 = load ptr, ptr %8, align 8
-  %92 = load i32, ptr %11, align 4
-  %93 = sext i32 %92 to i64
-  %94 = getelementptr inbounds double, ptr %91, i64 %93
-  %95 = load double, ptr %94, align 8
-  %96 = fadd contract double %95, %90
-  store double %96, ptr %94, align 8
-  br label %97
-
-97:                                               ; preds = %73
-  %98 = load i32, ptr %12, align 4
-  %99 = add nsw i32 %98, 1
-  store i32 %99, ptr %12, align 4
-  br label %70, !llvm.loop !20
-
-100:                                              ; preds = %70
-  br label %101
-
-101:                                              ; preds = %100, %64, %37
-  br label %102
-
-102:                                              ; preds = %101
-  %103 = load i32, ptr %11, align 4
-  %104 = add nsw i32 %103, 1
-  store i32 %104, ptr %11, align 4
-  br label %34, !llvm.loop !21
-
-105:                                              ; preds = %34
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [5 x double], align 8
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %10, align 4
-  br label %13
-
-13:                                               ; preds = %30, %4
-  %14 = load i32, ptr %10, align 4
-  %15 = icmp slt i32 %14, 5
-  br i1 %15, label %16, label %33
-
-16:                                               ; preds = %13
-  %17 = load ptr, ptr %7, align 8
-  %18 = load i32, ptr %10, align 4
-  %19 = load ptr, ptr %5, align 8
-  %20 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %19, i32 0, i32 1
-  %21 = load i32, ptr %20, align 4
-  %22 = mul nsw i32 %21, 5
-  %23 = add nsw i32 %18, %22
-  %24 = sext i32 %23 to i64
-  %25 = getelementptr inbounds double, ptr %17, i64 %24
-  %26 = load double, ptr %25, align 8
-  %27 = load i32, ptr %10, align 4
-  %28 = sext i32 %27 to i64
-  %29 = getelementptr inbounds [5 x double], ptr %9, i64 0, i64 %28
-  store double %26, ptr %29, align 8
-  br label %30
-
-30:                                               ; preds = %16
-  %31 = load i32, ptr %10, align 4
-  %32 = add nsw i32 %31, 1
-  store i32 %32, ptr %10, align 4
-  br label %13, !llvm.loop !22
-
-33:                                               ; preds = %13
-  store i32 0, ptr %11, align 4
-  br label %34
-
-34:                                               ; preds = %102, %33
-  %35 = load i32, ptr %11, align 4
-  %36 = icmp slt i32 %35, 5
-  br i1 %36, label %37, label %105
-
-37:                                               ; preds = %34
-  call void @llvm.nvvm.barrier0()
-  %38 = load ptr, ptr %6, align 8
-  %39 = load i32, ptr %11, align 4
-  %40 = sext i32 %39 to i64
-  %41 = getelementptr inbounds double, ptr %38, i64 %40
-  %42 = load double, ptr %41, align 8
-  %43 = load ptr, ptr %5, align 8
-  %44 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %43, i32 0, i32 4
-  %45 = load ptr, ptr %44, align 8
-  %46 = load ptr, ptr %5, align 8
-  %47 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %46, i32 0, i32 0
-  %48 = load i32, ptr %47, align 8
-  %49 = load ptr, ptr %5, align 8
-  %50 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %49, i32 0, i32 1
-  %51 = load i32, ptr %50, align 4
-  %52 = mul nsw i32 %51, 6
-  %53 = add nsw i32 %48, %52
-  %54 = sext i32 %53 to i64
-  %55 = getelementptr inbounds double, ptr %45, i64 %54
-  store double %42, ptr %55, align 8
-  call void @llvm.nvvm.barrier0()
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %11, align 4
-  %58 = sext i32 %57 to i64
-  %59 = getelementptr inbounds double, ptr %56, i64 %58
-  store double 0.000000e+00, ptr %59, align 8
-  %60 = load ptr, ptr %5, align 8
-  %61 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %60, i32 0, i32 0
-  %62 = load i32, ptr %61, align 8
-  %63 = icmp slt i32 %62, 6
-  br i1 %63, label %64, label %101
-
-64:                                               ; preds = %37
-  %65 = load ptr, ptr %5, align 8
-  %66 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %65, i32 0, i32 1
-  %67 = load i32, ptr %66, align 4
-  %68 = icmp slt i32 %67, 6
-  br i1 %68, label %69, label %101
-
-69:                                               ; preds = %64
-  store i32 0, ptr %12, align 4
-  br label %70
-
-70:                                               ; preds = %97, %69
-  %71 = load i32, ptr %12, align 4
-  %72 = icmp slt i32 %71, 5
-  br i1 %72, label %73, label %100
-
-73:                                               ; preds = %70
-  %74 = load i32, ptr %12, align 4
-  %75 = sext i32 %74 to i64
-  %76 = getelementptr inbounds [5 x double], ptr %9, i64 0, i64 %75
-  %77 = load double, ptr %76, align 8
-  %78 = load ptr, ptr %5, align 8
-  %79 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %78, i32 0, i32 4
-  %80 = load ptr, ptr %79, align 8
-  %81 = load ptr, ptr %5, align 8
-  %82 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %81, i32 0, i32 0
-  %83 = load i32, ptr %82, align 8
-  %84 = load i32, ptr %12, align 4
-  %85 = mul nsw i32 %84, 6
-  %86 = add nsw i32 %83, %85
-  %87 = sext i32 %86 to i64
-  %88 = getelementptr inbounds double, ptr %80, i64 %87
-  %89 = load double, ptr %88, align 8
-  %90 = fmul contract double %77, %89
-  %91 = load ptr, ptr %8, align 8
-  %92 = load i32, ptr %11, align 4
-  %93 = sext i32 %92 to i64
-  %94 = getelementptr inbounds double, ptr %91, i64 %93
-  %95 = load double, ptr %94, align 8
-  %96 = fadd contract double %95, %90
-  store double %96, ptr %94, align 8
-  br label %97
-
-97:                                               ; preds = %73
-  %98 = load i32, ptr %12, align 4
-  %99 = add nsw i32 %98, 1
-  store i32 %99, ptr %12, align 4
-  br label %70, !llvm.loop !23
-
-100:                                              ; preds = %70
-  br label %101
-
-101:                                              ; preds = %100, %64, %37
-  br label %102
-
-102:                                              ; preds = %101
-  %103 = load i32, ptr %11, align 4
-  %104 = add nsw i32 %103, 1
-  store i32 %104, ptr %11, align 4
-  br label %34, !llvm.loop !24
-
-105:                                              ; preds = %34
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca i32, align 4
-  %10 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %9, align 4
-  br label %11
-
-11:                                               ; preds = %58, %4
-  %12 = load i32, ptr %9, align 4
-  %13 = icmp slt i32 %12, 6
-  br i1 %13, label %14, label %61
-
-14:                                               ; preds = %11
-  %15 = load ptr, ptr %8, align 8
-  %16 = load i32, ptr %9, align 4
-  %17 = sext i32 %16 to i64
-  %18 = getelementptr inbounds double, ptr %15, i64 %17
-  store double 0.000000e+00, ptr %18, align 8
-  %19 = load ptr, ptr %5, align 8
-  %20 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %19, i32 0, i32 0
-  %21 = load i32, ptr %20, align 8
-  %22 = icmp slt i32 %21, 6
-  br i1 %22, label %23, label %57
-
-23:                                               ; preds = %14
-  %24 = load ptr, ptr %5, align 8
-  %25 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %24, i32 0, i32 1
-  %26 = load i32, ptr %25, align 4
-  %27 = icmp slt i32 %26, 6
-  br i1 %27, label %28, label %57
-
-28:                                               ; preds = %23
-  store i32 0, ptr %10, align 4
-  br label %29
-
-29:                                               ; preds = %53, %28
-  %30 = load i32, ptr %10, align 4
-  %31 = icmp slt i32 %30, 5
-  br i1 %31, label %32, label %56
-
-32:                                               ; preds = %29
-  %33 = load ptr, ptr %7, align 8
-  %34 = load i32, ptr %10, align 4
-  %35 = load i32, ptr %9, align 4
-  %36 = mul nsw i32 %35, 5
-  %37 = add nsw i32 %34, %36
-  %38 = sext i32 %37 to i64
-  %39 = getelementptr inbounds double, ptr %33, i64 %38
-  %40 = load double, ptr %39, align 8
-  %41 = load ptr, ptr %6, align 8
-  %42 = load i32, ptr %10, align 4
-  %43 = sext i32 %42 to i64
-  %44 = getelementptr inbounds double, ptr %41, i64 %43
-  %45 = load double, ptr %44, align 8
-  %46 = fmul contract double %40, %45
-  %47 = load ptr, ptr %8, align 8
-  %48 = load i32, ptr %9, align 4
-  %49 = sext i32 %48 to i64
-  %50 = getelementptr inbounds double, ptr %47, i64 %49
-  %51 = load double, ptr %50, align 8
-  %52 = fadd contract double %51, %46
-  store double %52, ptr %50, align 8
-  br label %53
-
-53:                                               ; preds = %32
-  %54 = load i32, ptr %10, align 4
-  %55 = add nsw i32 %54, 1
-  store i32 %55, ptr %10, align 4
-  br label %29, !llvm.loop !25
-
-56:                                               ; preds = %29
-  br label %57
-
-57:                                               ; preds = %56, %23, %14
-  br label %58
-
-58:                                               ; preds = %57
-  %59 = load i32, ptr %9, align 4
-  %60 = add nsw i32 %59, 1
-  store i32 %60, ptr %9, align 4
-  br label %11, !llvm.loop !26
-
-61:                                               ; preds = %11
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca i32, align 4
-  %10 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %9, align 4
-  br label %11
-
-11:                                               ; preds = %58, %4
-  %12 = load i32, ptr %9, align 4
-  %13 = icmp slt i32 %12, 5
-  br i1 %13, label %14, label %61
-
-14:                                               ; preds = %11
-  %15 = load ptr, ptr %8, align 8
-  %16 = load i32, ptr %9, align 4
-  %17 = sext i32 %16 to i64
-  %18 = getelementptr inbounds double, ptr %15, i64 %17
-  store double 0.000000e+00, ptr %18, align 8
-  %19 = load ptr, ptr %5, align 8
-  %20 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %19, i32 0, i32 0
-  %21 = load i32, ptr %20, align 8
-  %22 = icmp slt i32 %21, 6
-  br i1 %22, label %23, label %57
-
-23:                                               ; preds = %14
-  %24 = load ptr, ptr %5, align 8
-  %25 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %24, i32 0, i32 1
-  %26 = load i32, ptr %25, align 4
-  %27 = icmp slt i32 %26, 6
-  br i1 %27, label %28, label %57
-
-28:                                               ; preds = %23
-  store i32 0, ptr %10, align 4
-  br label %29
-
-29:                                               ; preds = %53, %28
-  %30 = load i32, ptr %10, align 4
-  %31 = icmp slt i32 %30, 6
-  br i1 %31, label %32, label %56
-
-32:                                               ; preds = %29
-  %33 = load ptr, ptr %7, align 8
-  %34 = load i32, ptr %9, align 4
-  %35 = load i32, ptr %10, align 4
-  %36 = mul nsw i32 %35, 5
-  %37 = add nsw i32 %34, %36
-  %38 = sext i32 %37 to i64
-  %39 = getelementptr inbounds double, ptr %33, i64 %38
-  %40 = load double, ptr %39, align 8
-  %41 = load ptr, ptr %6, align 8
-  %42 = load i32, ptr %10, align 4
-  %43 = sext i32 %42 to i64
-  %44 = getelementptr inbounds double, ptr %41, i64 %43
-  %45 = load double, ptr %44, align 8
-  %46 = fmul contract double %40, %45
-  %47 = load ptr, ptr %8, align 8
-  %48 = load i32, ptr %9, align 4
-  %49 = sext i32 %48 to i64
-  %50 = getelementptr inbounds double, ptr %47, i64 %49
-  %51 = load double, ptr %50, align 8
-  %52 = fadd contract double %51, %46
-  store double %52, ptr %50, align 8
-  br label %53
-
-53:                                               ; preds = %32
-  %54 = load i32, ptr %10, align 4
-  %55 = add nsw i32 %54, 1
-  store i32 %55, ptr %10, align 4
-  br label %29, !llvm.loop !27
-
-56:                                               ; preds = %29
-  br label %57
-
-57:                                               ; preds = %56, %23, %14
-  br label %58
-
-58:                                               ; preds = %57
-  %59 = load i32, ptr %9, align 4
-  %60 = add nsw i32 %59, 1
-  store i32 %60, ptr %9, align 4
-  br label %11, !llvm.loop !28
-
-61:                                               ; preds = %11
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [6 x double], align 8
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %10, align 4
-  br label %13
-
-13:                                               ; preds = %30, %4
-  %14 = load i32, ptr %10, align 4
-  %15 = icmp slt i32 %14, 6
-  br i1 %15, label %16, label %33
-
-16:                                               ; preds = %13
-  %17 = load ptr, ptr %7, align 8
-  %18 = load ptr, ptr %5, align 8
-  %19 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %18, i32 0, i32 1
-  %20 = load i32, ptr %19, align 4
-  %21 = load i32, ptr %10, align 4
-  %22 = mul nsw i32 %21, 5
-  %23 = add nsw i32 %20, %22
-  %24 = sext i32 %23 to i64
-  %25 = getelementptr inbounds double, ptr %17, i64 %24
-  %26 = load double, ptr %25, align 8
-  %27 = load i32, ptr %10, align 4
-  %28 = sext i32 %27 to i64
-  %29 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 %28
-  store double %26, ptr %29, align 8
-  br label %30
-
-30:                                               ; preds = %16
-  %31 = load i32, ptr %10, align 4
-  %32 = add nsw i32 %31, 1
-  store i32 %32, ptr %10, align 4
-  br label %13, !llvm.loop !29
-
-33:                                               ; preds = %13
-  store i32 0, ptr %11, align 4
-  br label %34
-
-34:                                               ; preds = %102, %33
-  %35 = load i32, ptr %11, align 4
-  %36 = icmp slt i32 %35, 5
-  br i1 %36, label %37, label %105
-
-37:                                               ; preds = %34
-  call void @llvm.nvvm.barrier0()
-  %38 = load ptr, ptr %6, align 8
-  %39 = load i32, ptr %11, align 4
-  %40 = sext i32 %39 to i64
-  %41 = getelementptr inbounds double, ptr %38, i64 %40
-  %42 = load double, ptr %41, align 8
-  %43 = load ptr, ptr %5, align 8
-  %44 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %43, i32 0, i32 4
-  %45 = load ptr, ptr %44, align 8
-  %46 = load ptr, ptr %5, align 8
-  %47 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %46, i32 0, i32 0
-  %48 = load i32, ptr %47, align 8
-  %49 = load ptr, ptr %5, align 8
-  %50 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %49, i32 0, i32 1
-  %51 = load i32, ptr %50, align 4
-  %52 = mul nsw i32 %51, 6
-  %53 = add nsw i32 %48, %52
-  %54 = sext i32 %53 to i64
-  %55 = getelementptr inbounds double, ptr %45, i64 %54
-  store double %42, ptr %55, align 8
-  call void @llvm.nvvm.barrier0()
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %11, align 4
-  %58 = sext i32 %57 to i64
-  %59 = getelementptr inbounds double, ptr %56, i64 %58
-  store double 0.000000e+00, ptr %59, align 8
-  %60 = load ptr, ptr %5, align 8
-  %61 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %60, i32 0, i32 0
-  %62 = load i32, ptr %61, align 8
-  %63 = icmp slt i32 %62, 6
-  br i1 %63, label %64, label %101
-
-64:                                               ; preds = %37
-  %65 = load ptr, ptr %5, align 8
-  %66 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %65, i32 0, i32 1
-  %67 = load i32, ptr %66, align 4
-  %68 = icmp slt i32 %67, 5
-  br i1 %68, label %69, label %101
-
-69:                                               ; preds = %64
-  store i32 0, ptr %12, align 4
-  br label %70
-
-70:                                               ; preds = %97, %69
-  %71 = load i32, ptr %12, align 4
-  %72 = icmp slt i32 %71, 6
-  br i1 %72, label %73, label %100
-
-73:                                               ; preds = %70
-  %74 = load i32, ptr %12, align 4
-  %75 = sext i32 %74 to i64
-  %76 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 %75
-  %77 = load double, ptr %76, align 8
-  %78 = load ptr, ptr %5, align 8
-  %79 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %78, i32 0, i32 4
-  %80 = load ptr, ptr %79, align 8
-  %81 = load ptr, ptr %5, align 8
-  %82 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %81, i32 0, i32 0
-  %83 = load i32, ptr %82, align 8
-  %84 = load i32, ptr %12, align 4
-  %85 = mul nsw i32 %84, 6
-  %86 = add nsw i32 %83, %85
-  %87 = sext i32 %86 to i64
-  %88 = getelementptr inbounds double, ptr %80, i64 %87
-  %89 = load double, ptr %88, align 8
-  %90 = fmul contract double %77, %89
-  %91 = load ptr, ptr %8, align 8
-  %92 = load i32, ptr %11, align 4
-  %93 = sext i32 %92 to i64
-  %94 = getelementptr inbounds double, ptr %91, i64 %93
-  %95 = load double, ptr %94, align 8
-  %96 = fadd contract double %95, %90
-  store double %96, ptr %94, align 8
-  br label %97
-
-97:                                               ; preds = %73
-  %98 = load i32, ptr %12, align 4
-  %99 = add nsw i32 %98, 1
-  store i32 %99, ptr %12, align 4
-  br label %70, !llvm.loop !30
-
-100:                                              ; preds = %70
-  br label %101
-
-101:                                              ; preds = %100, %64, %37
-  br label %102
-
-102:                                              ; preds = %101
-  %103 = load i32, ptr %11, align 4
-  %104 = add nsw i32 %103, 1
-  store i32 %104, ptr %11, align 4
-  br label %34, !llvm.loop !31
-
-105:                                              ; preds = %34
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #1 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [6 x double], align 8
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %10, align 4
-  br label %13
-
-13:                                               ; preds = %30, %4
-  %14 = load i32, ptr %10, align 4
-  %15 = icmp slt i32 %14, 6
-  br i1 %15, label %16, label %33
-
-16:                                               ; preds = %13
-  %17 = load ptr, ptr %7, align 8
-  %18 = load ptr, ptr %5, align 8
-  %19 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %18, i32 0, i32 0
-  %20 = load i32, ptr %19, align 8
-  %21 = load i32, ptr %10, align 4
-  %22 = mul nsw i32 %21, 5
-  %23 = add nsw i32 %20, %22
-  %24 = sext i32 %23 to i64
-  %25 = getelementptr inbounds double, ptr %17, i64 %24
-  %26 = load double, ptr %25, align 8
-  %27 = load i32, ptr %10, align 4
-  %28 = sext i32 %27 to i64
-  %29 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 %28
-  store double %26, ptr %29, align 8
-  br label %30
-
-30:                                               ; preds = %16
-  %31 = load i32, ptr %10, align 4
-  %32 = add nsw i32 %31, 1
-  store i32 %32, ptr %10, align 4
-  br label %13, !llvm.loop !32
-
-33:                                               ; preds = %13
-  store i32 0, ptr %11, align 4
-  br label %34
-
-34:                                               ; preds = %102, %33
-  %35 = load i32, ptr %11, align 4
-  %36 = icmp slt i32 %35, 5
-  br i1 %36, label %37, label %105
-
-37:                                               ; preds = %34
-  call void @llvm.nvvm.barrier0()
-  %38 = load ptr, ptr %6, align 8
-  %39 = load i32, ptr %11, align 4
-  %40 = sext i32 %39 to i64
-  %41 = getelementptr inbounds double, ptr %38, i64 %40
-  %42 = load double, ptr %41, align 8
-  %43 = load ptr, ptr %5, align 8
-  %44 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %43, i32 0, i32 4
-  %45 = load ptr, ptr %44, align 8
-  %46 = load ptr, ptr %5, align 8
-  %47 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %46, i32 0, i32 0
-  %48 = load i32, ptr %47, align 8
-  %49 = load ptr, ptr %5, align 8
-  %50 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %49, i32 0, i32 1
-  %51 = load i32, ptr %50, align 4
-  %52 = mul nsw i32 %51, 6
-  %53 = add nsw i32 %48, %52
-  %54 = sext i32 %53 to i64
-  %55 = getelementptr inbounds double, ptr %45, i64 %54
-  store double %42, ptr %55, align 8
-  call void @llvm.nvvm.barrier0()
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %11, align 4
-  %58 = sext i32 %57 to i64
-  %59 = getelementptr inbounds double, ptr %56, i64 %58
-  store double 0.000000e+00, ptr %59, align 8
-  %60 = load ptr, ptr %5, align 8
-  %61 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %60, i32 0, i32 0
-  %62 = load i32, ptr %61, align 8
-  %63 = icmp slt i32 %62, 5
-  br i1 %63, label %64, label %101
-
-64:                                               ; preds = %37
-  %65 = load ptr, ptr %5, align 8
-  %66 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %65, i32 0, i32 1
-  %67 = load i32, ptr %66, align 4
-  %68 = icmp slt i32 %67, 5
-  br i1 %68, label %69, label %101
-
-69:                                               ; preds = %64
-  store i32 0, ptr %12, align 4
-  br label %70
-
-70:                                               ; preds = %97, %69
-  %71 = load i32, ptr %12, align 4
-  %72 = icmp slt i32 %71, 6
-  br i1 %72, label %73, label %100
-
-73:                                               ; preds = %70
-  %74 = load i32, ptr %12, align 4
-  %75 = sext i32 %74 to i64
-  %76 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 %75
-  %77 = load double, ptr %76, align 8
-  %78 = load ptr, ptr %5, align 8
-  %79 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %78, i32 0, i32 4
-  %80 = load ptr, ptr %79, align 8
-  %81 = load i32, ptr %12, align 4
-  %82 = load ptr, ptr %5, align 8
-  %83 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %82, i32 0, i32 1
-  %84 = load i32, ptr %83, align 4
-  %85 = mul nsw i32 %84, 6
-  %86 = add nsw i32 %81, %85
-  %87 = sext i32 %86 to i64
-  %88 = getelementptr inbounds double, ptr %80, i64 %87
-  %89 = load double, ptr %88, align 8
-  %90 = fmul contract double %77, %89
-  %91 = load ptr, ptr %8, align 8
-  %92 = load i32, ptr %11, align 4
-  %93 = sext i32 %92 to i64
-  %94 = getelementptr inbounds double, ptr %91, i64 %93
-  %95 = load double, ptr %94, align 8
-  %96 = fadd contract double %95, %90
-  store double %96, ptr %94, align 8
-  br label %97
-
-97:                                               ; preds = %73
-  %98 = load i32, ptr %12, align 4
-  %99 = add nsw i32 %98, 1
-  store i32 %99, ptr %12, align 4
-  br label %70, !llvm.loop !33
-
-100:                                              ; preds = %70
-  br label %101
-
-101:                                              ; preds = %100, %64, %37
-  br label %102
-
-102:                                              ; preds = %101
-  %103 = load i32, ptr %11, align 4
-  %104 = add nsw i32 %103, 1
-  store i32 %104, ptr %11, align 4
-  br label %34, !llvm.loop !34
-
-105:                                              ; preds = %34
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define internal noundef double @_ZL9atomicAddPdd(ptr noundef %0, double noundef %1) #1 {
-  %3 = alloca ptr, align 8
-  %4 = alloca double, align 8
-  %5 = alloca ptr, align 8
-  %6 = alloca double, align 8
-  store ptr %0, ptr %5, align 8
-  store double %1, ptr %6, align 8
-  %7 = load ptr, ptr %5, align 8
-  %8 = load double, ptr %6, align 8
-  store ptr %7, ptr %3, align 8
-  store double %8, ptr %4, align 8
-  %9 = load ptr, ptr %3, align 8
-  %10 = load double, ptr %4, align 8
-  %11 = atomicrmw fadd ptr %9, double %10 seq_cst, align 8
-  ret double %11
-}
-
-attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_89" "target-features"="+ptx87,+sm_89" "uniform-work-group-size"="true" }
-attributes #1 = { convergent mustprogress noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_89" "target-features"="+ptx87,+sm_89" }
-attributes #2 = { convergent nocallback nounwind }
-attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-attributes #4 = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_89" "target-features"="+ptx87,+sm_89" }
-attributes #5 = { convergent nounwind }
-
-!llvm.module.flags = !{!0, !1, !2, !3}
-!nvvm.annotations = !{!4}
-!llvm.ident = !{!5, !6}
-!nvvmir.version = !{!7}
-
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 8]}
-!1 = !{i32 1, !"wchar_size", i32 4}
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{i32 7, !"frame-pointer", i32 2}
-!4 = !{ptr @CeedKernelCudaGenOperator_apply_mass}
-!5 = !{!"clang version 20.1.8"}
-!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
-!7 = !{i32 2, i32 0}
-!8 = distinct !{!8, !9}
-!9 = !{!"llvm.loop.mustprogress"}
-!10 = distinct !{!10, !9}
-!11 = distinct !{!11, !9}
-!12 = distinct !{!12, !9}
-!13 = distinct !{!13, !9}
-!14 = distinct !{!14, !9}
-!15 = distinct !{!15, !9}
-!16 = distinct !{!16, !9}
-!17 = distinct !{!17, !9}
-!18 = distinct !{!18, !9}
-!19 = distinct !{!19, !9}
-!20 = distinct !{!20, !9}
-!21 = distinct !{!21, !9}
-!22 = distinct !{!22, !9}
-!23 = distinct !{!23, !9}
-!24 = distinct !{!24, !9}
-!25 = distinct !{!25, !9}
-!26 = distinct !{!26, !9}
-!27 = distinct !{!27, !9}
-!28 = distinct !{!28, !9}
-!29 = distinct !{!29, !9}
-!30 = distinct !{!30, !9}
-!31 = distinct !{!31, !9}
-!32 = distinct !{!32, !9}
-!33 = distinct !{!33, !9}
-!34 = distinct !{!34, !9}
diff --git a/temp_kernel_final.ptx b/temp_kernel_final.ptx
deleted file mode 100644
index f459dfd4..00000000
--- a/temp_kernel_final.ptx
+++ /dev/null
@@ -1,2012 +0,0 @@
-//
-// Generated by LLVM NVPTX Back-End
-//
-
-.version 7.8
-.target sm_89
-.address_size 64
-
-.func _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd
-(
-	.param .b64 _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_0,
-	.param .b64 _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_1,
-	.param .b64 _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_2
-)
-;
-.func _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd
-(
-	.param .b64 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_0,
-	.param .b32 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_1,
-	.param .b32 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_2,
-	.param .b64 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_3,
-	.param .b64 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_4,
-	.param .b64 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_5
-)
-;
-.func _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-(
-	.param .b64 _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)
-;
-.func _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd
-(
-	.param .b64 _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_0,
-	.param .b32 _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_1,
-	.param .b64 _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_2,
-	.param .b64 _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_3
-)
-;
-.func  (.param .b32 func_retval0) _ZL10apply_massPviPKPKdPKPd
-(
-	.param .b64 _ZL10apply_massPviPKPKdPKPd_param_0,
-	.param .b32 _ZL10apply_massPviPKPKdPKPd_param_1,
-	.param .b64 _ZL10apply_massPviPKPKdPKPd_param_2,
-	.param .b64 _ZL10apply_massPviPKPKdPKPd_param_3
-)
-;
-.func _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-(
-	.param .b64 _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)
-;
-.func _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd
-(
-	.param .b64 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_0,
-	.param .b32 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_1,
-	.param .b32 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_2,
-	.param .b64 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_3,
-	.param .b64 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_4,
-	.param .b64 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_5
-)
-;
-.func  (.param .b64 func_retval0) _ZL9atomicAddPdd
-(
-	.param .b64 _ZL9atomicAddPdd_param_0,
-	.param .b64 _ZL9atomicAddPdd_param_1
-)
-;
-.func _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-(
-	.param .b64 _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)
-;
-.func _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-(
-	.param .b64 _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)
-;
-.func _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-(
-	.param .b64 _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)
-;
-.func _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-(
-	.param .b64 _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)
-;
-.func _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-(
-	.param .b64 _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)
-;
-.func _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-(
-	.param .b64 _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)
-;
-.func  (.param .b32 func_retval0) apply_mass_rs
-(
-	.param .b64 apply_mass_rs_param_0,
-	.param .b32 apply_mass_rs_param_1,
-	.param .b64 apply_mass_rs_param_2,
-	.param .b64 apply_mass_rs_param_3
-)
-;
-.extern .shared .align 8 .b8 slice[];
-// _ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 has been demoted
-                                        // -- Begin function CeedKernelCudaGenOperator_apply_mass
-                                        // @CeedKernelCudaGenOperator_apply_mass
-.entry CeedKernelCudaGenOperator_apply_mass(
-	.param .u32 CeedKernelCudaGenOperator_apply_mass_param_0,
-	.param .u64 .ptr .align 1 CeedKernelCudaGenOperator_apply_mass_param_1,
-	.param .align 16 .b8 CeedKernelCudaGenOperator_apply_mass_param_2[256],
-	.param .align 16 .b8 CeedKernelCudaGenOperator_apply_mass_param_3[256],
-	.param .align 16 .b8 CeedKernelCudaGenOperator_apply_mass_param_4[256],
-	.param .align 16 .b8 CeedKernelCudaGenOperator_apply_mass_param_5[256],
-	.param .u64 .ptr .align 1 CeedKernelCudaGenOperator_apply_mass_param_6,
-	.param .align 16 .b8 CeedKernelCudaGenOperator_apply_mass_param_7[32]
-)
-{
-	.local .align 8 .b8 	__local_depot0[432];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<2>;
-	.reg .b32 	%r<38>;
-	.reg .b64 	%rd<60>;
-	// demoted variable
-	.shared .align 8 .b8 _ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0[240];
-// %bb.0:
-	mov.u64 	%SPL, __local_depot0;
-	cvta.local.u64 	%SP, %SPL;
-	mov.b64 	%rd8, CeedKernelCudaGenOperator_apply_mass_param_7;
-	ld.param.u64 	%rd7, [CeedKernelCudaGenOperator_apply_mass_param_6];
-	mov.b64 	%rd6, CeedKernelCudaGenOperator_apply_mass_param_5;
-	mov.b64 	%rd5, CeedKernelCudaGenOperator_apply_mass_param_4;
-	mov.b64 	%rd4, CeedKernelCudaGenOperator_apply_mass_param_3;
-	mov.b64 	%rd3, CeedKernelCudaGenOperator_apply_mass_param_2;
-	ld.param.u64 	%rd2, [CeedKernelCudaGenOperator_apply_mass_param_1];
-	ld.param.u32 	%r1, [CeedKernelCudaGenOperator_apply_mass_param_0];
-	cvta.to.global.u64 	%rd9, %rd7;
-	cvta.global.u64 	%rd10, %rd9;
-	mov.u64 	%rd1, %rd3;
-	cvta.to.global.u64 	%rd11, %rd2;
-	cvta.global.u64 	%rd12, %rd11;
-	st.u32 	[%SP], %r1;
-	st.u64 	[%SP+8], %rd12;
-	st.u64 	[%SP+16], %rd10;
-	ld.param.u64 	%rd13, [CeedKernelCudaGenOperator_apply_mass_param_3];
-	cvta.to.global.u64 	%rd14, %rd13;
-	cvta.global.u64 	%rd15, %rd14;
-	st.u64 	[%SP+24], %rd15;
-	ld.param.u64 	%rd16, [CeedKernelCudaGenOperator_apply_mass_param_3+8];
-	cvta.to.global.u64 	%rd17, %rd16;
-	cvta.global.u64 	%rd18, %rd17;
-	st.u64 	[%SP+32], %rd18;
-	ld.param.u64 	%rd19, [CeedKernelCudaGenOperator_apply_mass_param_3+128];
-	cvta.to.global.u64 	%rd20, %rd19;
-	cvta.global.u64 	%rd21, %rd20;
-	st.u64 	[%SP+40], %rd21;
-	mov.b32 	%r2, 3;
-	st.u32 	[%SP+48], %r2;
-	mov.b32 	%r3, 6;
-	st.u32 	[%SP+52], %r3;
-	mov.u32 	%r4, %tid.x;
-	st.u32 	[%SP+56], %r4;
-	mov.u32 	%r5, %tid.y;
-	st.u32 	[%SP+60], %r5;
-	mov.u32 	%r6, %tid.z;
-	st.u32 	[%SP+64], %r6;
-	mov.u32 	%r7, %ntid.x;
-	mul.lo.s32 	%r8, %r5, %r7;
-	add.s32 	%r9, %r4, %r8;
-	mov.u32 	%r10, %ntid.y;
-	mul.lo.s32 	%r11, %r6, %r10;
-	mul.lo.s32 	%r12, %r11, %r7;
-	add.s32 	%r13, %r9, %r12;
-	st.u32 	[%SP+68], %r13;
-	ld.u32 	%r14, [%SP+64];
-	mul.lo.s32 	%r15, %r14, 36;
-	mov.u64 	%rd22, slice;
-	cvta.shared.u64 	%rd23, %rd22;
-	mul.wide.s32 	%rd24, %r15, 8;
-	add.s64 	%rd25, %rd23, %rd24;
-	st.u64 	[%SP+72], %rd25;
-	st.u32 	[%SP+80], %r2;
-	mov.b32 	%r16, 5;
-	st.u32 	[%SP+84], %r16;
-	mov.b32 	%r17, 1;
-	st.u32 	[%SP+88], %r17;
-	ld.param.u64 	%rd26, [CeedKernelCudaGenOperator_apply_mass_param_4];
-	cvta.to.global.u64 	%rd27, %rd26;
-	cvta.global.u64 	%rd28, %rd27;
-	mov.u64 	%rd29, _ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0;
-	cvta.shared.u64 	%rd30, %rd29;
-	add.u64 	%rd31, %SP, 56;
-	{ // callseq 0, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd31;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd28;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd30;
-	call.uni 
-	_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd, 
-	(
-	param0, 
-	param1, 
-	param2
-	);
-	} // callseq 0
-	st.u32 	[%SP+92], %r2;
-	st.u32 	[%SP+96], %r3;
-	st.u32 	[%SP+100], %r17;
-	st.u32 	[%SP+104], %r2;
-	st.u32 	[%SP+108], %r16;
-	st.u32 	[%SP+112], %r17;
-	st.u64 	[%SP+120], %rd30;
-	bar.sync 	0;
-	mov.u32 	%r18, %ctaid.x;
-	mov.u32 	%r19, %ntid.z;
-	mul.lo.s32 	%r20, %r18, %r19;
-	add.s32 	%r21, %r20, %r6;
-	st.u32 	[%SP+128], %r21;
-	bra.uni 	$L__BB0_1;
-$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
-	ld.u32 	%r22, [%SP+128];
-	ld.u32 	%r23, [%SP];
-	setp.ge.s32 	%p1, %r22, %r23;
-	@%p1 bra 	$L__BB0_4;
-	bra.uni 	$L__BB0_2;
-$L__BB0_2:                              //   in Loop: Header=BB0_1 Depth=1
-	add.u64 	%rd32, %SP, 136;
-	st.u64 	[%SP+184], %rd32;
-	mov.b32 	%r24, 274625;
-	st.u32 	[%SP+192], %r24;
-	st.u32 	[%SP+196], %r24;
-	ld.u32 	%r25, [%SP+128];
-	ld.param.u64 	%rd33, [%rd1];
-	cvta.to.global.u64 	%rd34, %rd33;
-	cvta.global.u64 	%rd35, %rd34;
-	ld.u64 	%rd36, [%SP+24];
-	ld.u64 	%rd37, [%SP+184];
-	add.u64 	%rd38, %SP, 56;
-	{ // callseq 1, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd38;
-	.param .b32 param1;
-	st.param.b32 	[param1], 274625;
-	.param .b32 param2;
-	st.param.b32 	[param2], %r25;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd35;
-	.param .b64 param4;
-	st.param.b64 	[param4], %rd36;
-	.param .b64 param5;
-	st.param.b64 	[param5], %rd37;
-	call.uni 
-	_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3, 
-	param4, 
-	param5
-	);
-	} // callseq 1
-	ld.u64 	%rd39, [%SP+184];
-	mov.u64 	%rd40, _ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0;
-	cvta.shared.u64 	%rd41, %rd40;
-	add.u64 	%rd42, %SP, 200;
-	{ // callseq 2, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd38;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd39;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd41;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd42;
-	call.uni 
-	_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 2
-	mov.b32 	%r26, 1;
-	st.u32 	[%SP+296], %r26;
-	mov.b32 	%r27, 884736;
-	st.u32 	[%SP+300], %r27;
-	mov.b32 	%r28, 216;
-	st.u32 	[%SP+304], %r28;
-	ld.u32 	%r29, [%SP+128];
-	ld.u64 	%rd43, [%SP+32];
-	add.u64 	%rd44, %SP, 248;
-	{ // callseq 3, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd38;
-	.param .b32 param1;
-	st.param.b32 	[param1], %r29;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd43;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd44;
-	call.uni 
-	_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 3
-	st.u64 	[%SP+312], %rd44;
-	st.u64 	[%SP+368], %rd42;
-	ld.u64 	%rd45, [%SP+312];
-	st.u64 	[%SP+376], %rd45;
-	add.u64 	%rd46, %SP, 320;
-	st.u64 	[%SP+384], %rd46;
-	ld.u64 	%rd47, [%SP+368];
-	st.u64 	[%SP+392], %rd47;
-	ld.u64 	%rd48, [%SP+376];
-	st.u64 	[%SP+400], %rd48;
-	ld.u64 	%rd49, [%SP+384];
-	st.u64 	[%SP+408], %rd49;
-	ld.u64 	%rd50, [%SP+8];
-	add.u64 	%rd51, %SP, 392;
-	add.u64 	%rd52, %SP, 408;
-	{ // callseq 4, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd50;
-	.param .b32 param1;
-	st.param.b32 	[param1], 6;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd51;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd52;
-	.param .b32 retval0;
-	call.uni (retval0), 
-	_ZL10apply_massPviPKPKdPKPd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	ld.param.b32 	%r30, [retval0];
-	} // callseq 4
-	st.u64 	[%SP+416], %rd32;
-	ld.u64 	%rd53, [%SP+120];
-	ld.u64 	%rd54, [%SP+416];
-	{ // callseq 5, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd38;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd46;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd53;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd54;
-	call.uni 
-	_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 5
-	st.u32 	[%SP+424], %r24;
-	st.u32 	[%SP+428], %r24;
-	ld.u32 	%r32, [%SP+128];
-	ld.param.u64 	%rd55, [%rd1+128];
-	cvta.to.global.u64 	%rd56, %rd55;
-	cvta.global.u64 	%rd57, %rd56;
-	ld.u64 	%rd58, [%SP+416];
-	ld.u64 	%rd59, [%SP+40];
-	{ // callseq 6, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd38;
-	.param .b32 param1;
-	st.param.b32 	[param1], 274625;
-	.param .b32 param2;
-	st.param.b32 	[param2], %r32;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd57;
-	.param .b64 param4;
-	st.param.b64 	[param4], %rd58;
-	.param .b64 param5;
-	st.param.b64 	[param5], %rd59;
-	call.uni 
-	_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3, 
-	param4, 
-	param5
-	);
-	} // callseq 6
-	bra.uni 	$L__BB0_3;
-$L__BB0_3:                              //   in Loop: Header=BB0_1 Depth=1
-	mov.u32 	%r33, %nctaid.x;
-	mov.u32 	%r34, %ntid.z;
-	mul.lo.s32 	%r35, %r33, %r34;
-	ld.u32 	%r36, [%SP+128];
-	add.s32 	%r37, %r36, %r35;
-	st.u32 	[%SP+128], %r37;
-	bra.uni 	$L__BB0_1;
-$L__BB0_4:
-	ret;
-                                        // -- End function
-}
-.func _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd(
-	.param .b64 _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_0,
-	.param .b64 _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_1,
-	.param .b64 _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_2
-)                                       // -- Begin function _Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd
-                                        // @_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd
-{
-	.local .align 8 .b8 	__local_depot1[32];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<2>;
-	.reg .b32 	%r<10>;
-	.reg .b64 	%rd<11>;
-	.reg .f64 	%fd<2>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot1;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd3, [_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_2];
-	ld.param.u64 	%rd2, [_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_1];
-	ld.param.u64 	%rd1, [_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	ld.u64 	%rd4, [%SP];
-	ld.u32 	%r1, [%rd4+12];
-	st.u32 	[%SP+24], %r1;
-	bra.uni 	$L__BB1_1;
-$L__BB1_1:                              // =>This Inner Loop Header: Depth=1
-	ld.u32 	%r2, [%SP+24];
-	setp.gt.s32 	%p1, %r2, 29;
-	@%p1 bra 	$L__BB1_4;
-	bra.uni 	$L__BB1_2;
-$L__BB1_2:                              //   in Loop: Header=BB1_1 Depth=1
-	ld.u64 	%rd5, [%SP+8];
-	ld.s32 	%rd6, [%SP+24];
-	shl.b64 	%rd7, %rd6, 3;
-	add.s64 	%rd8, %rd5, %rd7;
-	ld.f64 	%fd1, [%rd8];
-	ld.u64 	%rd9, [%SP+16];
-	add.s64 	%rd10, %rd9, %rd7;
-	st.f64 	[%rd10], %fd1;
-	bra.uni 	$L__BB1_3;
-$L__BB1_3:                              //   in Loop: Header=BB1_1 Depth=1
-	mov.u32 	%r3, %ntid.x;
-	mov.u32 	%r4, %ntid.y;
-	mul.lo.s32 	%r5, %r3, %r4;
-	mov.u32 	%r6, %ntid.z;
-	mul.lo.s32 	%r7, %r5, %r6;
-	ld.u32 	%r8, [%SP+24];
-	add.s32 	%r9, %r8, %r7;
-	st.u32 	[%SP+24], %r9;
-	bra.uni 	$L__BB1_1;
-$L__BB1_4:
-	ret;
-                                        // -- End function
-}
-.func _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(
-	.param .b64 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_0,
-	.param .b32 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_1,
-	.param .b32 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_2,
-	.param .b64 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_3,
-	.param .b64 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_4,
-	.param .b64 _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_5
-)                                       // -- Begin function _Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd
-                                        // @_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd
-{
-	.local .align 8 .b8 	__local_depot2[56];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<5>;
-	.reg .b32 	%r<32>;
-	.reg .b64 	%rd<17>;
-	.reg .f64 	%fd<2>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot2;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_5];
-	ld.param.u64 	%rd3, [_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_4];
-	ld.param.u64 	%rd2, [_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_3];
-	ld.param.u32 	%r2, [_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_2];
-	ld.param.u32 	%r1, [_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_1];
-	ld.param.u64 	%rd1, [_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u32 	[%SP+8], %r1;
-	st.u32 	[%SP+12], %r2;
-	st.u64 	[%SP+16], %rd2;
-	st.u64 	[%SP+24], %rd3;
-	st.u64 	[%SP+32], %rd4;
-	ld.u64 	%rd5, [%SP];
-	ld.u32 	%r3, [%rd5];
-	setp.gt.s32 	%p1, %r3, 4;
-	@%p1 bra 	$L__BB2_11;
-	bra.uni 	$L__BB2_1;
-$L__BB2_1:
-	ld.u64 	%rd6, [%SP];
-	ld.u32 	%r4, [%rd6+4];
-	setp.gt.s32 	%p2, %r4, 4;
-	@%p2 bra 	$L__BB2_11;
-	bra.uni 	$L__BB2_2;
-$L__BB2_2:
-	mov.b32 	%r5, 0;
-	st.u32 	[%SP+40], %r5;
-	bra.uni 	$L__BB2_3;
-$L__BB2_3:                              // =>This Loop Header: Depth=1
-                                        //     Child Loop BB2_5 Depth 2
-	ld.u32 	%r6, [%SP+40];
-	setp.gt.s32 	%p3, %r6, 4;
-	@%p3 bra 	$L__BB2_10;
-	bra.uni 	$L__BB2_4;
-$L__BB2_4:                              //   in Loop: Header=BB2_3 Depth=1
-	ld.u64 	%rd7, [%SP];
-	ld.u32 	%r7, [%rd7];
-	ld.u32 	%r8, [%rd7+4];
-	mul.lo.s32 	%r9, %r8, 5;
-	add.s32 	%r10, %r7, %r9;
-	ld.u32 	%r11, [%SP+40];
-	mul.lo.s32 	%r12, %r11, 25;
-	add.s32 	%r13, %r10, %r12;
-	st.u32 	[%SP+44], %r13;
-	ld.u64 	%rd8, [%SP+16];
-	ld.u32 	%r14, [%SP+44];
-	ld.u32 	%r15, [%SP+12];
-	mul.lo.s32 	%r16, %r15, 125;
-	add.s32 	%r17, %r14, %r16;
-	mul.wide.s32 	%rd9, %r17, 4;
-	add.s64 	%rd10, %rd8, %rd9;
-	ld.u32 	%r18, [%rd10];
-	st.u32 	[%SP+48], %r18;
-	mov.b32 	%r19, 0;
-	st.u32 	[%SP+52], %r19;
-	bra.uni 	$L__BB2_5;
-$L__BB2_5:                              //   Parent Loop BB2_3 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r20, [%SP+52];
-	setp.gt.s32 	%p4, %r20, 0;
-	@%p4 bra 	$L__BB2_8;
-	bra.uni 	$L__BB2_6;
-$L__BB2_6:                              //   in Loop: Header=BB2_5 Depth=2
-	ld.u64 	%rd11, [%SP+24];
-	ld.u32 	%r23, [%SP+48];
-	ld.u32 	%r24, [%SP+52];
-	mul.lo.s32 	%r25, %r24, 274625;
-	add.s32 	%r26, %r23, %r25;
-	mul.wide.s32 	%rd12, %r26, 8;
-	add.s64 	%rd13, %rd11, %rd12;
-	ld.f64 	%fd1, [%rd13];
-	ld.u64 	%rd14, [%SP+32];
-	ld.u32 	%r27, [%SP+40];
-	mul.lo.s32 	%r28, %r24, 5;
-	add.s32 	%r29, %r27, %r28;
-	mul.wide.s32 	%rd15, %r29, 8;
-	add.s64 	%rd16, %rd14, %rd15;
-	st.f64 	[%rd16], %fd1;
-	bra.uni 	$L__BB2_7;
-$L__BB2_7:                              //   in Loop: Header=BB2_5 Depth=2
-	ld.u32 	%r30, [%SP+52];
-	add.s32 	%r31, %r30, 1;
-	st.u32 	[%SP+52], %r31;
-	bra.uni 	$L__BB2_5;
-$L__BB2_8:                              //   in Loop: Header=BB2_3 Depth=1
-	bra.uni 	$L__BB2_9;
-$L__BB2_9:                              //   in Loop: Header=BB2_3 Depth=1
-	ld.u32 	%r21, [%SP+40];
-	add.s32 	%r22, %r21, 1;
-	st.u32 	[%SP+40], %r22;
-	bra.uni 	$L__BB2_3;
-$L__BB2_10:
-	bra.uni 	$L__BB2_11;
-$L__BB2_11:
-	ret;
-                                        // -- End function
-}
-.func _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(
-	.param .b64 _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)                                       // -- Begin function _Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-                                        // @_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-{
-	.local .align 8 .b8 	__local_depot3[136];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<2>;
-	.reg .b32 	%r<9>;
-	.reg .b64 	%rd<19>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot3;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3];
-	ld.param.u64 	%rd3, [_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2];
-	ld.param.u64 	%rd2, [_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1];
-	ld.param.u64 	%rd1, [_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	st.u64 	[%SP+24], %rd4;
-	mov.b32 	%r1, 0;
-	st.u32 	[%SP+128], %r1;
-	bra.uni 	$L__BB3_1;
-$L__BB3_1:                              // =>This Inner Loop Header: Depth=1
-	ld.u32 	%r2, [%SP+128];
-	setp.gt.s32 	%p1, %r2, 0;
-	@%p1 bra 	$L__BB3_4;
-	bra.uni 	$L__BB3_2;
-$L__BB3_2:                              //   in Loop: Header=BB3_1 Depth=1
-	ld.u64 	%rd5, [%SP];
-	ld.u64 	%rd6, [%SP+8];
-	ld.u32 	%r3, [%SP+128];
-	mul.lo.s32 	%r4, %r3, 5;
-	mul.wide.s32 	%rd7, %r4, 8;
-	add.s64 	%rd8, %rd6, %rd7;
-	ld.u64 	%rd9, [%SP+16];
-	add.u64 	%rd10, %SP, 32;
-	{ // callseq 7, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd5;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd8;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd9;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd10;
-	call.uni 
-	_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 7
-	ld.u64 	%rd11, [%SP];
-	ld.u64 	%rd12, [%SP+16];
-	add.u64 	%rd13, %SP, 80;
-	{ // callseq 8, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd11;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd10;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd12;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd13;
-	call.uni 
-	_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 8
-	ld.u64 	%rd14, [%SP];
-	ld.u64 	%rd15, [%SP+16];
-	ld.u64 	%rd16, [%SP+24];
-	ld.u32 	%r5, [%SP+128];
-	mul.lo.s32 	%r6, %r5, 6;
-	mul.wide.s32 	%rd17, %r6, 8;
-	add.s64 	%rd18, %rd16, %rd17;
-	{ // callseq 9, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd14;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd13;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd15;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd18;
-	call.uni 
-	_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 9
-	bra.uni 	$L__BB3_3;
-$L__BB3_3:                              //   in Loop: Header=BB3_1 Depth=1
-	ld.u32 	%r7, [%SP+128];
-	add.s32 	%r8, %r7, 1;
-	st.u32 	[%SP+128], %r8;
-	bra.uni 	$L__BB3_1;
-$L__BB3_4:
-	ret;
-                                        // -- End function
-}
-.func _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd(
-	.param .b64 _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_0,
-	.param .b32 _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_1,
-	.param .b64 _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_2,
-	.param .b64 _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_3
-)                                       // -- Begin function _Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd
-                                        // @_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd
-{
-	.local .align 8 .b8 	__local_depot4[48];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<5>;
-	.reg .b32 	%r<30>;
-	.reg .b64 	%rd<13>;
-	.reg .f64 	%fd<2>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot4;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd3, [_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_3];
-	ld.param.u64 	%rd2, [_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_2];
-	ld.param.u32 	%r1, [_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_1];
-	ld.param.u64 	%rd1, [_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u32 	[%SP+8], %r1;
-	st.u64 	[%SP+16], %rd2;
-	st.u64 	[%SP+24], %rd3;
-	ld.u64 	%rd4, [%SP];
-	ld.u32 	%r2, [%rd4];
-	setp.gt.s32 	%p1, %r2, 5;
-	@%p1 bra 	$L__BB4_11;
-	bra.uni 	$L__BB4_1;
-$L__BB4_1:
-	ld.u64 	%rd5, [%SP];
-	ld.u32 	%r3, [%rd5+4];
-	setp.gt.s32 	%p2, %r3, 5;
-	@%p2 bra 	$L__BB4_11;
-	bra.uni 	$L__BB4_2;
-$L__BB4_2:
-	mov.b32 	%r4, 0;
-	st.u32 	[%SP+32], %r4;
-	bra.uni 	$L__BB4_3;
-$L__BB4_3:                              // =>This Loop Header: Depth=1
-                                        //     Child Loop BB4_5 Depth 2
-	ld.u32 	%r5, [%SP+32];
-	setp.gt.s32 	%p3, %r5, 5;
-	@%p3 bra 	$L__BB4_10;
-	bra.uni 	$L__BB4_4;
-$L__BB4_4:                              //   in Loop: Header=BB4_3 Depth=1
-	ld.u64 	%rd6, [%SP];
-	ld.u32 	%r6, [%rd6];
-	ld.u32 	%r7, [%rd6+4];
-	mul.lo.s32 	%r8, %r7, 6;
-	add.s32 	%r9, %r6, %r8;
-	ld.u32 	%r10, [%SP+32];
-	mul.lo.s32 	%r11, %r10, 36;
-	add.s32 	%r12, %r9, %r11;
-	st.u32 	[%SP+36], %r12;
-	ld.u32 	%r13, [%SP+36];
-	ld.u32 	%r14, [%SP+8];
-	mul.lo.s32 	%r15, %r14, 216;
-	add.s32 	%r16, %r13, %r15;
-	st.u32 	[%SP+40], %r16;
-	mov.b32 	%r17, 0;
-	st.u32 	[%SP+44], %r17;
-	bra.uni 	$L__BB4_5;
-$L__BB4_5:                              //   Parent Loop BB4_3 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r18, [%SP+44];
-	setp.gt.s32 	%p4, %r18, 0;
-	@%p4 bra 	$L__BB4_8;
-	bra.uni 	$L__BB4_6;
-$L__BB4_6:                              //   in Loop: Header=BB4_5 Depth=2
-	ld.u64 	%rd7, [%SP+16];
-	ld.u32 	%r21, [%SP+40];
-	ld.u32 	%r22, [%SP+44];
-	mul.lo.s32 	%r23, %r22, 884736;
-	add.s32 	%r24, %r21, %r23;
-	mul.wide.s32 	%rd8, %r24, 8;
-	add.s64 	%rd9, %rd7, %rd8;
-	ld.f64 	%fd1, [%rd9];
-	ld.u64 	%rd10, [%SP+24];
-	ld.u32 	%r25, [%SP+32];
-	mul.lo.s32 	%r26, %r22, 6;
-	add.s32 	%r27, %r25, %r26;
-	mul.wide.s32 	%rd11, %r27, 8;
-	add.s64 	%rd12, %rd10, %rd11;
-	st.f64 	[%rd12], %fd1;
-	bra.uni 	$L__BB4_7;
-$L__BB4_7:                              //   in Loop: Header=BB4_5 Depth=2
-	ld.u32 	%r28, [%SP+44];
-	add.s32 	%r29, %r28, 1;
-	st.u32 	[%SP+44], %r29;
-	bra.uni 	$L__BB4_5;
-$L__BB4_8:                              //   in Loop: Header=BB4_3 Depth=1
-	bra.uni 	$L__BB4_9;
-$L__BB4_9:                              //   in Loop: Header=BB4_3 Depth=1
-	ld.u32 	%r19, [%SP+32];
-	add.s32 	%r20, %r19, 1;
-	st.u32 	[%SP+32], %r20;
-	bra.uni 	$L__BB4_3;
-$L__BB4_10:
-	bra.uni 	$L__BB4_11;
-$L__BB4_11:
-	ret;
-                                        // -- End function
-}
-.func  (.param .b32 func_retval0) _ZL10apply_massPviPKPKdPKPd(
-	.param .b64 _ZL10apply_massPviPKPKdPKPd_param_0,
-	.param .b32 _ZL10apply_massPviPKPKdPKPd_param_1,
-	.param .b64 _ZL10apply_massPviPKPKdPKPd_param_2,
-	.param .b64 _ZL10apply_massPviPKPKdPKPd_param_3
-)                                       // -- Begin function _ZL10apply_massPviPKPKdPKPd
-                                        // @_ZL10apply_massPviPKPKdPKPd
-{
-	.local .align 8 .b8 	__local_depot5[32];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .b32 	%r<5>;
-	.reg .b64 	%rd<8>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot5;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd3, [_ZL10apply_massPviPKPKdPKPd_param_3];
-	ld.param.u64 	%rd2, [_ZL10apply_massPviPKPKdPKPd_param_2];
-	ld.param.u32 	%r1, [_ZL10apply_massPviPKPKdPKPd_param_1];
-	ld.param.u64 	%rd1, [_ZL10apply_massPviPKPKdPKPd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u32 	[%SP+8], %r1;
-	st.u64 	[%SP+16], %rd2;
-	st.u64 	[%SP+24], %rd3;
-	ld.u64 	%rd5, [%SP];
-	ld.u32 	%r2, [%SP+8];
-	ld.u64 	%rd6, [%SP+16];
-	ld.u64 	%rd7, [%SP+24];
-	mov.u64 	%rd4, apply_mass_rs;
-	{ // callseq 10, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd5;
-	.param .b32 param1;
-	st.param.b32 	[param1], %r2;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd6;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd7;
-	.param .b32 retval0;
-	prototype_10 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b32 _, .param .b64 _, .param .b64 _);
-	call (retval0), 
-	%rd4, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	)
-	, prototype_10;
-	ld.param.b32 	%r3, [retval0];
-	} // callseq 10
-	st.param.b32 	[func_retval0], %r3;
-	ret;
-                                        // -- End function
-}
-.func _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(
-	.param .b64 _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)                                       // -- Begin function _Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-                                        // @_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-{
-	.local .align 8 .b8 	__local_depot6[136];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<2>;
-	.reg .b32 	%r<9>;
-	.reg .b64 	%rd<19>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot6;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3];
-	ld.param.u64 	%rd3, [_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2];
-	ld.param.u64 	%rd2, [_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1];
-	ld.param.u64 	%rd1, [_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	st.u64 	[%SP+24], %rd4;
-	mov.b32 	%r1, 0;
-	st.u32 	[%SP+128], %r1;
-	bra.uni 	$L__BB6_1;
-$L__BB6_1:                              // =>This Inner Loop Header: Depth=1
-	ld.u32 	%r2, [%SP+128];
-	setp.gt.s32 	%p1, %r2, 0;
-	@%p1 bra 	$L__BB6_4;
-	bra.uni 	$L__BB6_2;
-$L__BB6_2:                              //   in Loop: Header=BB6_1 Depth=1
-	ld.u64 	%rd5, [%SP];
-	ld.u64 	%rd6, [%SP+8];
-	ld.u32 	%r3, [%SP+128];
-	mul.lo.s32 	%r4, %r3, 6;
-	mul.wide.s32 	%rd7, %r4, 8;
-	add.s64 	%rd8, %rd6, %rd7;
-	ld.u64 	%rd9, [%SP+16];
-	add.u64 	%rd10, %SP, 32;
-	{ // callseq 11, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd5;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd8;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd9;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd10;
-	call.uni 
-	_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 11
-	ld.u64 	%rd11, [%SP];
-	ld.u64 	%rd12, [%SP+16];
-	add.u64 	%rd13, %SP, 80;
-	{ // callseq 12, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd11;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd10;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd12;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd13;
-	call.uni 
-	_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 12
-	ld.u64 	%rd14, [%SP];
-	ld.u64 	%rd15, [%SP+16];
-	ld.u64 	%rd16, [%SP+24];
-	ld.u32 	%r5, [%SP+128];
-	mul.lo.s32 	%r6, %r5, 5;
-	mul.wide.s32 	%rd17, %r6, 8;
-	add.s64 	%rd18, %rd16, %rd17;
-	{ // callseq 13, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd14;
-	.param .b64 param1;
-	st.param.b64 	[param1], %rd13;
-	.param .b64 param2;
-	st.param.b64 	[param2], %rd15;
-	.param .b64 param3;
-	st.param.b64 	[param3], %rd18;
-	call.uni 
-	_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd, 
-	(
-	param0, 
-	param1, 
-	param2, 
-	param3
-	);
-	} // callseq 13
-	bra.uni 	$L__BB6_3;
-$L__BB6_3:                              //   in Loop: Header=BB6_1 Depth=1
-	ld.u32 	%r7, [%SP+128];
-	add.s32 	%r8, %r7, 1;
-	st.u32 	[%SP+128], %r8;
-	bra.uni 	$L__BB6_1;
-$L__BB6_4:
-	ret;
-                                        // -- End function
-}
-.func _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(
-	.param .b64 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_0,
-	.param .b32 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_1,
-	.param .b32 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_2,
-	.param .b64 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_3,
-	.param .b64 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_4,
-	.param .b64 _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_5
-)                                       // -- Begin function _Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd
-                                        // @_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd
-{
-	.local .align 8 .b8 	__local_depot7[56];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<5>;
-	.reg .b32 	%r<32>;
-	.reg .b64 	%rd<17>;
-	.reg .f64 	%fd<4>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot7;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_5];
-	ld.param.u64 	%rd3, [_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_4];
-	ld.param.u64 	%rd2, [_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_3];
-	ld.param.u32 	%r2, [_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_2];
-	ld.param.u32 	%r1, [_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_1];
-	ld.param.u64 	%rd1, [_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u32 	[%SP+8], %r1;
-	st.u32 	[%SP+12], %r2;
-	st.u64 	[%SP+16], %rd2;
-	st.u64 	[%SP+24], %rd3;
-	st.u64 	[%SP+32], %rd4;
-	ld.u64 	%rd5, [%SP];
-	ld.u32 	%r3, [%rd5];
-	setp.gt.s32 	%p1, %r3, 4;
-	@%p1 bra 	$L__BB7_11;
-	bra.uni 	$L__BB7_1;
-$L__BB7_1:
-	ld.u64 	%rd6, [%SP];
-	ld.u32 	%r4, [%rd6+4];
-	setp.gt.s32 	%p2, %r4, 4;
-	@%p2 bra 	$L__BB7_11;
-	bra.uni 	$L__BB7_2;
-$L__BB7_2:
-	mov.b32 	%r5, 0;
-	st.u32 	[%SP+40], %r5;
-	bra.uni 	$L__BB7_3;
-$L__BB7_3:                              // =>This Loop Header: Depth=1
-                                        //     Child Loop BB7_5 Depth 2
-	ld.u32 	%r6, [%SP+40];
-	setp.gt.s32 	%p3, %r6, 4;
-	@%p3 bra 	$L__BB7_10;
-	bra.uni 	$L__BB7_4;
-$L__BB7_4:                              //   in Loop: Header=BB7_3 Depth=1
-	ld.u64 	%rd7, [%SP];
-	ld.u32 	%r7, [%rd7];
-	ld.u32 	%r8, [%rd7+4];
-	mul.lo.s32 	%r9, %r8, 5;
-	add.s32 	%r10, %r7, %r9;
-	ld.u32 	%r11, [%SP+40];
-	mul.lo.s32 	%r12, %r11, 25;
-	add.s32 	%r13, %r10, %r12;
-	st.u32 	[%SP+44], %r13;
-	ld.u64 	%rd8, [%SP+16];
-	ld.u32 	%r14, [%SP+44];
-	ld.u32 	%r15, [%SP+12];
-	mul.lo.s32 	%r16, %r15, 125;
-	add.s32 	%r17, %r14, %r16;
-	mul.wide.s32 	%rd9, %r17, 4;
-	add.s64 	%rd10, %rd8, %rd9;
-	ld.u32 	%r18, [%rd10];
-	st.u32 	[%SP+48], %r18;
-	mov.b32 	%r19, 0;
-	st.u32 	[%SP+52], %r19;
-	bra.uni 	$L__BB7_5;
-$L__BB7_5:                              //   Parent Loop BB7_3 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r20, [%SP+52];
-	setp.gt.s32 	%p4, %r20, 0;
-	@%p4 bra 	$L__BB7_8;
-	bra.uni 	$L__BB7_6;
-$L__BB7_6:                              //   in Loop: Header=BB7_5 Depth=2
-	ld.u64 	%rd11, [%SP+32];
-	ld.u32 	%r23, [%SP+48];
-	ld.u32 	%r24, [%SP+52];
-	mul.lo.s32 	%r25, %r24, 274625;
-	add.s32 	%r26, %r23, %r25;
-	mul.wide.s32 	%rd12, %r26, 8;
-	add.s64 	%rd13, %rd11, %rd12;
-	ld.u64 	%rd14, [%SP+24];
-	ld.u32 	%r27, [%SP+40];
-	mul.lo.s32 	%r28, %r24, 5;
-	add.s32 	%r29, %r27, %r28;
-	mul.wide.s32 	%rd15, %r29, 8;
-	add.s64 	%rd16, %rd14, %rd15;
-	ld.f64 	%fd1, [%rd16];
-	{ // callseq 14, 0
-	.param .b64 param0;
-	st.param.b64 	[param0], %rd13;
-	.param .b64 param1;
-	st.param.f64 	[param1], %fd1;
-	.param .b64 retval0;
-	call.uni (retval0), 
-	_ZL9atomicAddPdd, 
-	(
-	param0, 
-	param1
-	);
-	ld.param.f64 	%fd2, [retval0];
-	} // callseq 14
-	bra.uni 	$L__BB7_7;
-$L__BB7_7:                              //   in Loop: Header=BB7_5 Depth=2
-	ld.u32 	%r30, [%SP+52];
-	add.s32 	%r31, %r30, 1;
-	st.u32 	[%SP+52], %r31;
-	bra.uni 	$L__BB7_5;
-$L__BB7_8:                              //   in Loop: Header=BB7_3 Depth=1
-	bra.uni 	$L__BB7_9;
-$L__BB7_9:                              //   in Loop: Header=BB7_3 Depth=1
-	ld.u32 	%r21, [%SP+40];
-	add.s32 	%r22, %r21, 1;
-	st.u32 	[%SP+40], %r22;
-	bra.uni 	$L__BB7_3;
-$L__BB7_10:
-	bra.uni 	$L__BB7_11;
-$L__BB7_11:
-	ret;
-                                        // -- End function
-}
-.func  (.param .b64 func_retval0) _ZL9atomicAddPdd(
-	.param .b64 _ZL9atomicAddPdd_param_0,
-	.param .b64 _ZL9atomicAddPdd_param_1
-)                                       // -- Begin function _ZL9atomicAddPdd
-                                        // @_ZL9atomicAddPdd
-{
-	.local .align 8 .b8 	__local_depot8[32];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .b64 	%rd<4>;
-	.reg .f64 	%fd<5>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot8;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.f64 	%fd1, [_ZL9atomicAddPdd_param_1];
-	ld.param.u64 	%rd1, [_ZL9atomicAddPdd_param_0];
-	st.u64 	[%SP+16], %rd1;
-	st.f64 	[%SP+24], %fd1;
-	ld.u64 	%rd2, [%SP+16];
-	ld.f64 	%fd2, [%SP+24];
-	st.u64 	[%SP], %rd2;
-	st.f64 	[%SP+8], %fd2;
-	ld.u64 	%rd3, [%SP];
-	ld.f64 	%fd3, [%SP+8];
-	atom.add.f64 	%fd4, [%rd3], %fd3;
-	st.param.f64 	[func_retval0], %fd4;
-	ret;
-                                        // -- End function
-}
-.func _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(
-	.param .b64 _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)                                       // -- Begin function _Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-                                        // @_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-{
-	.local .align 8 .b8 	__local_depot9[40];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<5>;
-	.reg .b32 	%r<15>;
-	.reg .b64 	%rd<21>;
-	.reg .f64 	%fd<5>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot9;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3];
-	ld.param.u64 	%rd3, [_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2];
-	ld.param.u64 	%rd2, [_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1];
-	ld.param.u64 	%rd1, [_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	st.u64 	[%SP+24], %rd4;
-	mov.b32 	%r1, 0;
-	st.u32 	[%SP+32], %r1;
-	bra.uni 	$L__BB9_1;
-$L__BB9_1:                              // =>This Loop Header: Depth=1
-                                        //     Child Loop BB9_5 Depth 2
-	ld.u32 	%r2, [%SP+32];
-	setp.gt.s32 	%p1, %r2, 4;
-	@%p1 bra 	$L__BB9_11;
-	bra.uni 	$L__BB9_2;
-$L__BB9_2:                              //   in Loop: Header=BB9_1 Depth=1
-	ld.u64 	%rd5, [%SP+24];
-	ld.s32 	%rd6, [%SP+32];
-	shl.b64 	%rd7, %rd6, 3;
-	add.s64 	%rd8, %rd5, %rd7;
-	mov.b64 	%rd9, 0;
-	st.u64 	[%rd8], %rd9;
-	ld.u64 	%rd10, [%SP];
-	ld.u32 	%r3, [%rd10];
-	setp.gt.s32 	%p2, %r3, 5;
-	@%p2 bra 	$L__BB9_9;
-	bra.uni 	$L__BB9_3;
-$L__BB9_3:                              //   in Loop: Header=BB9_1 Depth=1
-	ld.u64 	%rd11, [%SP];
-	ld.u32 	%r4, [%rd11+4];
-	setp.gt.s32 	%p3, %r4, 5;
-	@%p3 bra 	$L__BB9_9;
-	bra.uni 	$L__BB9_4;
-$L__BB9_4:                              //   in Loop: Header=BB9_1 Depth=1
-	mov.b32 	%r5, 0;
-	st.u32 	[%SP+36], %r5;
-	bra.uni 	$L__BB9_5;
-$L__BB9_5:                              //   Parent Loop BB9_1 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r6, [%SP+36];
-	setp.gt.s32 	%p4, %r6, 5;
-	@%p4 bra 	$L__BB9_8;
-	bra.uni 	$L__BB9_6;
-$L__BB9_6:                              //   in Loop: Header=BB9_5 Depth=2
-	ld.u64 	%rd12, [%SP+16];
-	ld.u32 	%r9, [%SP+32];
-	ld.u32 	%r10, [%SP+36];
-	mul.lo.s32 	%r11, %r10, 5;
-	add.s32 	%r12, %r9, %r11;
-	mul.wide.s32 	%rd13, %r12, 8;
-	add.s64 	%rd14, %rd12, %rd13;
-	ld.f64 	%fd1, [%rd14];
-	ld.u64 	%rd15, [%SP+8];
-	mul.wide.s32 	%rd16, %r10, 8;
-	add.s64 	%rd17, %rd15, %rd16;
-	ld.f64 	%fd2, [%rd17];
-	ld.u64 	%rd18, [%SP+24];
-	mul.wide.s32 	%rd19, %r9, 8;
-	add.s64 	%rd20, %rd18, %rd19;
-	ld.f64 	%fd3, [%rd20];
-	fma.rn.f64 	%fd4, %fd1, %fd2, %fd3;
-	st.f64 	[%rd20], %fd4;
-	bra.uni 	$L__BB9_7;
-$L__BB9_7:                              //   in Loop: Header=BB9_5 Depth=2
-	ld.u32 	%r13, [%SP+36];
-	add.s32 	%r14, %r13, 1;
-	st.u32 	[%SP+36], %r14;
-	bra.uni 	$L__BB9_5;
-$L__BB9_8:                              //   in Loop: Header=BB9_1 Depth=1
-	bra.uni 	$L__BB9_9;
-$L__BB9_9:                              //   in Loop: Header=BB9_1 Depth=1
-	bra.uni 	$L__BB9_10;
-$L__BB9_10:                             //   in Loop: Header=BB9_1 Depth=1
-	ld.u32 	%r7, [%SP+32];
-	add.s32 	%r8, %r7, 1;
-	st.u32 	[%SP+32], %r8;
-	bra.uni 	$L__BB9_1;
-$L__BB9_11:
-	ret;
-                                        // -- End function
-}
-.func _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(
-	.param .b64 _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)                                       // -- Begin function _Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-                                        // @_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-{
-	.local .align 8 .b8 	__local_depot10[96];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<6>;
-	.reg .b32 	%r<27>;
-	.reg .b64 	%rd<38>;
-	.reg .f64 	%fd<7>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot10;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3];
-	ld.param.u64 	%rd3, [_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2];
-	ld.param.u64 	%rd2, [_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1];
-	ld.param.u64 	%rd1, [_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	st.u64 	[%SP+24], %rd4;
-	mov.b32 	%r1, 0;
-	st.u32 	[%SP+80], %r1;
-	bra.uni 	$L__BB10_1;
-$L__BB10_1:                             // =>This Inner Loop Header: Depth=1
-	ld.u32 	%r2, [%SP+80];
-	setp.gt.s32 	%p1, %r2, 5;
-	@%p1 bra 	$L__BB10_4;
-	bra.uni 	$L__BB10_2;
-$L__BB10_2:                             //   in Loop: Header=BB10_1 Depth=1
-	ld.u64 	%rd31, [%SP+16];
-	ld.u64 	%rd32, [%SP];
-	ld.u32 	%r21, [%rd32+4];
-	ld.u32 	%r22, [%SP+80];
-	mul.lo.s32 	%r23, %r22, 5;
-	add.s32 	%r24, %r21, %r23;
-	mul.wide.s32 	%rd33, %r24, 8;
-	add.s64 	%rd34, %rd31, %rd33;
-	ld.f64 	%fd6, [%rd34];
-	mul.wide.s32 	%rd35, %r22, 8;
-	add.u64 	%rd36, %SP, 32;
-	add.s64 	%rd37, %rd36, %rd35;
-	st.f64 	[%rd37], %fd6;
-	bra.uni 	$L__BB10_3;
-$L__BB10_3:                             //   in Loop: Header=BB10_1 Depth=1
-	ld.u32 	%r25, [%SP+80];
-	add.s32 	%r26, %r25, 1;
-	st.u32 	[%SP+80], %r26;
-	bra.uni 	$L__BB10_1;
-$L__BB10_4:
-	mov.b32 	%r3, 0;
-	st.u32 	[%SP+84], %r3;
-	bra.uni 	$L__BB10_5;
-$L__BB10_5:                             // =>This Loop Header: Depth=1
-                                        //     Child Loop BB10_9 Depth 2
-	ld.u32 	%r4, [%SP+84];
-	setp.gt.s32 	%p2, %r4, 4;
-	@%p2 bra 	$L__BB10_15;
-	bra.uni 	$L__BB10_6;
-$L__BB10_6:                             //   in Loop: Header=BB10_5 Depth=1
-	bar.sync 	0;
-	ld.u64 	%rd5, [%SP+8];
-	ld.s32 	%rd6, [%SP+84];
-	shl.b64 	%rd7, %rd6, 3;
-	add.s64 	%rd8, %rd5, %rd7;
-	ld.f64 	%fd1, [%rd8];
-	ld.u64 	%rd9, [%SP];
-	ld.u64 	%rd10, [%rd9+16];
-	ld.u32 	%r5, [%rd9];
-	ld.u32 	%r6, [%rd9+4];
-	mul.lo.s32 	%r7, %r6, 6;
-	add.s32 	%r8, %r5, %r7;
-	mul.wide.s32 	%rd11, %r8, 8;
-	add.s64 	%rd12, %rd10, %rd11;
-	st.f64 	[%rd12], %fd1;
-	bar.sync 	0;
-	ld.u64 	%rd13, [%SP+24];
-	ld.s32 	%rd14, [%SP+84];
-	shl.b64 	%rd15, %rd14, 3;
-	add.s64 	%rd16, %rd13, %rd15;
-	mov.b64 	%rd17, 0;
-	st.u64 	[%rd16], %rd17;
-	ld.u64 	%rd18, [%SP];
-	ld.u32 	%r9, [%rd18];
-	setp.gt.s32 	%p3, %r9, 5;
-	@%p3 bra 	$L__BB10_13;
-	bra.uni 	$L__BB10_7;
-$L__BB10_7:                             //   in Loop: Header=BB10_5 Depth=1
-	ld.u64 	%rd19, [%SP];
-	ld.u32 	%r10, [%rd19+4];
-	setp.gt.s32 	%p4, %r10, 4;
-	@%p4 bra 	$L__BB10_13;
-	bra.uni 	$L__BB10_8;
-$L__BB10_8:                             //   in Loop: Header=BB10_5 Depth=1
-	mov.b32 	%r11, 0;
-	st.u32 	[%SP+88], %r11;
-	bra.uni 	$L__BB10_9;
-$L__BB10_9:                             //   Parent Loop BB10_5 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r12, [%SP+88];
-	setp.gt.s32 	%p5, %r12, 5;
-	@%p5 bra 	$L__BB10_12;
-	bra.uni 	$L__BB10_10;
-$L__BB10_10:                            //   in Loop: Header=BB10_9 Depth=2
-	ld.u32 	%r15, [%SP+88];
-	mul.wide.s32 	%rd20, %r15, 8;
-	add.u64 	%rd21, %SP, 32;
-	add.s64 	%rd22, %rd21, %rd20;
-	ld.f64 	%fd2, [%rd22];
-	ld.u64 	%rd23, [%SP];
-	ld.u64 	%rd24, [%rd23+16];
-	ld.u32 	%r16, [%rd23];
-	mul.lo.s32 	%r17, %r15, 6;
-	add.s32 	%r18, %r16, %r17;
-	mul.wide.s32 	%rd25, %r18, 8;
-	add.s64 	%rd26, %rd24, %rd25;
-	ld.f64 	%fd3, [%rd26];
-	ld.u64 	%rd27, [%SP+24];
-	ld.s32 	%rd28, [%SP+84];
-	shl.b64 	%rd29, %rd28, 3;
-	add.s64 	%rd30, %rd27, %rd29;
-	ld.f64 	%fd4, [%rd30];
-	fma.rn.f64 	%fd5, %fd2, %fd3, %fd4;
-	st.f64 	[%rd30], %fd5;
-	bra.uni 	$L__BB10_11;
-$L__BB10_11:                            //   in Loop: Header=BB10_9 Depth=2
-	ld.u32 	%r19, [%SP+88];
-	add.s32 	%r20, %r19, 1;
-	st.u32 	[%SP+88], %r20;
-	bra.uni 	$L__BB10_9;
-$L__BB10_12:                            //   in Loop: Header=BB10_5 Depth=1
-	bra.uni 	$L__BB10_13;
-$L__BB10_13:                            //   in Loop: Header=BB10_5 Depth=1
-	bra.uni 	$L__BB10_14;
-$L__BB10_14:                            //   in Loop: Header=BB10_5 Depth=1
-	ld.u32 	%r13, [%SP+84];
-	add.s32 	%r14, %r13, 1;
-	st.u32 	[%SP+84], %r14;
-	bra.uni 	$L__BB10_5;
-$L__BB10_15:
-	ret;
-                                        // -- End function
-}
-.func _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(
-	.param .b64 _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)                                       // -- Begin function _Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-                                        // @_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-{
-	.local .align 8 .b8 	__local_depot11[96];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<6>;
-	.reg .b32 	%r<27>;
-	.reg .b64 	%rd<38>;
-	.reg .f64 	%fd<7>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot11;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3];
-	ld.param.u64 	%rd3, [_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2];
-	ld.param.u64 	%rd2, [_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1];
-	ld.param.u64 	%rd1, [_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	st.u64 	[%SP+24], %rd4;
-	mov.b32 	%r1, 0;
-	st.u32 	[%SP+80], %r1;
-	bra.uni 	$L__BB11_1;
-$L__BB11_1:                             // =>This Inner Loop Header: Depth=1
-	ld.u32 	%r2, [%SP+80];
-	setp.gt.s32 	%p1, %r2, 5;
-	@%p1 bra 	$L__BB11_4;
-	bra.uni 	$L__BB11_2;
-$L__BB11_2:                             //   in Loop: Header=BB11_1 Depth=1
-	ld.u64 	%rd31, [%SP+16];
-	ld.u64 	%rd32, [%SP];
-	ld.u32 	%r21, [%rd32];
-	ld.u32 	%r22, [%SP+80];
-	mul.lo.s32 	%r23, %r22, 5;
-	add.s32 	%r24, %r21, %r23;
-	mul.wide.s32 	%rd33, %r24, 8;
-	add.s64 	%rd34, %rd31, %rd33;
-	ld.f64 	%fd6, [%rd34];
-	mul.wide.s32 	%rd35, %r22, 8;
-	add.u64 	%rd36, %SP, 32;
-	add.s64 	%rd37, %rd36, %rd35;
-	st.f64 	[%rd37], %fd6;
-	bra.uni 	$L__BB11_3;
-$L__BB11_3:                             //   in Loop: Header=BB11_1 Depth=1
-	ld.u32 	%r25, [%SP+80];
-	add.s32 	%r26, %r25, 1;
-	st.u32 	[%SP+80], %r26;
-	bra.uni 	$L__BB11_1;
-$L__BB11_4:
-	mov.b32 	%r3, 0;
-	st.u32 	[%SP+84], %r3;
-	bra.uni 	$L__BB11_5;
-$L__BB11_5:                             // =>This Loop Header: Depth=1
-                                        //     Child Loop BB11_9 Depth 2
-	ld.u32 	%r4, [%SP+84];
-	setp.gt.s32 	%p2, %r4, 4;
-	@%p2 bra 	$L__BB11_15;
-	bra.uni 	$L__BB11_6;
-$L__BB11_6:                             //   in Loop: Header=BB11_5 Depth=1
-	bar.sync 	0;
-	ld.u64 	%rd5, [%SP+8];
-	ld.s32 	%rd6, [%SP+84];
-	shl.b64 	%rd7, %rd6, 3;
-	add.s64 	%rd8, %rd5, %rd7;
-	ld.f64 	%fd1, [%rd8];
-	ld.u64 	%rd9, [%SP];
-	ld.u64 	%rd10, [%rd9+16];
-	ld.u32 	%r5, [%rd9];
-	ld.u32 	%r6, [%rd9+4];
-	mul.lo.s32 	%r7, %r6, 6;
-	add.s32 	%r8, %r5, %r7;
-	mul.wide.s32 	%rd11, %r8, 8;
-	add.s64 	%rd12, %rd10, %rd11;
-	st.f64 	[%rd12], %fd1;
-	bar.sync 	0;
-	ld.u64 	%rd13, [%SP+24];
-	ld.s32 	%rd14, [%SP+84];
-	shl.b64 	%rd15, %rd14, 3;
-	add.s64 	%rd16, %rd13, %rd15;
-	mov.b64 	%rd17, 0;
-	st.u64 	[%rd16], %rd17;
-	ld.u64 	%rd18, [%SP];
-	ld.u32 	%r9, [%rd18];
-	setp.gt.s32 	%p3, %r9, 4;
-	@%p3 bra 	$L__BB11_13;
-	bra.uni 	$L__BB11_7;
-$L__BB11_7:                             //   in Loop: Header=BB11_5 Depth=1
-	ld.u64 	%rd19, [%SP];
-	ld.u32 	%r10, [%rd19+4];
-	setp.gt.s32 	%p4, %r10, 4;
-	@%p4 bra 	$L__BB11_13;
-	bra.uni 	$L__BB11_8;
-$L__BB11_8:                             //   in Loop: Header=BB11_5 Depth=1
-	mov.b32 	%r11, 0;
-	st.u32 	[%SP+88], %r11;
-	bra.uni 	$L__BB11_9;
-$L__BB11_9:                             //   Parent Loop BB11_5 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r12, [%SP+88];
-	setp.gt.s32 	%p5, %r12, 5;
-	@%p5 bra 	$L__BB11_12;
-	bra.uni 	$L__BB11_10;
-$L__BB11_10:                            //   in Loop: Header=BB11_9 Depth=2
-	ld.u32 	%r15, [%SP+88];
-	mul.wide.s32 	%rd20, %r15, 8;
-	add.u64 	%rd21, %SP, 32;
-	add.s64 	%rd22, %rd21, %rd20;
-	ld.f64 	%fd2, [%rd22];
-	ld.u64 	%rd23, [%SP];
-	ld.u64 	%rd24, [%rd23+16];
-	ld.u32 	%r16, [%rd23+4];
-	mul.lo.s32 	%r17, %r16, 6;
-	add.s32 	%r18, %r15, %r17;
-	mul.wide.s32 	%rd25, %r18, 8;
-	add.s64 	%rd26, %rd24, %rd25;
-	ld.f64 	%fd3, [%rd26];
-	ld.u64 	%rd27, [%SP+24];
-	ld.s32 	%rd28, [%SP+84];
-	shl.b64 	%rd29, %rd28, 3;
-	add.s64 	%rd30, %rd27, %rd29;
-	ld.f64 	%fd4, [%rd30];
-	fma.rn.f64 	%fd5, %fd2, %fd3, %fd4;
-	st.f64 	[%rd30], %fd5;
-	bra.uni 	$L__BB11_11;
-$L__BB11_11:                            //   in Loop: Header=BB11_9 Depth=2
-	ld.u32 	%r19, [%SP+88];
-	add.s32 	%r20, %r19, 1;
-	st.u32 	[%SP+88], %r20;
-	bra.uni 	$L__BB11_9;
-$L__BB11_12:                            //   in Loop: Header=BB11_5 Depth=1
-	bra.uni 	$L__BB11_13;
-$L__BB11_13:                            //   in Loop: Header=BB11_5 Depth=1
-	bra.uni 	$L__BB11_14;
-$L__BB11_14:                            //   in Loop: Header=BB11_5 Depth=1
-	ld.u32 	%r13, [%SP+84];
-	add.s32 	%r14, %r13, 1;
-	st.u32 	[%SP+84], %r14;
-	bra.uni 	$L__BB11_5;
-$L__BB11_15:
-	ret;
-                                        // -- End function
-}
-.func _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(
-	.param .b64 _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)                                       // -- Begin function _Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-                                        // @_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-{
-	.local .align 8 .b8 	__local_depot12[88];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<6>;
-	.reg .b32 	%r<27>;
-	.reg .b64 	%rd<38>;
-	.reg .f64 	%fd<7>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot12;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3];
-	ld.param.u64 	%rd3, [_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2];
-	ld.param.u64 	%rd2, [_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1];
-	ld.param.u64 	%rd1, [_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	st.u64 	[%SP+24], %rd4;
-	mov.b32 	%r1, 0;
-	st.u32 	[%SP+72], %r1;
-	bra.uni 	$L__BB12_1;
-$L__BB12_1:                             // =>This Inner Loop Header: Depth=1
-	ld.u32 	%r2, [%SP+72];
-	setp.gt.s32 	%p1, %r2, 4;
-	@%p1 bra 	$L__BB12_4;
-	bra.uni 	$L__BB12_2;
-$L__BB12_2:                             //   in Loop: Header=BB12_1 Depth=1
-	ld.u64 	%rd31, [%SP+16];
-	ld.u32 	%r21, [%SP+72];
-	ld.u64 	%rd32, [%SP];
-	ld.u32 	%r22, [%rd32];
-	mul.lo.s32 	%r23, %r22, 5;
-	add.s32 	%r24, %r21, %r23;
-	mul.wide.s32 	%rd33, %r24, 8;
-	add.s64 	%rd34, %rd31, %rd33;
-	ld.f64 	%fd6, [%rd34];
-	mul.wide.s32 	%rd35, %r21, 8;
-	add.u64 	%rd36, %SP, 32;
-	add.s64 	%rd37, %rd36, %rd35;
-	st.f64 	[%rd37], %fd6;
-	bra.uni 	$L__BB12_3;
-$L__BB12_3:                             //   in Loop: Header=BB12_1 Depth=1
-	ld.u32 	%r25, [%SP+72];
-	add.s32 	%r26, %r25, 1;
-	st.u32 	[%SP+72], %r26;
-	bra.uni 	$L__BB12_1;
-$L__BB12_4:
-	mov.b32 	%r3, 0;
-	st.u32 	[%SP+76], %r3;
-	bra.uni 	$L__BB12_5;
-$L__BB12_5:                             // =>This Loop Header: Depth=1
-                                        //     Child Loop BB12_9 Depth 2
-	ld.u32 	%r4, [%SP+76];
-	setp.gt.s32 	%p2, %r4, 4;
-	@%p2 bra 	$L__BB12_15;
-	bra.uni 	$L__BB12_6;
-$L__BB12_6:                             //   in Loop: Header=BB12_5 Depth=1
-	bar.sync 	0;
-	ld.u64 	%rd5, [%SP+8];
-	ld.s32 	%rd6, [%SP+76];
-	shl.b64 	%rd7, %rd6, 3;
-	add.s64 	%rd8, %rd5, %rd7;
-	ld.f64 	%fd1, [%rd8];
-	ld.u64 	%rd9, [%SP];
-	ld.u64 	%rd10, [%rd9+16];
-	ld.u32 	%r5, [%rd9];
-	ld.u32 	%r6, [%rd9+4];
-	mul.lo.s32 	%r7, %r6, 6;
-	add.s32 	%r8, %r5, %r7;
-	mul.wide.s32 	%rd11, %r8, 8;
-	add.s64 	%rd12, %rd10, %rd11;
-	st.f64 	[%rd12], %fd1;
-	bar.sync 	0;
-	ld.u64 	%rd13, [%SP+24];
-	ld.s32 	%rd14, [%SP+76];
-	shl.b64 	%rd15, %rd14, 3;
-	add.s64 	%rd16, %rd13, %rd15;
-	mov.b64 	%rd17, 0;
-	st.u64 	[%rd16], %rd17;
-	ld.u64 	%rd18, [%SP];
-	ld.u32 	%r9, [%rd18];
-	setp.gt.s32 	%p3, %r9, 5;
-	@%p3 bra 	$L__BB12_13;
-	bra.uni 	$L__BB12_7;
-$L__BB12_7:                             //   in Loop: Header=BB12_5 Depth=1
-	ld.u64 	%rd19, [%SP];
-	ld.u32 	%r10, [%rd19+4];
-	setp.gt.s32 	%p4, %r10, 4;
-	@%p4 bra 	$L__BB12_13;
-	bra.uni 	$L__BB12_8;
-$L__BB12_8:                             //   in Loop: Header=BB12_5 Depth=1
-	mov.b32 	%r11, 0;
-	st.u32 	[%SP+80], %r11;
-	bra.uni 	$L__BB12_9;
-$L__BB12_9:                             //   Parent Loop BB12_5 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r12, [%SP+80];
-	setp.gt.s32 	%p5, %r12, 4;
-	@%p5 bra 	$L__BB12_12;
-	bra.uni 	$L__BB12_10;
-$L__BB12_10:                            //   in Loop: Header=BB12_9 Depth=2
-	ld.u32 	%r15, [%SP+80];
-	mul.wide.s32 	%rd20, %r15, 8;
-	add.u64 	%rd21, %SP, 32;
-	add.s64 	%rd22, %rd21, %rd20;
-	ld.f64 	%fd2, [%rd22];
-	ld.u64 	%rd23, [%SP];
-	ld.u64 	%rd24, [%rd23+16];
-	ld.u32 	%r16, [%rd23+4];
-	mul.lo.s32 	%r17, %r16, 6;
-	add.s32 	%r18, %r15, %r17;
-	mul.wide.s32 	%rd25, %r18, 8;
-	add.s64 	%rd26, %rd24, %rd25;
-	ld.f64 	%fd3, [%rd26];
-	ld.u64 	%rd27, [%SP+24];
-	ld.s32 	%rd28, [%SP+76];
-	shl.b64 	%rd29, %rd28, 3;
-	add.s64 	%rd30, %rd27, %rd29;
-	ld.f64 	%fd4, [%rd30];
-	fma.rn.f64 	%fd5, %fd2, %fd3, %fd4;
-	st.f64 	[%rd30], %fd5;
-	bra.uni 	$L__BB12_11;
-$L__BB12_11:                            //   in Loop: Header=BB12_9 Depth=2
-	ld.u32 	%r19, [%SP+80];
-	add.s32 	%r20, %r19, 1;
-	st.u32 	[%SP+80], %r20;
-	bra.uni 	$L__BB12_9;
-$L__BB12_12:                            //   in Loop: Header=BB12_5 Depth=1
-	bra.uni 	$L__BB12_13;
-$L__BB12_13:                            //   in Loop: Header=BB12_5 Depth=1
-	bra.uni 	$L__BB12_14;
-$L__BB12_14:                            //   in Loop: Header=BB12_5 Depth=1
-	ld.u32 	%r13, [%SP+76];
-	add.s32 	%r14, %r13, 1;
-	st.u32 	[%SP+76], %r14;
-	bra.uni 	$L__BB12_5;
-$L__BB12_15:
-	ret;
-                                        // -- End function
-}
-.func _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(
-	.param .b64 _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)                                       // -- Begin function _Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-                                        // @_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-{
-	.local .align 8 .b8 	__local_depot13[88];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<6>;
-	.reg .b32 	%r<27>;
-	.reg .b64 	%rd<38>;
-	.reg .f64 	%fd<7>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot13;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3];
-	ld.param.u64 	%rd3, [_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2];
-	ld.param.u64 	%rd2, [_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1];
-	ld.param.u64 	%rd1, [_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	st.u64 	[%SP+24], %rd4;
-	mov.b32 	%r1, 0;
-	st.u32 	[%SP+72], %r1;
-	bra.uni 	$L__BB13_1;
-$L__BB13_1:                             // =>This Inner Loop Header: Depth=1
-	ld.u32 	%r2, [%SP+72];
-	setp.gt.s32 	%p1, %r2, 4;
-	@%p1 bra 	$L__BB13_4;
-	bra.uni 	$L__BB13_2;
-$L__BB13_2:                             //   in Loop: Header=BB13_1 Depth=1
-	ld.u64 	%rd31, [%SP+16];
-	ld.u32 	%r21, [%SP+72];
-	ld.u64 	%rd32, [%SP];
-	ld.u32 	%r22, [%rd32+4];
-	mul.lo.s32 	%r23, %r22, 5;
-	add.s32 	%r24, %r21, %r23;
-	mul.wide.s32 	%rd33, %r24, 8;
-	add.s64 	%rd34, %rd31, %rd33;
-	ld.f64 	%fd6, [%rd34];
-	mul.wide.s32 	%rd35, %r21, 8;
-	add.u64 	%rd36, %SP, 32;
-	add.s64 	%rd37, %rd36, %rd35;
-	st.f64 	[%rd37], %fd6;
-	bra.uni 	$L__BB13_3;
-$L__BB13_3:                             //   in Loop: Header=BB13_1 Depth=1
-	ld.u32 	%r25, [%SP+72];
-	add.s32 	%r26, %r25, 1;
-	st.u32 	[%SP+72], %r26;
-	bra.uni 	$L__BB13_1;
-$L__BB13_4:
-	mov.b32 	%r3, 0;
-	st.u32 	[%SP+76], %r3;
-	bra.uni 	$L__BB13_5;
-$L__BB13_5:                             // =>This Loop Header: Depth=1
-                                        //     Child Loop BB13_9 Depth 2
-	ld.u32 	%r4, [%SP+76];
-	setp.gt.s32 	%p2, %r4, 4;
-	@%p2 bra 	$L__BB13_15;
-	bra.uni 	$L__BB13_6;
-$L__BB13_6:                             //   in Loop: Header=BB13_5 Depth=1
-	bar.sync 	0;
-	ld.u64 	%rd5, [%SP+8];
-	ld.s32 	%rd6, [%SP+76];
-	shl.b64 	%rd7, %rd6, 3;
-	add.s64 	%rd8, %rd5, %rd7;
-	ld.f64 	%fd1, [%rd8];
-	ld.u64 	%rd9, [%SP];
-	ld.u64 	%rd10, [%rd9+16];
-	ld.u32 	%r5, [%rd9];
-	ld.u32 	%r6, [%rd9+4];
-	mul.lo.s32 	%r7, %r6, 6;
-	add.s32 	%r8, %r5, %r7;
-	mul.wide.s32 	%rd11, %r8, 8;
-	add.s64 	%rd12, %rd10, %rd11;
-	st.f64 	[%rd12], %fd1;
-	bar.sync 	0;
-	ld.u64 	%rd13, [%SP+24];
-	ld.s32 	%rd14, [%SP+76];
-	shl.b64 	%rd15, %rd14, 3;
-	add.s64 	%rd16, %rd13, %rd15;
-	mov.b64 	%rd17, 0;
-	st.u64 	[%rd16], %rd17;
-	ld.u64 	%rd18, [%SP];
-	ld.u32 	%r9, [%rd18];
-	setp.gt.s32 	%p3, %r9, 5;
-	@%p3 bra 	$L__BB13_13;
-	bra.uni 	$L__BB13_7;
-$L__BB13_7:                             //   in Loop: Header=BB13_5 Depth=1
-	ld.u64 	%rd19, [%SP];
-	ld.u32 	%r10, [%rd19+4];
-	setp.gt.s32 	%p4, %r10, 5;
-	@%p4 bra 	$L__BB13_13;
-	bra.uni 	$L__BB13_8;
-$L__BB13_8:                             //   in Loop: Header=BB13_5 Depth=1
-	mov.b32 	%r11, 0;
-	st.u32 	[%SP+80], %r11;
-	bra.uni 	$L__BB13_9;
-$L__BB13_9:                             //   Parent Loop BB13_5 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r12, [%SP+80];
-	setp.gt.s32 	%p5, %r12, 4;
-	@%p5 bra 	$L__BB13_12;
-	bra.uni 	$L__BB13_10;
-$L__BB13_10:                            //   in Loop: Header=BB13_9 Depth=2
-	ld.u32 	%r15, [%SP+80];
-	mul.wide.s32 	%rd20, %r15, 8;
-	add.u64 	%rd21, %SP, 32;
-	add.s64 	%rd22, %rd21, %rd20;
-	ld.f64 	%fd2, [%rd22];
-	ld.u64 	%rd23, [%SP];
-	ld.u64 	%rd24, [%rd23+16];
-	ld.u32 	%r16, [%rd23];
-	mul.lo.s32 	%r17, %r15, 6;
-	add.s32 	%r18, %r16, %r17;
-	mul.wide.s32 	%rd25, %r18, 8;
-	add.s64 	%rd26, %rd24, %rd25;
-	ld.f64 	%fd3, [%rd26];
-	ld.u64 	%rd27, [%SP+24];
-	ld.s32 	%rd28, [%SP+76];
-	shl.b64 	%rd29, %rd28, 3;
-	add.s64 	%rd30, %rd27, %rd29;
-	ld.f64 	%fd4, [%rd30];
-	fma.rn.f64 	%fd5, %fd2, %fd3, %fd4;
-	st.f64 	[%rd30], %fd5;
-	bra.uni 	$L__BB13_11;
-$L__BB13_11:                            //   in Loop: Header=BB13_9 Depth=2
-	ld.u32 	%r19, [%SP+80];
-	add.s32 	%r20, %r19, 1;
-	st.u32 	[%SP+80], %r20;
-	bra.uni 	$L__BB13_9;
-$L__BB13_12:                            //   in Loop: Header=BB13_5 Depth=1
-	bra.uni 	$L__BB13_13;
-$L__BB13_13:                            //   in Loop: Header=BB13_5 Depth=1
-	bra.uni 	$L__BB13_14;
-$L__BB13_14:                            //   in Loop: Header=BB13_5 Depth=1
-	ld.u32 	%r13, [%SP+76];
-	add.s32 	%r14, %r13, 1;
-	st.u32 	[%SP+76], %r14;
-	bra.uni 	$L__BB13_5;
-$L__BB13_15:
-	ret;
-                                        // -- End function
-}
-.func _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(
-	.param .b64 _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0,
-	.param .b64 _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1,
-	.param .b64 _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2,
-	.param .b64 _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3
-)                                       // -- Begin function _Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-                                        // @_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd
-{
-	.local .align 8 .b8 	__local_depot14[40];
-	.reg .b64 	%SP;
-	.reg .b64 	%SPL;
-	.reg .pred 	%p<5>;
-	.reg .b32 	%r<15>;
-	.reg .b64 	%rd<21>;
-	.reg .f64 	%fd<5>;
-
-// %bb.0:
-	mov.u64 	%SPL, __local_depot14;
-	cvta.local.u64 	%SP, %SPL;
-	ld.param.u64 	%rd4, [_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_3];
-	ld.param.u64 	%rd3, [_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_2];
-	ld.param.u64 	%rd2, [_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_1];
-	ld.param.u64 	%rd1, [_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd_param_0];
-	st.u64 	[%SP], %rd1;
-	st.u64 	[%SP+8], %rd2;
-	st.u64 	[%SP+16], %rd3;
-	st.u64 	[%SP+24], %rd4;
-	mov.b32 	%r1, 0;
-	st.u32 	[%SP+32], %r1;
-	bra.uni 	$L__BB14_1;
-$L__BB14_1:                             // =>This Loop Header: Depth=1
-                                        //     Child Loop BB14_5 Depth 2
-	ld.u32 	%r2, [%SP+32];
-	setp.gt.s32 	%p1, %r2, 5;
-	@%p1 bra 	$L__BB14_11;
-	bra.uni 	$L__BB14_2;
-$L__BB14_2:                             //   in Loop: Header=BB14_1 Depth=1
-	ld.u64 	%rd5, [%SP+24];
-	ld.s32 	%rd6, [%SP+32];
-	shl.b64 	%rd7, %rd6, 3;
-	add.s64 	%rd8, %rd5, %rd7;
-	mov.b64 	%rd9, 0;
-	st.u64 	[%rd8], %rd9;
-	ld.u64 	%rd10, [%SP];
-	ld.u32 	%r3, [%rd10];
-	setp.gt.s32 	%p2, %r3, 5;
-	@%p2 bra 	$L__BB14_9;
-	bra.uni 	$L__BB14_3;
-$L__BB14_3:                             //   in Loop: Header=BB14_1 Depth=1
-	ld.u64 	%rd11, [%SP];
-	ld.u32 	%r4, [%rd11+4];
-	setp.gt.s32 	%p3, %r4, 5;
-	@%p3 bra 	$L__BB14_9;
-	bra.uni 	$L__BB14_4;
-$L__BB14_4:                             //   in Loop: Header=BB14_1 Depth=1
-	mov.b32 	%r5, 0;
-	st.u32 	[%SP+36], %r5;
-	bra.uni 	$L__BB14_5;
-$L__BB14_5:                             //   Parent Loop BB14_1 Depth=1
-                                        // =>  This Inner Loop Header: Depth=2
-	ld.u32 	%r6, [%SP+36];
-	setp.gt.s32 	%p4, %r6, 4;
-	@%p4 bra 	$L__BB14_8;
-	bra.uni 	$L__BB14_6;
-$L__BB14_6:                             //   in Loop: Header=BB14_5 Depth=2
-	ld.u64 	%rd12, [%SP+16];
-	ld.u32 	%r9, [%SP+36];
-	ld.u32 	%r10, [%SP+32];
-	mul.lo.s32 	%r11, %r10, 5;
-	add.s32 	%r12, %r9, %r11;
-	mul.wide.s32 	%rd13, %r12, 8;
-	add.s64 	%rd14, %rd12, %rd13;
-	ld.f64 	%fd1, [%rd14];
-	ld.u64 	%rd15, [%SP+8];
-	mul.wide.s32 	%rd16, %r9, 8;
-	add.s64 	%rd17, %rd15, %rd16;
-	ld.f64 	%fd2, [%rd17];
-	ld.u64 	%rd18, [%SP+24];
-	mul.wide.s32 	%rd19, %r10, 8;
-	add.s64 	%rd20, %rd18, %rd19;
-	ld.f64 	%fd3, [%rd20];
-	fma.rn.f64 	%fd4, %fd1, %fd2, %fd3;
-	st.f64 	[%rd20], %fd4;
-	bra.uni 	$L__BB14_7;
-$L__BB14_7:                             //   in Loop: Header=BB14_5 Depth=2
-	ld.u32 	%r13, [%SP+36];
-	add.s32 	%r14, %r13, 1;
-	st.u32 	[%SP+36], %r14;
-	bra.uni 	$L__BB14_5;
-$L__BB14_8:                             //   in Loop: Header=BB14_1 Depth=1
-	bra.uni 	$L__BB14_9;
-$L__BB14_9:                             //   in Loop: Header=BB14_1 Depth=1
-	bra.uni 	$L__BB14_10;
-$L__BB14_10:                            //   in Loop: Header=BB14_1 Depth=1
-	ld.u32 	%r7, [%SP+32];
-	add.s32 	%r8, %r7, 1;
-	st.u32 	[%SP+32], %r8;
-	bra.uni 	$L__BB14_1;
-$L__BB14_11:
-	ret;
-                                        // -- End function
-}
-.func  (.param .b32 func_retval0) apply_mass_rs(
-	.param .b64 apply_mass_rs_param_0,
-	.param .b32 apply_mass_rs_param_1,
-	.param .b64 apply_mass_rs_param_2,
-	.param .b64 apply_mass_rs_param_3
-)                                       // -- Begin function apply_mass_rs
-                                        // @apply_mass_rs
-{
-	.reg .pred 	%p<3>;
-	.reg .b32 	%r<2>;
-	.reg .b64 	%rd<19>;
-	.reg .f64 	%fd<4>;
-
-// %bb.0:
-	ld.param.u64 	%rd13, [apply_mass_rs_param_2];
-	ld.u64 	%rd17, [%rd13];
-	ld.param.s32 	%rd15, [apply_mass_rs_param_1];
-	setp.eq.s64 	%p1, %rd15, 0;
-	ld.u64 	%rd16, [%rd13+8];
-	ld.param.u64 	%rd14, [apply_mass_rs_param_3];
-	ld.u64 	%rd18, [%rd14];
-	@%p1 bra 	$L__BB15_1;
-$L__BB15_2:                             // =>This Inner Loop Header: Depth=1
-	ld.f64 	%fd1, [%rd16];
-	ld.f64 	%fd2, [%rd17];
-	mul.rn.f64 	%fd3, %fd1, %fd2;
-	st.f64 	[%rd18], %fd3;
-	add.s64 	%rd18, %rd18, 8;
-	add.s64 	%rd17, %rd17, 8;
-	add.s64 	%rd16, %rd16, 8;
-	add.s64 	%rd15, %rd15, -1;
-	setp.eq.s64 	%p2, %rd15, 0;
-	@%p2 bra 	$L__BB15_1;
-	bra.uni 	$L__BB15_2;
-$L__BB15_1:
-	mov.b32 	%r1, 0;
-	st.param.b32 	[func_retval0], %r1;
-	ret;
-                                        // -- End function
-}
diff --git a/temp_kernel_linked.ll b/temp_kernel_linked.ll
deleted file mode 100644
index 8ef57417..00000000
--- a/temp_kernel_linked.ll
+++ /dev/null
@@ -1,1657 +0,0 @@
-; ModuleID = 'llvm-link'
-source_filename = "llvm-link"
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-%struct.FieldsInt_Cuda = type { [16 x ptr], [16 x ptr] }
-%struct.Points_Cuda = type { i32, ptr, ptr, ptr }
-%struct.SharedData_Cuda = type { i32, i32, i32, i32, ptr }
-
-$_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd = comdat any
-
-$_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd = comdat any
-
-$_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd = comdat any
-
-$_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd = comdat any
-
-$_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-$_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd = comdat any
-
-@slice = external dso_local addrspace(3) global [0 x double], align 8
-@_ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 = internal addrspace(3) global [30 x double] undef, align 8
-
-; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
-define dso_local ptx_kernel void @CeedKernelCudaGenOperator_apply_mass(i32 noundef %0, ptr noundef %1, ptr noundef byval(%struct.FieldsInt_Cuda) align 8 %2, ptr noundef byval(%struct.FieldsInt_Cuda) align 8 %3, ptr noundef byval(%struct.FieldsInt_Cuda) align 8 %4, ptr noundef byval(%struct.FieldsInt_Cuda) align 8 %5, ptr noundef %6, ptr noundef byval(%struct.Points_Cuda) align 8 %7) #0 {
-  %9 = alloca i32, align 4
-  %10 = alloca ptr, align 8
-  %11 = alloca ptr, align 8
-  %12 = alloca ptr, align 8
-  %13 = alloca ptr, align 8
-  %14 = alloca ptr, align 8
-  %15 = alloca i32, align 4
-  %16 = alloca i32, align 4
-  %17 = alloca %struct.SharedData_Cuda, align 8
-  %18 = alloca i32, align 4
-  %19 = alloca i32, align 4
-  %20 = alloca i32, align 4
-  %21 = alloca i32, align 4
-  %22 = alloca i32, align 4
-  %23 = alloca i32, align 4
-  %24 = alloca i32, align 4
-  %25 = alloca i32, align 4
-  %26 = alloca i32, align 4
-  %27 = alloca ptr, align 8
-  %28 = alloca i32, align 4
-  %29 = alloca [6 x double], align 8
-  %30 = alloca ptr, align 8
-  %31 = alloca i32, align 4
-  %32 = alloca i32, align 4
-  %33 = alloca [6 x double], align 8
-  %34 = alloca [6 x double], align 8
-  %35 = alloca i32, align 4
-  %36 = alloca i32, align 4
-  %37 = alloca i32, align 4
-  %38 = alloca ptr, align 8
-  %39 = alloca [6 x double], align 8
-  %40 = alloca ptr, align 8
-  %41 = alloca ptr, align 8
-  %42 = alloca ptr, align 8
-  %43 = alloca [2 x ptr], align 8
-  %44 = alloca [1 x ptr], align 8
-  %45 = alloca ptr, align 8
-  %46 = alloca i32, align 4
-  %47 = alloca i32, align 4
-  store i32 %0, ptr %9, align 4
-  store ptr %1, ptr %10, align 8
-  store ptr %6, ptr %11, align 8
-  %48 = getelementptr inbounds nuw %struct.FieldsInt_Cuda, ptr %3, i32 0, i32 0
-  %49 = getelementptr inbounds [16 x ptr], ptr %48, i64 0, i64 0
-  %50 = load ptr, ptr %49, align 8
-  store ptr %50, ptr %12, align 8
-  %51 = getelementptr inbounds nuw %struct.FieldsInt_Cuda, ptr %3, i32 0, i32 0
-  %52 = getelementptr inbounds [16 x ptr], ptr %51, i64 0, i64 1
-  %53 = load ptr, ptr %52, align 8
-  store ptr %53, ptr %13, align 8
-  %54 = getelementptr inbounds nuw %struct.FieldsInt_Cuda, ptr %3, i32 0, i32 1
-  %55 = getelementptr inbounds [16 x ptr], ptr %54, i64 0, i64 0
-  %56 = load ptr, ptr %55, align 8
-  store ptr %56, ptr %14, align 8
-  store i32 3, ptr %15, align 4
-  store i32 6, ptr %16, align 4
-  %57 = call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %58 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 0
-  store i32 %57, ptr %58, align 8
-  %59 = call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  %60 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 1
-  store i32 %59, ptr %60, align 4
-  %61 = call noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  %62 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 2
-  store i32 %61, ptr %62, align 8
-  %63 = call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %64 = call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  %65 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %66 = mul i32 %64, %65
-  %67 = add i32 %63, %66
-  %68 = call noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  %69 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  %70 = mul i32 %68, %69
-  %71 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %72 = mul i32 %70, %71
-  %73 = add i32 %67, %72
-  %74 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 3
-  store i32 %73, ptr %74, align 4
-  %75 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 2
-  %76 = load i32, ptr %75, align 8
-  %77 = mul nsw i32 %76, 6
-  %78 = mul nsw i32 %77, 6
-  %79 = sext i32 %78 to i64
-  %80 = getelementptr inbounds double, ptr addrspacecast (ptr addrspace(3) @slice to ptr), i64 %79
-  %81 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 4
-  store ptr %80, ptr %81, align 8
-  store i32 3, ptr %18, align 4
-  store i32 5, ptr %19, align 4
-  store i32 1, ptr %20, align 4
-  %82 = getelementptr inbounds nuw %struct.FieldsInt_Cuda, ptr %4, i32 0, i32 0
-  %83 = getelementptr inbounds [16 x ptr], ptr %82, i64 0, i64 0
-  %84 = load ptr, ptr %83, align 8
-  call void @_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %17, ptr noundef %84, ptr noundef addrspacecast (ptr addrspace(3) @_ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 to ptr)) #5
-  store i32 3, ptr %21, align 4
-  store i32 6, ptr %22, align 4
-  store i32 1, ptr %23, align 4
-  store i32 3, ptr %24, align 4
-  store i32 5, ptr %25, align 4
-  store i32 1, ptr %26, align 4
-  store ptr addrspacecast (ptr addrspace(3) @_ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 to ptr), ptr %27, align 8
-  call void @llvm.nvvm.barrier0()
-  %85 = call noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  %86 = call noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  %87 = mul i32 %85, %86
-  %88 = call noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  %89 = add i32 %87, %88
-  store i32 %89, ptr %28, align 4
-  br label %90
-
-90:                                               ; preds = %131, %8
-  %91 = load i32, ptr %28, align 4
-  %92 = load i32, ptr %9, align 4
-  %93 = icmp slt i32 %91, %92
-  br i1 %93, label %94, label %137
-
-94:                                               ; preds = %90
-  %95 = getelementptr inbounds [6 x double], ptr %29, i64 0, i64 0
-  store ptr %95, ptr %30, align 8
-  store i32 274625, ptr %31, align 4
-  store i32 274625, ptr %32, align 4
-  %96 = load i32, ptr %28, align 4
-  %97 = getelementptr inbounds nuw %struct.FieldsInt_Cuda, ptr %2, i32 0, i32 0
-  %98 = getelementptr inbounds [16 x ptr], ptr %97, i64 0, i64 0
-  %99 = load ptr, ptr %98, align 8
-  %100 = load ptr, ptr %12, align 8
-  %101 = load ptr, ptr %30, align 8
-  call void @_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %17, i32 noundef 274625, i32 noundef %96, ptr noundef %99, ptr noundef %100, ptr noundef %101) #5
-  %102 = load ptr, ptr %30, align 8
-  %103 = getelementptr inbounds [6 x double], ptr %33, i64 0, i64 0
-  call void @_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %17, ptr noundef %102, ptr noundef addrspacecast (ptr addrspace(3) @_ZZ36CeedKernelCudaGenOperator_apply_massE8s_B_in_0 to ptr), ptr noundef %103) #5
-  store i32 1, ptr %35, align 4
-  store i32 884736, ptr %36, align 4
-  store i32 216, ptr %37, align 4
-  %104 = load i32, ptr %28, align 4
-  %105 = load ptr, ptr %13, align 8
-  %106 = getelementptr inbounds [6 x double], ptr %34, i64 0, i64 0
-  call void @_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %17, i32 noundef %104, ptr noundef %105, ptr noundef %106) #5
-  %107 = getelementptr inbounds [6 x double], ptr %34, i64 0, i64 0
-  store ptr %107, ptr %38, align 8
-  %108 = getelementptr inbounds [6 x double], ptr %33, i64 0, i64 0
-  store ptr %108, ptr %40, align 8
-  %109 = load ptr, ptr %38, align 8
-  store ptr %109, ptr %41, align 8
-  %110 = getelementptr inbounds [6 x double], ptr %39, i64 0, i64 0
-  store ptr %110, ptr %42, align 8
-  %111 = load ptr, ptr %40, align 8
-  %112 = getelementptr inbounds [2 x ptr], ptr %43, i64 0, i64 0
-  store ptr %111, ptr %112, align 8
-  %113 = load ptr, ptr %41, align 8
-  %114 = getelementptr inbounds [2 x ptr], ptr %43, i64 0, i64 1
-  store ptr %113, ptr %114, align 8
-  %115 = load ptr, ptr %42, align 8
-  %116 = getelementptr inbounds [1 x ptr], ptr %44, i64 0, i64 0
-  store ptr %115, ptr %116, align 8
-  %117 = load ptr, ptr %10, align 8
-  %118 = getelementptr inbounds [2 x ptr], ptr %43, i64 0, i64 0
-  %119 = getelementptr inbounds [1 x ptr], ptr %44, i64 0, i64 0
-  %120 = call noundef i32 @_ZL10apply_massPviPKPKdPKPd(ptr noundef %117, i32 noundef 6, ptr noundef %118, ptr noundef %119) #5
-  %121 = getelementptr inbounds [6 x double], ptr %29, i64 0, i64 0
-  store ptr %121, ptr %45, align 8
-  %122 = getelementptr inbounds [6 x double], ptr %39, i64 0, i64 0
-  %123 = load ptr, ptr %27, align 8
-  %124 = load ptr, ptr %45, align 8
-  call void @_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %17, ptr noundef %122, ptr noundef %123, ptr noundef %124) #5
-  store i32 274625, ptr %46, align 4
-  store i32 274625, ptr %47, align 4
-  %125 = load i32, ptr %28, align 4
-  %126 = getelementptr inbounds nuw %struct.FieldsInt_Cuda, ptr %2, i32 0, i32 1
-  %127 = getelementptr inbounds [16 x ptr], ptr %126, i64 0, i64 0
-  %128 = load ptr, ptr %127, align 8
-  %129 = load ptr, ptr %45, align 8
-  %130 = load ptr, ptr %14, align 8
-  call void @_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %17, i32 noundef 274625, i32 noundef %125, ptr noundef %128, ptr noundef %129, ptr noundef %130) #5
-  br label %131
-
-131:                                              ; preds = %94
-  %132 = call noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-  %133 = call noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  %134 = mul i32 %132, %133
-  %135 = load i32, ptr %28, align 4
-  %136 = add i32 %135, %134
-  store i32 %136, ptr %28, align 4
-  br label %90, !llvm.loop !10
-
-137:                                              ; preds = %90
-  ret void
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.y() #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.z() #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #1
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z10LoadMatrixILi5ELi6EEvR15SharedData_CudaPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noalias noundef %1, ptr noundef %2) #2 comdat {
-  %4 = alloca ptr, align 8
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca i32, align 4
-  store ptr %0, ptr %4, align 8
-  store ptr %1, ptr %5, align 8
-  store ptr %2, ptr %6, align 8
-  %8 = load ptr, ptr %4, align 8
-  %9 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %8, i32 0, i32 3
-  %10 = load i32, ptr %9, align 4
-  store i32 %10, ptr %7, align 4
-  br label %11
-
-11:                                               ; preds = %24, %3
-  %12 = load i32, ptr %7, align 4
-  %13 = icmp slt i32 %12, 30
-  br i1 %13, label %14, label %32
-
-14:                                               ; preds = %11
-  %15 = load ptr, ptr %5, align 8
-  %16 = load i32, ptr %7, align 4
-  %17 = sext i32 %16 to i64
-  %18 = getelementptr inbounds double, ptr %15, i64 %17
-  %19 = load double, ptr %18, align 8
-  %20 = load ptr, ptr %6, align 8
-  %21 = load i32, ptr %7, align 4
-  %22 = sext i32 %21 to i64
-  %23 = getelementptr inbounds double, ptr %20, i64 %22
-  store double %19, ptr %23, align 8
-  br label %24
-
-24:                                               ; preds = %14
-  %25 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %26 = call noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  %27 = mul i32 %25, %26
-  %28 = call noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  %29 = mul i32 %27, %28
-  %30 = load i32, ptr %7, align 4
-  %31 = add i32 %30, %29
-  store i32 %31, ptr %7, align 4
-  br label %11, !llvm.loop !12
-
-32:                                               ; preds = %11
-  ret void
-}
-
-; Function Attrs: convergent nocallback nounwind
-declare void @llvm.nvvm.barrier0() #3
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.z() #1
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z18ReadLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %0, i32 noundef %1, i32 noundef %2, ptr noalias noundef %3, ptr noalias noundef %4, ptr noalias noundef %5) #2 comdat {
-  %7 = alloca ptr, align 8
-  %8 = alloca i32, align 4
-  %9 = alloca i32, align 4
-  %10 = alloca ptr, align 8
-  %11 = alloca ptr, align 8
-  %12 = alloca ptr, align 8
-  %13 = alloca i32, align 4
-  %14 = alloca i32, align 4
-  %15 = alloca i32, align 4
-  %16 = alloca i32, align 4
-  store ptr %0, ptr %7, align 8
-  store i32 %1, ptr %8, align 4
-  store i32 %2, ptr %9, align 4
-  store ptr %3, ptr %10, align 8
-  store ptr %4, ptr %11, align 8
-  store ptr %5, ptr %12, align 8
-  %17 = load ptr, ptr %7, align 8
-  %18 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 0
-  %19 = load i32, ptr %18, align 8
-  %20 = icmp slt i32 %19, 5
-  br i1 %20, label %21, label %80
-
-21:                                               ; preds = %6
-  %22 = load ptr, ptr %7, align 8
-  %23 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %22, i32 0, i32 1
-  %24 = load i32, ptr %23, align 4
-  %25 = icmp slt i32 %24, 5
-  br i1 %25, label %26, label %80
-
-26:                                               ; preds = %21
-  store i32 0, ptr %13, align 4
-  br label %27
-
-27:                                               ; preds = %76, %26
-  %28 = load i32, ptr %13, align 4
-  %29 = icmp slt i32 %28, 5
-  br i1 %29, label %30, label %79
-
-30:                                               ; preds = %27
-  %31 = load ptr, ptr %7, align 8
-  %32 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %31, i32 0, i32 0
-  %33 = load i32, ptr %32, align 8
-  %34 = load ptr, ptr %7, align 8
-  %35 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %34, i32 0, i32 1
-  %36 = load i32, ptr %35, align 4
-  %37 = mul nsw i32 %36, 5
-  %38 = add nsw i32 %33, %37
-  %39 = load i32, ptr %13, align 4
-  %40 = mul nsw i32 %39, 5
-  %41 = mul nsw i32 %40, 5
-  %42 = add nsw i32 %38, %41
-  store i32 %42, ptr %14, align 4
-  %43 = load ptr, ptr %10, align 8
-  %44 = load i32, ptr %14, align 4
-  %45 = load i32, ptr %9, align 4
-  %46 = mul nsw i32 %45, 5
-  %47 = mul nsw i32 %46, 5
-  %48 = mul nsw i32 %47, 5
-  %49 = add nsw i32 %44, %48
-  %50 = sext i32 %49 to i64
-  %51 = getelementptr inbounds i32, ptr %43, i64 %50
-  %52 = load i32, ptr %51, align 4
-  store i32 %52, ptr %15, align 4
-  store i32 0, ptr %16, align 4
-  br label %53
-
-53:                                               ; preds = %72, %30
-  %54 = load i32, ptr %16, align 4
-  %55 = icmp slt i32 %54, 1
-  br i1 %55, label %56, label %75
-
-56:                                               ; preds = %53
-  %57 = load ptr, ptr %11, align 8
-  %58 = load i32, ptr %15, align 4
-  %59 = load i32, ptr %16, align 4
-  %60 = mul nsw i32 274625, %59
-  %61 = add nsw i32 %58, %60
-  %62 = sext i32 %61 to i64
-  %63 = getelementptr inbounds double, ptr %57, i64 %62
-  %64 = load double, ptr %63, align 8
-  %65 = load ptr, ptr %12, align 8
-  %66 = load i32, ptr %13, align 4
-  %67 = load i32, ptr %16, align 4
-  %68 = mul nsw i32 %67, 5
-  %69 = add nsw i32 %66, %68
-  %70 = sext i32 %69 to i64
-  %71 = getelementptr inbounds double, ptr %65, i64 %70
-  store double %64, ptr %71, align 8
-  br label %72
-
-72:                                               ; preds = %56
-  %73 = load i32, ptr %16, align 4
-  %74 = add nsw i32 %73, 1
-  store i32 %74, ptr %16, align 4
-  br label %53, !llvm.loop !13
-
-75:                                               ; preds = %53
-  br label %76
-
-76:                                               ; preds = %75
-  %77 = load i32, ptr %13, align 4
-  %78 = add nsw i32 %77, 1
-  store i32 %78, ptr %13, align 4
-  br label %27, !llvm.loop !14
-
-79:                                               ; preds = %27
-  br label %80
-
-80:                                               ; preds = %79, %21, %6
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z14InterpTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noalias noundef %1, ptr noundef %2, ptr noalias noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [6 x double], align 8
-  %10 = alloca [6 x double], align 8
-  %11 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %11, align 4
-  br label %12
-
-12:                                               ; preds = %36, %4
-  %13 = load i32, ptr %11, align 4
-  %14 = icmp slt i32 %13, 1
-  br i1 %14, label %15, label %39
-
-15:                                               ; preds = %12
-  %16 = load ptr, ptr %5, align 8
-  %17 = load ptr, ptr %6, align 8
-  %18 = load i32, ptr %11, align 4
-  %19 = mul nsw i32 %18, 5
-  %20 = sext i32 %19 to i64
-  %21 = getelementptr inbounds double, ptr %17, i64 %20
-  %22 = load ptr, ptr %7, align 8
-  %23 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 0
-  call void @_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %16, ptr noundef %21, ptr noundef %22, ptr noundef %23) #5
-  %24 = load ptr, ptr %5, align 8
-  %25 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 0
-  %26 = load ptr, ptr %7, align 8
-  %27 = getelementptr inbounds [6 x double], ptr %10, i64 0, i64 0
-  call void @_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %24, ptr noundef %25, ptr noundef %26, ptr noundef %27) #5
-  %28 = load ptr, ptr %5, align 8
-  %29 = getelementptr inbounds [6 x double], ptr %10, i64 0, i64 0
-  %30 = load ptr, ptr %7, align 8
-  %31 = load ptr, ptr %8, align 8
-  %32 = load i32, ptr %11, align 4
-  %33 = mul nsw i32 %32, 6
-  %34 = sext i32 %33 to i64
-  %35 = getelementptr inbounds double, ptr %31, i64 %34
-  call void @_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %28, ptr noundef %29, ptr noundef %30, ptr noundef %35) #5
-  br label %36
-
-36:                                               ; preds = %15
-  %37 = load i32, ptr %11, align 4
-  %38 = add nsw i32 %37, 1
-  store i32 %38, ptr %11, align 4
-  br label %12, !llvm.loop !15
-
-39:                                               ; preds = %12
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z17ReadLVecStrided3dILi1ELi6ELi1ELi884736ELi216EEvR15SharedData_CudaiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %0, i32 noundef %1, ptr noalias noundef %2, ptr noalias noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca i32, align 4
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca i32, align 4
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store i32 %1, ptr %6, align 4
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  %13 = load ptr, ptr %5, align 8
-  %14 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %13, i32 0, i32 0
-  %15 = load i32, ptr %14, align 8
-  %16 = icmp slt i32 %15, 6
-  br i1 %16, label %17, label %71
-
-17:                                               ; preds = %4
-  %18 = load ptr, ptr %5, align 8
-  %19 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %18, i32 0, i32 1
-  %20 = load i32, ptr %19, align 4
-  %21 = icmp slt i32 %20, 6
-  br i1 %21, label %22, label %71
-
-22:                                               ; preds = %17
-  store i32 0, ptr %9, align 4
-  br label %23
-
-23:                                               ; preds = %67, %22
-  %24 = load i32, ptr %9, align 4
-  %25 = icmp slt i32 %24, 6
-  br i1 %25, label %26, label %70
-
-26:                                               ; preds = %23
-  %27 = load ptr, ptr %5, align 8
-  %28 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %27, i32 0, i32 0
-  %29 = load i32, ptr %28, align 8
-  %30 = load ptr, ptr %5, align 8
-  %31 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %30, i32 0, i32 1
-  %32 = load i32, ptr %31, align 4
-  %33 = mul nsw i32 %32, 6
-  %34 = add nsw i32 %29, %33
-  %35 = load i32, ptr %9, align 4
-  %36 = mul nsw i32 %35, 6
-  %37 = mul nsw i32 %36, 6
-  %38 = add nsw i32 %34, %37
-  store i32 %38, ptr %10, align 4
-  %39 = load i32, ptr %10, align 4
-  %40 = mul nsw i32 %39, 1
-  %41 = load i32, ptr %6, align 4
-  %42 = mul nsw i32 %41, 216
-  %43 = add nsw i32 %40, %42
-  store i32 %43, ptr %11, align 4
-  store i32 0, ptr %12, align 4
-  br label %44
-
-44:                                               ; preds = %63, %26
-  %45 = load i32, ptr %12, align 4
-  %46 = icmp slt i32 %45, 1
-  br i1 %46, label %47, label %66
-
-47:                                               ; preds = %44
-  %48 = load ptr, ptr %7, align 8
-  %49 = load i32, ptr %11, align 4
-  %50 = load i32, ptr %12, align 4
-  %51 = mul nsw i32 %50, 884736
-  %52 = add nsw i32 %49, %51
-  %53 = sext i32 %52 to i64
-  %54 = getelementptr inbounds double, ptr %48, i64 %53
-  %55 = load double, ptr %54, align 8
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %9, align 4
-  %58 = load i32, ptr %12, align 4
-  %59 = mul nsw i32 %58, 6
-  %60 = add nsw i32 %57, %59
-  %61 = sext i32 %60 to i64
-  %62 = getelementptr inbounds double, ptr %56, i64 %61
-  store double %55, ptr %62, align 8
-  br label %63
-
-63:                                               ; preds = %47
-  %64 = load i32, ptr %12, align 4
-  %65 = add nsw i32 %64, 1
-  store i32 %65, ptr %12, align 4
-  br label %44, !llvm.loop !16
-
-66:                                               ; preds = %44
-  br label %67
-
-67:                                               ; preds = %66
-  %68 = load i32, ptr %9, align 4
-  %69 = add nsw i32 %68, 1
-  store i32 %69, ptr %9, align 4
-  br label %23, !llvm.loop !17
-
-70:                                               ; preds = %23
-  br label %71
-
-71:                                               ; preds = %70, %17, %4
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define internal noundef i32 @_ZL10apply_massPviPKPKdPKPd(ptr noundef %0, i32 noundef %1, ptr noundef %2, ptr noundef %3) #2 {
-  %5 = alloca ptr, align 8
-  %6 = alloca i32, align 4
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  store ptr %0, ptr %5, align 8
-  store i32 %1, ptr %6, align 4
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  %9 = load ptr, ptr %5, align 8
-  %10 = load i32, ptr %6, align 4
-  %11 = load ptr, ptr %7, align 8
-  %12 = load ptr, ptr %8, align 8
-  %13 = call i32 @apply_mass_rs(ptr noundef %9, i32 noundef %10, ptr noundef %11, ptr noundef %12) #5
-  ret i32 %13
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z23InterpTransposeTensor3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noalias noundef %1, ptr noundef %2, ptr noalias noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [6 x double], align 8
-  %10 = alloca [6 x double], align 8
-  %11 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %11, align 4
-  br label %12
-
-12:                                               ; preds = %36, %4
-  %13 = load i32, ptr %11, align 4
-  %14 = icmp slt i32 %13, 1
-  br i1 %14, label %15, label %39
-
-15:                                               ; preds = %12
-  %16 = load ptr, ptr %5, align 8
-  %17 = load ptr, ptr %6, align 8
-  %18 = load i32, ptr %11, align 4
-  %19 = mul nsw i32 %18, 6
-  %20 = sext i32 %19 to i64
-  %21 = getelementptr inbounds double, ptr %17, i64 %20
-  %22 = load ptr, ptr %7, align 8
-  %23 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 0
-  call void @_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %16, ptr noundef %21, ptr noundef %22, ptr noundef %23) #5
-  %24 = load ptr, ptr %5, align 8
-  %25 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 0
-  %26 = load ptr, ptr %7, align 8
-  %27 = getelementptr inbounds [6 x double], ptr %10, i64 0, i64 0
-  call void @_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %24, ptr noundef %25, ptr noundef %26, ptr noundef %27) #5
-  %28 = load ptr, ptr %5, align 8
-  %29 = getelementptr inbounds [6 x double], ptr %10, i64 0, i64 0
-  %30 = load ptr, ptr %7, align 8
-  %31 = load ptr, ptr %8, align 8
-  %32 = load i32, ptr %11, align 4
-  %33 = mul nsw i32 %32, 5
-  %34 = sext i32 %33 to i64
-  %35 = getelementptr inbounds double, ptr %31, i64 %34
-  call void @_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %28, ptr noundef %29, ptr noundef %30, ptr noundef %35) #5
-  br label %36
-
-36:                                               ; preds = %15
-  %37 = load i32, ptr %11, align 4
-  %38 = add nsw i32 %37, 1
-  store i32 %38, ptr %11, align 4
-  br label %12, !llvm.loop !18
-
-39:                                               ; preds = %12
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z19WriteLVecStandard3dILi1ELi274625ELi5EEvR15SharedData_CudaiiPKiPKdPd(ptr noundef nonnull align 8 dereferenceable(24) %0, i32 noundef %1, i32 noundef %2, ptr noalias noundef %3, ptr noalias noundef %4, ptr noalias noundef %5) #2 comdat {
-  %7 = alloca ptr, align 8
-  %8 = alloca i32, align 4
-  %9 = alloca i32, align 4
-  %10 = alloca ptr, align 8
-  %11 = alloca ptr, align 8
-  %12 = alloca ptr, align 8
-  %13 = alloca i32, align 4
-  %14 = alloca i32, align 4
-  %15 = alloca i32, align 4
-  %16 = alloca i32, align 4
-  store ptr %0, ptr %7, align 8
-  store i32 %1, ptr %8, align 4
-  store i32 %2, ptr %9, align 4
-  store ptr %3, ptr %10, align 8
-  store ptr %4, ptr %11, align 8
-  store ptr %5, ptr %12, align 8
-  %17 = load ptr, ptr %7, align 8
-  %18 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %17, i32 0, i32 0
-  %19 = load i32, ptr %18, align 8
-  %20 = icmp slt i32 %19, 5
-  br i1 %20, label %21, label %81
-
-21:                                               ; preds = %6
-  %22 = load ptr, ptr %7, align 8
-  %23 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %22, i32 0, i32 1
-  %24 = load i32, ptr %23, align 4
-  %25 = icmp slt i32 %24, 5
-  br i1 %25, label %26, label %81
-
-26:                                               ; preds = %21
-  store i32 0, ptr %13, align 4
-  br label %27
-
-27:                                               ; preds = %77, %26
-  %28 = load i32, ptr %13, align 4
-  %29 = icmp slt i32 %28, 5
-  br i1 %29, label %30, label %80
-
-30:                                               ; preds = %27
-  %31 = load ptr, ptr %7, align 8
-  %32 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %31, i32 0, i32 0
-  %33 = load i32, ptr %32, align 8
-  %34 = load ptr, ptr %7, align 8
-  %35 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %34, i32 0, i32 1
-  %36 = load i32, ptr %35, align 4
-  %37 = mul nsw i32 %36, 5
-  %38 = add nsw i32 %33, %37
-  %39 = load i32, ptr %13, align 4
-  %40 = mul nsw i32 %39, 5
-  %41 = mul nsw i32 %40, 5
-  %42 = add nsw i32 %38, %41
-  store i32 %42, ptr %14, align 4
-  %43 = load ptr, ptr %10, align 8
-  %44 = load i32, ptr %14, align 4
-  %45 = load i32, ptr %9, align 4
-  %46 = mul nsw i32 %45, 5
-  %47 = mul nsw i32 %46, 5
-  %48 = mul nsw i32 %47, 5
-  %49 = add nsw i32 %44, %48
-  %50 = sext i32 %49 to i64
-  %51 = getelementptr inbounds i32, ptr %43, i64 %50
-  %52 = load i32, ptr %51, align 4
-  store i32 %52, ptr %15, align 4
-  store i32 0, ptr %16, align 4
-  br label %53
-
-53:                                               ; preds = %73, %30
-  %54 = load i32, ptr %16, align 4
-  %55 = icmp slt i32 %54, 1
-  br i1 %55, label %56, label %76
-
-56:                                               ; preds = %53
-  %57 = load ptr, ptr %12, align 8
-  %58 = load i32, ptr %15, align 4
-  %59 = load i32, ptr %16, align 4
-  %60 = mul nsw i32 274625, %59
-  %61 = add nsw i32 %58, %60
-  %62 = sext i32 %61 to i64
-  %63 = getelementptr inbounds double, ptr %57, i64 %62
-  %64 = load ptr, ptr %11, align 8
-  %65 = load i32, ptr %13, align 4
-  %66 = load i32, ptr %16, align 4
-  %67 = mul nsw i32 %66, 5
-  %68 = add nsw i32 %65, %67
-  %69 = sext i32 %68 to i64
-  %70 = getelementptr inbounds double, ptr %64, i64 %69
-  %71 = load double, ptr %70, align 8
-  %72 = call contract noundef double @_ZL9atomicAddPdd(ptr noundef %63, double noundef %71) #5
-  br label %73
-
-73:                                               ; preds = %56
-  %74 = load i32, ptr %16, align 4
-  %75 = add nsw i32 %74, 1
-  store i32 %75, ptr %16, align 4
-  br label %53, !llvm.loop !19
-
-76:                                               ; preds = %53
-  br label %77
-
-77:                                               ; preds = %76
-  %78 = load i32, ptr %13, align 4
-  %79 = add nsw i32 %78, 1
-  store i32 %79, ptr %13, align 4
-  br label %27, !llvm.loop !20
-
-80:                                               ; preds = %27
-  br label %81
-
-81:                                               ; preds = %80, %21, %6
-  ret void
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #1
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define internal noundef double @_ZL9atomicAddPdd(ptr noundef %0, double noundef %1) #2 {
-  %3 = alloca ptr, align 8
-  %4 = alloca double, align 8
-  %5 = alloca ptr, align 8
-  %6 = alloca double, align 8
-  store ptr %0, ptr %5, align 8
-  store double %1, ptr %6, align 8
-  %7 = load ptr, ptr %5, align 8
-  %8 = load double, ptr %6, align 8
-  store ptr %7, ptr %3, align 8
-  store double %8, ptr %4, align 8
-  %9 = load ptr, ptr %3, align 8
-  %10 = load double, ptr %4, align 8
-  %11 = atomicrmw fadd ptr %9, double %10 seq_cst, align 8
-  ret double %11
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z20ContractTransposeZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca i32, align 4
-  %10 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %9, align 4
-  br label %11
-
-11:                                               ; preds = %58, %4
-  %12 = load i32, ptr %9, align 4
-  %13 = icmp slt i32 %12, 5
-  br i1 %13, label %14, label %61
-
-14:                                               ; preds = %11
-  %15 = load ptr, ptr %8, align 8
-  %16 = load i32, ptr %9, align 4
-  %17 = sext i32 %16 to i64
-  %18 = getelementptr inbounds double, ptr %15, i64 %17
-  store double 0.000000e+00, ptr %18, align 8
-  %19 = load ptr, ptr %5, align 8
-  %20 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %19, i32 0, i32 0
-  %21 = load i32, ptr %20, align 8
-  %22 = icmp slt i32 %21, 6
-  br i1 %22, label %23, label %57
-
-23:                                               ; preds = %14
-  %24 = load ptr, ptr %5, align 8
-  %25 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %24, i32 0, i32 1
-  %26 = load i32, ptr %25, align 4
-  %27 = icmp slt i32 %26, 6
-  br i1 %27, label %28, label %57
-
-28:                                               ; preds = %23
-  store i32 0, ptr %10, align 4
-  br label %29
-
-29:                                               ; preds = %53, %28
-  %30 = load i32, ptr %10, align 4
-  %31 = icmp slt i32 %30, 6
-  br i1 %31, label %32, label %56
-
-32:                                               ; preds = %29
-  %33 = load ptr, ptr %7, align 8
-  %34 = load i32, ptr %9, align 4
-  %35 = load i32, ptr %10, align 4
-  %36 = mul nsw i32 %35, 5
-  %37 = add nsw i32 %34, %36
-  %38 = sext i32 %37 to i64
-  %39 = getelementptr inbounds double, ptr %33, i64 %38
-  %40 = load double, ptr %39, align 8
-  %41 = load ptr, ptr %6, align 8
-  %42 = load i32, ptr %10, align 4
-  %43 = sext i32 %42 to i64
-  %44 = getelementptr inbounds double, ptr %41, i64 %43
-  %45 = load double, ptr %44, align 8
-  %46 = fmul contract double %40, %45
-  %47 = load ptr, ptr %8, align 8
-  %48 = load i32, ptr %9, align 4
-  %49 = sext i32 %48 to i64
-  %50 = getelementptr inbounds double, ptr %47, i64 %49
-  %51 = load double, ptr %50, align 8
-  %52 = fadd contract double %51, %46
-  store double %52, ptr %50, align 8
-  br label %53
-
-53:                                               ; preds = %32
-  %54 = load i32, ptr %10, align 4
-  %55 = add nsw i32 %54, 1
-  store i32 %55, ptr %10, align 4
-  br label %29, !llvm.loop !21
-
-56:                                               ; preds = %29
-  br label %57
-
-57:                                               ; preds = %56, %23, %14
-  br label %58
-
-58:                                               ; preds = %57
-  %59 = load i32, ptr %9, align 4
-  %60 = add nsw i32 %59, 1
-  store i32 %60, ptr %9, align 4
-  br label %11, !llvm.loop !22
-
-61:                                               ; preds = %11
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z20ContractTransposeY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [6 x double], align 8
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %10, align 4
-  br label %13
-
-13:                                               ; preds = %30, %4
-  %14 = load i32, ptr %10, align 4
-  %15 = icmp slt i32 %14, 6
-  br i1 %15, label %16, label %33
-
-16:                                               ; preds = %13
-  %17 = load ptr, ptr %7, align 8
-  %18 = load ptr, ptr %5, align 8
-  %19 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %18, i32 0, i32 1
-  %20 = load i32, ptr %19, align 4
-  %21 = load i32, ptr %10, align 4
-  %22 = mul nsw i32 %21, 5
-  %23 = add nsw i32 %20, %22
-  %24 = sext i32 %23 to i64
-  %25 = getelementptr inbounds double, ptr %17, i64 %24
-  %26 = load double, ptr %25, align 8
-  %27 = load i32, ptr %10, align 4
-  %28 = sext i32 %27 to i64
-  %29 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 %28
-  store double %26, ptr %29, align 8
-  br label %30
-
-30:                                               ; preds = %16
-  %31 = load i32, ptr %10, align 4
-  %32 = add nsw i32 %31, 1
-  store i32 %32, ptr %10, align 4
-  br label %13, !llvm.loop !23
-
-33:                                               ; preds = %13
-  store i32 0, ptr %11, align 4
-  br label %34
-
-34:                                               ; preds = %102, %33
-  %35 = load i32, ptr %11, align 4
-  %36 = icmp slt i32 %35, 5
-  br i1 %36, label %37, label %105
-
-37:                                               ; preds = %34
-  call void @llvm.nvvm.barrier0()
-  %38 = load ptr, ptr %6, align 8
-  %39 = load i32, ptr %11, align 4
-  %40 = sext i32 %39 to i64
-  %41 = getelementptr inbounds double, ptr %38, i64 %40
-  %42 = load double, ptr %41, align 8
-  %43 = load ptr, ptr %5, align 8
-  %44 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %43, i32 0, i32 4
-  %45 = load ptr, ptr %44, align 8
-  %46 = load ptr, ptr %5, align 8
-  %47 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %46, i32 0, i32 0
-  %48 = load i32, ptr %47, align 8
-  %49 = load ptr, ptr %5, align 8
-  %50 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %49, i32 0, i32 1
-  %51 = load i32, ptr %50, align 4
-  %52 = mul nsw i32 %51, 6
-  %53 = add nsw i32 %48, %52
-  %54 = sext i32 %53 to i64
-  %55 = getelementptr inbounds double, ptr %45, i64 %54
-  store double %42, ptr %55, align 8
-  call void @llvm.nvvm.barrier0()
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %11, align 4
-  %58 = sext i32 %57 to i64
-  %59 = getelementptr inbounds double, ptr %56, i64 %58
-  store double 0.000000e+00, ptr %59, align 8
-  %60 = load ptr, ptr %5, align 8
-  %61 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %60, i32 0, i32 0
-  %62 = load i32, ptr %61, align 8
-  %63 = icmp slt i32 %62, 6
-  br i1 %63, label %64, label %101
-
-64:                                               ; preds = %37
-  %65 = load ptr, ptr %5, align 8
-  %66 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %65, i32 0, i32 1
-  %67 = load i32, ptr %66, align 4
-  %68 = icmp slt i32 %67, 5
-  br i1 %68, label %69, label %101
-
-69:                                               ; preds = %64
-  store i32 0, ptr %12, align 4
-  br label %70
-
-70:                                               ; preds = %97, %69
-  %71 = load i32, ptr %12, align 4
-  %72 = icmp slt i32 %71, 6
-  br i1 %72, label %73, label %100
-
-73:                                               ; preds = %70
-  %74 = load i32, ptr %12, align 4
-  %75 = sext i32 %74 to i64
-  %76 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 %75
-  %77 = load double, ptr %76, align 8
-  %78 = load ptr, ptr %5, align 8
-  %79 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %78, i32 0, i32 4
-  %80 = load ptr, ptr %79, align 8
-  %81 = load ptr, ptr %5, align 8
-  %82 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %81, i32 0, i32 0
-  %83 = load i32, ptr %82, align 8
-  %84 = load i32, ptr %12, align 4
-  %85 = mul nsw i32 %84, 6
-  %86 = add nsw i32 %83, %85
-  %87 = sext i32 %86 to i64
-  %88 = getelementptr inbounds double, ptr %80, i64 %87
-  %89 = load double, ptr %88, align 8
-  %90 = fmul contract double %77, %89
-  %91 = load ptr, ptr %8, align 8
-  %92 = load i32, ptr %11, align 4
-  %93 = sext i32 %92 to i64
-  %94 = getelementptr inbounds double, ptr %91, i64 %93
-  %95 = load double, ptr %94, align 8
-  %96 = fadd contract double %95, %90
-  store double %96, ptr %94, align 8
-  br label %97
-
-97:                                               ; preds = %73
-  %98 = load i32, ptr %12, align 4
-  %99 = add nsw i32 %98, 1
-  store i32 %99, ptr %12, align 4
-  br label %70, !llvm.loop !24
-
-100:                                              ; preds = %70
-  br label %101
-
-101:                                              ; preds = %100, %64, %37
-  br label %102
-
-102:                                              ; preds = %101
-  %103 = load i32, ptr %11, align 4
-  %104 = add nsw i32 %103, 1
-  store i32 %104, ptr %11, align 4
-  br label %34, !llvm.loop !25
-
-105:                                              ; preds = %34
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z20ContractTransposeX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [6 x double], align 8
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %10, align 4
-  br label %13
-
-13:                                               ; preds = %30, %4
-  %14 = load i32, ptr %10, align 4
-  %15 = icmp slt i32 %14, 6
-  br i1 %15, label %16, label %33
-
-16:                                               ; preds = %13
-  %17 = load ptr, ptr %7, align 8
-  %18 = load ptr, ptr %5, align 8
-  %19 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %18, i32 0, i32 0
-  %20 = load i32, ptr %19, align 8
-  %21 = load i32, ptr %10, align 4
-  %22 = mul nsw i32 %21, 5
-  %23 = add nsw i32 %20, %22
-  %24 = sext i32 %23 to i64
-  %25 = getelementptr inbounds double, ptr %17, i64 %24
-  %26 = load double, ptr %25, align 8
-  %27 = load i32, ptr %10, align 4
-  %28 = sext i32 %27 to i64
-  %29 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 %28
-  store double %26, ptr %29, align 8
-  br label %30
-
-30:                                               ; preds = %16
-  %31 = load i32, ptr %10, align 4
-  %32 = add nsw i32 %31, 1
-  store i32 %32, ptr %10, align 4
-  br label %13, !llvm.loop !26
-
-33:                                               ; preds = %13
-  store i32 0, ptr %11, align 4
-  br label %34
-
-34:                                               ; preds = %102, %33
-  %35 = load i32, ptr %11, align 4
-  %36 = icmp slt i32 %35, 5
-  br i1 %36, label %37, label %105
-
-37:                                               ; preds = %34
-  call void @llvm.nvvm.barrier0()
-  %38 = load ptr, ptr %6, align 8
-  %39 = load i32, ptr %11, align 4
-  %40 = sext i32 %39 to i64
-  %41 = getelementptr inbounds double, ptr %38, i64 %40
-  %42 = load double, ptr %41, align 8
-  %43 = load ptr, ptr %5, align 8
-  %44 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %43, i32 0, i32 4
-  %45 = load ptr, ptr %44, align 8
-  %46 = load ptr, ptr %5, align 8
-  %47 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %46, i32 0, i32 0
-  %48 = load i32, ptr %47, align 8
-  %49 = load ptr, ptr %5, align 8
-  %50 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %49, i32 0, i32 1
-  %51 = load i32, ptr %50, align 4
-  %52 = mul nsw i32 %51, 6
-  %53 = add nsw i32 %48, %52
-  %54 = sext i32 %53 to i64
-  %55 = getelementptr inbounds double, ptr %45, i64 %54
-  store double %42, ptr %55, align 8
-  call void @llvm.nvvm.barrier0()
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %11, align 4
-  %58 = sext i32 %57 to i64
-  %59 = getelementptr inbounds double, ptr %56, i64 %58
-  store double 0.000000e+00, ptr %59, align 8
-  %60 = load ptr, ptr %5, align 8
-  %61 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %60, i32 0, i32 0
-  %62 = load i32, ptr %61, align 8
-  %63 = icmp slt i32 %62, 5
-  br i1 %63, label %64, label %101
-
-64:                                               ; preds = %37
-  %65 = load ptr, ptr %5, align 8
-  %66 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %65, i32 0, i32 1
-  %67 = load i32, ptr %66, align 4
-  %68 = icmp slt i32 %67, 5
-  br i1 %68, label %69, label %101
-
-69:                                               ; preds = %64
-  store i32 0, ptr %12, align 4
-  br label %70
-
-70:                                               ; preds = %97, %69
-  %71 = load i32, ptr %12, align 4
-  %72 = icmp slt i32 %71, 6
-  br i1 %72, label %73, label %100
-
-73:                                               ; preds = %70
-  %74 = load i32, ptr %12, align 4
-  %75 = sext i32 %74 to i64
-  %76 = getelementptr inbounds [6 x double], ptr %9, i64 0, i64 %75
-  %77 = load double, ptr %76, align 8
-  %78 = load ptr, ptr %5, align 8
-  %79 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %78, i32 0, i32 4
-  %80 = load ptr, ptr %79, align 8
-  %81 = load i32, ptr %12, align 4
-  %82 = load ptr, ptr %5, align 8
-  %83 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %82, i32 0, i32 1
-  %84 = load i32, ptr %83, align 4
-  %85 = mul nsw i32 %84, 6
-  %86 = add nsw i32 %81, %85
-  %87 = sext i32 %86 to i64
-  %88 = getelementptr inbounds double, ptr %80, i64 %87
-  %89 = load double, ptr %88, align 8
-  %90 = fmul contract double %77, %89
-  %91 = load ptr, ptr %8, align 8
-  %92 = load i32, ptr %11, align 4
-  %93 = sext i32 %92 to i64
-  %94 = getelementptr inbounds double, ptr %91, i64 %93
-  %95 = load double, ptr %94, align 8
-  %96 = fadd contract double %95, %90
-  store double %96, ptr %94, align 8
-  br label %97
-
-97:                                               ; preds = %73
-  %98 = load i32, ptr %12, align 4
-  %99 = add nsw i32 %98, 1
-  store i32 %99, ptr %12, align 4
-  br label %70, !llvm.loop !27
-
-100:                                              ; preds = %70
-  br label %101
-
-101:                                              ; preds = %100, %64, %37
-  br label %102
-
-102:                                              ; preds = %101
-  %103 = load i32, ptr %11, align 4
-  %104 = add nsw i32 %103, 1
-  store i32 %104, ptr %11, align 4
-  br label %34, !llvm.loop !28
-
-105:                                              ; preds = %34
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z11ContractX3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [5 x double], align 8
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %10, align 4
-  br label %13
-
-13:                                               ; preds = %30, %4
-  %14 = load i32, ptr %10, align 4
-  %15 = icmp slt i32 %14, 5
-  br i1 %15, label %16, label %33
-
-16:                                               ; preds = %13
-  %17 = load ptr, ptr %7, align 8
-  %18 = load i32, ptr %10, align 4
-  %19 = load ptr, ptr %5, align 8
-  %20 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %19, i32 0, i32 0
-  %21 = load i32, ptr %20, align 8
-  %22 = mul nsw i32 %21, 5
-  %23 = add nsw i32 %18, %22
-  %24 = sext i32 %23 to i64
-  %25 = getelementptr inbounds double, ptr %17, i64 %24
-  %26 = load double, ptr %25, align 8
-  %27 = load i32, ptr %10, align 4
-  %28 = sext i32 %27 to i64
-  %29 = getelementptr inbounds [5 x double], ptr %9, i64 0, i64 %28
-  store double %26, ptr %29, align 8
-  br label %30
-
-30:                                               ; preds = %16
-  %31 = load i32, ptr %10, align 4
-  %32 = add nsw i32 %31, 1
-  store i32 %32, ptr %10, align 4
-  br label %13, !llvm.loop !29
-
-33:                                               ; preds = %13
-  store i32 0, ptr %11, align 4
-  br label %34
-
-34:                                               ; preds = %102, %33
-  %35 = load i32, ptr %11, align 4
-  %36 = icmp slt i32 %35, 5
-  br i1 %36, label %37, label %105
-
-37:                                               ; preds = %34
-  call void @llvm.nvvm.barrier0()
-  %38 = load ptr, ptr %6, align 8
-  %39 = load i32, ptr %11, align 4
-  %40 = sext i32 %39 to i64
-  %41 = getelementptr inbounds double, ptr %38, i64 %40
-  %42 = load double, ptr %41, align 8
-  %43 = load ptr, ptr %5, align 8
-  %44 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %43, i32 0, i32 4
-  %45 = load ptr, ptr %44, align 8
-  %46 = load ptr, ptr %5, align 8
-  %47 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %46, i32 0, i32 0
-  %48 = load i32, ptr %47, align 8
-  %49 = load ptr, ptr %5, align 8
-  %50 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %49, i32 0, i32 1
-  %51 = load i32, ptr %50, align 4
-  %52 = mul nsw i32 %51, 6
-  %53 = add nsw i32 %48, %52
-  %54 = sext i32 %53 to i64
-  %55 = getelementptr inbounds double, ptr %45, i64 %54
-  store double %42, ptr %55, align 8
-  call void @llvm.nvvm.barrier0()
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %11, align 4
-  %58 = sext i32 %57 to i64
-  %59 = getelementptr inbounds double, ptr %56, i64 %58
-  store double 0.000000e+00, ptr %59, align 8
-  %60 = load ptr, ptr %5, align 8
-  %61 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %60, i32 0, i32 0
-  %62 = load i32, ptr %61, align 8
-  %63 = icmp slt i32 %62, 6
-  br i1 %63, label %64, label %101
-
-64:                                               ; preds = %37
-  %65 = load ptr, ptr %5, align 8
-  %66 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %65, i32 0, i32 1
-  %67 = load i32, ptr %66, align 4
-  %68 = icmp slt i32 %67, 5
-  br i1 %68, label %69, label %101
-
-69:                                               ; preds = %64
-  store i32 0, ptr %12, align 4
-  br label %70
-
-70:                                               ; preds = %97, %69
-  %71 = load i32, ptr %12, align 4
-  %72 = icmp slt i32 %71, 5
-  br i1 %72, label %73, label %100
-
-73:                                               ; preds = %70
-  %74 = load i32, ptr %12, align 4
-  %75 = sext i32 %74 to i64
-  %76 = getelementptr inbounds [5 x double], ptr %9, i64 0, i64 %75
-  %77 = load double, ptr %76, align 8
-  %78 = load ptr, ptr %5, align 8
-  %79 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %78, i32 0, i32 4
-  %80 = load ptr, ptr %79, align 8
-  %81 = load i32, ptr %12, align 4
-  %82 = load ptr, ptr %5, align 8
-  %83 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %82, i32 0, i32 1
-  %84 = load i32, ptr %83, align 4
-  %85 = mul nsw i32 %84, 6
-  %86 = add nsw i32 %81, %85
-  %87 = sext i32 %86 to i64
-  %88 = getelementptr inbounds double, ptr %80, i64 %87
-  %89 = load double, ptr %88, align 8
-  %90 = fmul contract double %77, %89
-  %91 = load ptr, ptr %8, align 8
-  %92 = load i32, ptr %11, align 4
-  %93 = sext i32 %92 to i64
-  %94 = getelementptr inbounds double, ptr %91, i64 %93
-  %95 = load double, ptr %94, align 8
-  %96 = fadd contract double %95, %90
-  store double %96, ptr %94, align 8
-  br label %97
-
-97:                                               ; preds = %73
-  %98 = load i32, ptr %12, align 4
-  %99 = add nsw i32 %98, 1
-  store i32 %99, ptr %12, align 4
-  br label %70, !llvm.loop !30
-
-100:                                              ; preds = %70
-  br label %101
-
-101:                                              ; preds = %100, %64, %37
-  br label %102
-
-102:                                              ; preds = %101
-  %103 = load i32, ptr %11, align 4
-  %104 = add nsw i32 %103, 1
-  store i32 %104, ptr %11, align 4
-  br label %34, !llvm.loop !31
-
-105:                                              ; preds = %34
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z11ContractY3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca [5 x double], align 8
-  %10 = alloca i32, align 4
-  %11 = alloca i32, align 4
-  %12 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %10, align 4
-  br label %13
-
-13:                                               ; preds = %30, %4
-  %14 = load i32, ptr %10, align 4
-  %15 = icmp slt i32 %14, 5
-  br i1 %15, label %16, label %33
-
-16:                                               ; preds = %13
-  %17 = load ptr, ptr %7, align 8
-  %18 = load i32, ptr %10, align 4
-  %19 = load ptr, ptr %5, align 8
-  %20 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %19, i32 0, i32 1
-  %21 = load i32, ptr %20, align 4
-  %22 = mul nsw i32 %21, 5
-  %23 = add nsw i32 %18, %22
-  %24 = sext i32 %23 to i64
-  %25 = getelementptr inbounds double, ptr %17, i64 %24
-  %26 = load double, ptr %25, align 8
-  %27 = load i32, ptr %10, align 4
-  %28 = sext i32 %27 to i64
-  %29 = getelementptr inbounds [5 x double], ptr %9, i64 0, i64 %28
-  store double %26, ptr %29, align 8
-  br label %30
-
-30:                                               ; preds = %16
-  %31 = load i32, ptr %10, align 4
-  %32 = add nsw i32 %31, 1
-  store i32 %32, ptr %10, align 4
-  br label %13, !llvm.loop !32
-
-33:                                               ; preds = %13
-  store i32 0, ptr %11, align 4
-  br label %34
-
-34:                                               ; preds = %102, %33
-  %35 = load i32, ptr %11, align 4
-  %36 = icmp slt i32 %35, 5
-  br i1 %36, label %37, label %105
-
-37:                                               ; preds = %34
-  call void @llvm.nvvm.barrier0()
-  %38 = load ptr, ptr %6, align 8
-  %39 = load i32, ptr %11, align 4
-  %40 = sext i32 %39 to i64
-  %41 = getelementptr inbounds double, ptr %38, i64 %40
-  %42 = load double, ptr %41, align 8
-  %43 = load ptr, ptr %5, align 8
-  %44 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %43, i32 0, i32 4
-  %45 = load ptr, ptr %44, align 8
-  %46 = load ptr, ptr %5, align 8
-  %47 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %46, i32 0, i32 0
-  %48 = load i32, ptr %47, align 8
-  %49 = load ptr, ptr %5, align 8
-  %50 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %49, i32 0, i32 1
-  %51 = load i32, ptr %50, align 4
-  %52 = mul nsw i32 %51, 6
-  %53 = add nsw i32 %48, %52
-  %54 = sext i32 %53 to i64
-  %55 = getelementptr inbounds double, ptr %45, i64 %54
-  store double %42, ptr %55, align 8
-  call void @llvm.nvvm.barrier0()
-  %56 = load ptr, ptr %8, align 8
-  %57 = load i32, ptr %11, align 4
-  %58 = sext i32 %57 to i64
-  %59 = getelementptr inbounds double, ptr %56, i64 %58
-  store double 0.000000e+00, ptr %59, align 8
-  %60 = load ptr, ptr %5, align 8
-  %61 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %60, i32 0, i32 0
-  %62 = load i32, ptr %61, align 8
-  %63 = icmp slt i32 %62, 6
-  br i1 %63, label %64, label %101
-
-64:                                               ; preds = %37
-  %65 = load ptr, ptr %5, align 8
-  %66 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %65, i32 0, i32 1
-  %67 = load i32, ptr %66, align 4
-  %68 = icmp slt i32 %67, 6
-  br i1 %68, label %69, label %101
-
-69:                                               ; preds = %64
-  store i32 0, ptr %12, align 4
-  br label %70
-
-70:                                               ; preds = %97, %69
-  %71 = load i32, ptr %12, align 4
-  %72 = icmp slt i32 %71, 5
-  br i1 %72, label %73, label %100
-
-73:                                               ; preds = %70
-  %74 = load i32, ptr %12, align 4
-  %75 = sext i32 %74 to i64
-  %76 = getelementptr inbounds [5 x double], ptr %9, i64 0, i64 %75
-  %77 = load double, ptr %76, align 8
-  %78 = load ptr, ptr %5, align 8
-  %79 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %78, i32 0, i32 4
-  %80 = load ptr, ptr %79, align 8
-  %81 = load ptr, ptr %5, align 8
-  %82 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %81, i32 0, i32 0
-  %83 = load i32, ptr %82, align 8
-  %84 = load i32, ptr %12, align 4
-  %85 = mul nsw i32 %84, 6
-  %86 = add nsw i32 %83, %85
-  %87 = sext i32 %86 to i64
-  %88 = getelementptr inbounds double, ptr %80, i64 %87
-  %89 = load double, ptr %88, align 8
-  %90 = fmul contract double %77, %89
-  %91 = load ptr, ptr %8, align 8
-  %92 = load i32, ptr %11, align 4
-  %93 = sext i32 %92 to i64
-  %94 = getelementptr inbounds double, ptr %91, i64 %93
-  %95 = load double, ptr %94, align 8
-  %96 = fadd contract double %95, %90
-  store double %96, ptr %94, align 8
-  br label %97
-
-97:                                               ; preds = %73
-  %98 = load i32, ptr %12, align 4
-  %99 = add nsw i32 %98, 1
-  store i32 %99, ptr %12, align 4
-  br label %70, !llvm.loop !33
-
-100:                                              ; preds = %70
-  br label %101
-
-101:                                              ; preds = %100, %64, %37
-  br label %102
-
-102:                                              ; preds = %101
-  %103 = load i32, ptr %11, align 4
-  %104 = add nsw i32 %103, 1
-  store i32 %104, ptr %11, align 4
-  br label %34, !llvm.loop !34
-
-105:                                              ; preds = %34
-  ret void
-}
-
-; Function Attrs: convergent mustprogress noinline nounwind optnone
-define linkonce_odr dso_local void @_Z11ContractZ3dILi1ELi5ELi6ELi6EEvR15SharedData_CudaPKdS3_Pd(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) #2 comdat {
-  %5 = alloca ptr, align 8
-  %6 = alloca ptr, align 8
-  %7 = alloca ptr, align 8
-  %8 = alloca ptr, align 8
-  %9 = alloca i32, align 4
-  %10 = alloca i32, align 4
-  store ptr %0, ptr %5, align 8
-  store ptr %1, ptr %6, align 8
-  store ptr %2, ptr %7, align 8
-  store ptr %3, ptr %8, align 8
-  store i32 0, ptr %9, align 4
-  br label %11
-
-11:                                               ; preds = %58, %4
-  %12 = load i32, ptr %9, align 4
-  %13 = icmp slt i32 %12, 6
-  br i1 %13, label %14, label %61
-
-14:                                               ; preds = %11
-  %15 = load ptr, ptr %8, align 8
-  %16 = load i32, ptr %9, align 4
-  %17 = sext i32 %16 to i64
-  %18 = getelementptr inbounds double, ptr %15, i64 %17
-  store double 0.000000e+00, ptr %18, align 8
-  %19 = load ptr, ptr %5, align 8
-  %20 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %19, i32 0, i32 0
-  %21 = load i32, ptr %20, align 8
-  %22 = icmp slt i32 %21, 6
-  br i1 %22, label %23, label %57
-
-23:                                               ; preds = %14
-  %24 = load ptr, ptr %5, align 8
-  %25 = getelementptr inbounds nuw %struct.SharedData_Cuda, ptr %24, i32 0, i32 1
-  %26 = load i32, ptr %25, align 4
-  %27 = icmp slt i32 %26, 6
-  br i1 %27, label %28, label %57
-
-28:                                               ; preds = %23
-  store i32 0, ptr %10, align 4
-  br label %29
-
-29:                                               ; preds = %53, %28
-  %30 = load i32, ptr %10, align 4
-  %31 = icmp slt i32 %30, 5
-  br i1 %31, label %32, label %56
-
-32:                                               ; preds = %29
-  %33 = load ptr, ptr %7, align 8
-  %34 = load i32, ptr %10, align 4
-  %35 = load i32, ptr %9, align 4
-  %36 = mul nsw i32 %35, 5
-  %37 = add nsw i32 %34, %36
-  %38 = sext i32 %37 to i64
-  %39 = getelementptr inbounds double, ptr %33, i64 %38
-  %40 = load double, ptr %39, align 8
-  %41 = load ptr, ptr %6, align 8
-  %42 = load i32, ptr %10, align 4
-  %43 = sext i32 %42 to i64
-  %44 = getelementptr inbounds double, ptr %41, i64 %43
-  %45 = load double, ptr %44, align 8
-  %46 = fmul contract double %40, %45
-  %47 = load ptr, ptr %8, align 8
-  %48 = load i32, ptr %9, align 4
-  %49 = sext i32 %48 to i64
-  %50 = getelementptr inbounds double, ptr %47, i64 %49
-  %51 = load double, ptr %50, align 8
-  %52 = fadd contract double %51, %46
-  store double %52, ptr %50, align 8
-  br label %53
-
-53:                                               ; preds = %32
-  %54 = load i32, ptr %10, align 4
-  %55 = add nsw i32 %54, 1
-  store i32 %55, ptr %10, align 4
-  br label %29, !llvm.loop !35
-
-56:                                               ; preds = %29
-  br label %57
-
-57:                                               ; preds = %56, %23, %14
-  br label %58
-
-58:                                               ; preds = %57
-  %59 = load i32, ptr %9, align 4
-  %60 = add nsw i32 %59, 1
-  store i32 %60, ptr %9, align 4
-  br label %11, !llvm.loop !36
-
-61:                                               ; preds = %11
-  ret void
-}
-
-; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none)
-define internal noundef signext i8 @apply_mass_rs(ptr nocapture noundef readnone %0, i32 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readonly %3) #4 {
-  %5 = load ptr, ptr %2, align 8, !noundef !37
-  %6 = getelementptr inbounds nuw i8, ptr %2, i64 8
-  %7 = load ptr, ptr %6, align 8, !noundef !37
-  %8 = sext i32 %1 to i64
-  %9 = load ptr, ptr %3, align 8, !noundef !37
-  %10 = icmp eq i32 %1, 0
-  br i1 %10, label %11, label %12
-
-11:                                               ; preds = %12, %4
-  ret i8 0
-
-12:                                               ; preds = %12, %4
-  %13 = phi i64 [ %14, %12 ], [ 0, %4 ]
-  %14 = add nuw i64 %13, 1
-  %15 = getelementptr inbounds nuw double, ptr %7, i64 %13
-  %16 = load double, ptr %15, align 8, !noundef !37
-  %17 = getelementptr inbounds nuw double, ptr %5, i64 %13
-  %18 = load double, ptr %17, align 8, !noundef !37
-  %19 = getelementptr inbounds nuw double, ptr %9, i64 %13
-  %20 = fmul double %16, %18
-  store double %20, ptr %19, align 8
-  %21 = icmp eq i64 %14, %8
-  br i1 %21, label %11, label %12
-}
-
-attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_89" "target-features"="+ptx87,+sm_89" "uniform-work-group-size"="true" }
-attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-attributes #2 = { convergent mustprogress noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_89" "target-features"="+ptx87,+sm_89" }
-attributes #3 = { convergent nocallback nounwind }
-attributes #4 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "target-cpu"="sm_30" }
-attributes #5 = { convergent nounwind }
-
-!nvvm.annotations = !{!0}
-!llvm.ident = !{!1, !2, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3, !3}
-!nvvmir.version = !{!4}
-!llvm.module.flags = !{!5, !6, !7, !8, !9}
-
-!0 = !{ptr @CeedKernelCudaGenOperator_apply_mass}
-!1 = !{!"clang version 20.1.8"}
-!2 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
-!3 = !{!"rustc version 1.88.0-nightly (e9f8103f9 2025-05-07)"}
-!4 = !{i32 2, i32 0}
-!5 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 8]}
-!6 = !{i32 1, !"wchar_size", i32 4}
-!7 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!8 = !{i32 7, !"frame-pointer", i32 2}
-!9 = !{i32 8, !"PIC Level", i32 0}
-!10 = distinct !{!10, !11}
-!11 = !{!"llvm.loop.mustprogress"}
-!12 = distinct !{!12, !11}
-!13 = distinct !{!13, !11}
-!14 = distinct !{!14, !11}
-!15 = distinct !{!15, !11}
-!16 = distinct !{!16, !11}
-!17 = distinct !{!17, !11}
-!18 = distinct !{!18, !11}
-!19 = distinct !{!19, !11}
-!20 = distinct !{!20, !11}
-!21 = distinct !{!21, !11}
-!22 = distinct !{!22, !11}
-!23 = distinct !{!23, !11}
-!24 = distinct !{!24, !11}
-!25 = distinct !{!25, !11}
-!26 = distinct !{!26, !11}
-!27 = distinct !{!27, !11}
-!28 = distinct !{!28, !11}
-!29 = distinct !{!29, !11}
-!30 = distinct !{!30, !11}
-!31 = distinct !{!31, !11}
-!32 = distinct !{!32, !11}
-!33 = distinct !{!33, !11}
-!34 = distinct !{!34, !11}
-!35 = distinct !{!35, !11}
-!36 = distinct !{!36, !11}
-!37 = !{}
diff --git a/temp_kernel_opt.bc b/temp_kernel_opt.bc
deleted file mode 100644
index a6e580e2..00000000
Binary files a/temp_kernel_opt.bc and /dev/null differ
diff --git a/temp_kernel_source.cu b/temp_kernel_source.cu
deleted file mode 100644
index 580f7e34..00000000
--- a/temp_kernel_source.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-#define OP_T_1D 6
-#include <ceed/jit-source/cuda/cuda-jit.h>
-
-// Tensor basis source
-#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>
-
-// CodeGen operator source
-#include <ceed/jit-source/cuda/cuda-gen-templates.h>
-
-
-#undef CEED_Q_VLA
-#define CEED_Q_VLA 6
-
-// User QFunction source
-#include "/home/jeremy/Dev/libCEED-rust/examples/rust-qfunctions/ex1-volume.h"
-
-
-// -----------------------------------------------------------------------------
-// Operator Kernel
-// 
-// d_[in,out]_i:   CeedVector device array
-// r_[in,out]_e_i: Element vector register
-// r_[in,out]_q_i: Quadrature space vector register
-// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
-// r_[in,out]_s_i: Quadrature space slice vector register
-// 
-// s_B_[in,out]_i: Interpolation matrix, shared memory
-// s_G_[in,out]_i: Gradient matrix, shared memory
-// -----------------------------------------------------------------------------
-extern "C" __global__ void CeedKernelCudaGenOperator_apply_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
-  const CeedScalar *__restrict__ d_in_0 = fields.inputs[0];
-  const CeedScalar *__restrict__ d_in_1 = fields.inputs[1];
-  CeedScalar *__restrict__ d_out_0 = fields.outputs[0];
-  const CeedInt max_dim = 3;
-  const CeedInt Q_1d = 6;
-  extern __shared__ CeedScalar slice[];
-  SharedData_Cuda data;
-  data.t_id_x = threadIdx.x;
-  data.t_id_y = threadIdx.y;
-  data.t_id_z = threadIdx.z;
-  data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
-  data.slice  = slice + data.t_id_z*OP_T_1D*OP_T_1D;
-
-  // Input field constants and basis data
-  // -- Input field 0: u
-  const CeedInt dim_in_0 = 3;
-  const CeedInt P_1d_in_0 = 5;
-  const CeedInt num_comp_in_0 = 1;
-  // EvalMode: interpolation
-  __shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
-  LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
-  // -- Input field 1: qdata
-  const CeedInt dim_in_1 = 3;
-  const CeedInt P_1d_in_1 = 6;
-  const CeedInt num_comp_in_1 = 1;
-  // EvalMode: none
-
-  // Output field constants and basis data
-  // -- Output field 0: v
-  const CeedInt dim_out_0 = 3;
-  const CeedInt P_1d_out_0 = 5;
-  const CeedInt num_comp_out_0 = 1;
-  // EvalMode: interpolation
-  CeedScalar *s_B_out_0 = s_B_in_0;
-
-  // Element loop
-  __syncthreads();
-  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
-    // Scratch restriction buffer space
-    CeedScalar r_e_scratch[6];
-
-    // -- Input field restrictions and basis actions
-    // ---- Input field 0: u
-    CeedScalar *r_e_in_0 = r_e_scratch;
-    const CeedInt l_size_in_0 = 274625;
-    const CeedInt comp_stride_in_0 = 274625;
-    ReadLVecStandard3d<num_comp_in_0, comp_stride_in_0, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
-    // EvalMode: interpolation
-    CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
-    InterpTensor3d<num_comp_in_0, P_1d_in_0, Q_1d, OP_T_1D>(data, r_e_in_0, s_B_in_0, r_q_in_0);
-    // ---- Input field 1: qdata
-    CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
-    const CeedInt strides_in_1_0 = 1, strides_in_1_1 = 884736, strides_in_1_2 = 216;
-    ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, strides_in_1_0, strides_in_1_1, strides_in_1_2>(data, elem, d_in_1, r_e_in_1);
-    // EvalMode: none
-    CeedScalar *r_q_in_1 = r_e_in_1;
-
-    // -- Output field setup
-    // ---- Output field 0: v
-    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
-
-    // Note: Using full elements
-    {
-      // -- Input fields
-      // ---- Input field 0: u
-      CeedScalar *r_s_in_0 = r_q_in_0;
-      // ---- Input field 1: qdata
-      CeedScalar *r_s_in_1 = r_q_in_1;
-      // -- Output fields
-      // ---- Output field 0: v
-      CeedScalar *r_s_out_0 = r_q_out_0;
-
-      // -- QFunction inputs and outputs
-      // ---- Inputs
-      CeedScalar *inputs[2];
-      // ------ Input field 0: u
-      inputs[0] = r_s_in_0;
-      // ------ Input field 1: qdata
-      inputs[1] = r_s_in_1;
-      // ---- Outputs
-      CeedScalar *outputs[1];
-      // ------ Output field 0: v
-      outputs[0] = r_s_out_0;
-
-      // -- Apply QFunction
-      apply_mass(ctx, Q_1d, inputs, outputs);
-    }
-
-    // -- Output field basis action and restrictions
-    // ---- Output field 0: v
-    // EvalMode: interpolation
-    CeedScalar *r_e_out_0 = r_e_scratch;
-    InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d, OP_T_1D>(data, r_q_out_0, s_B_out_0, r_e_out_0);
-    const CeedInt l_size_out_0 = 274625;
-    const CeedInt comp_stride_out_0 = 274625;
-    WriteLVecStandard3d<num_comp_out_0, comp_stride_out_0, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
-  }
-}
-// -----------------------------------------------------------------------------
-
diff --git a/tests/junit.py b/tests/junit.py
index b0144454..7237594b 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -51,6 +51,8 @@ class CeedSuiteSpec(SuiteSpec):
             Path: Path to source file
         """
         prefix, rest = test.split('-', 1)
+        if prefix == 'rustqfunctions':
+            return (Path('examples') / 'rust-qfunctions' / rest).with_suffix('.c')
         if prefix == 'petsc':
             return (Path('examples') / 'petsc' / rest).with_suffix('.c')
         elif prefix == 'mfem':

jeremylt · 2025-07-31T17:03:21Z

Re: popen() not capturing all output — I think what we have is workable enough for now, so we can defer this to a follow-up issue and work out the details there.

jeremylt · 2025-07-31T17:33:52Z

Oops, the examples/rust-qfunctions folder needs a Makefile.

examples/rust-qfunctions/Makefile

Co-authored-by: Jeremy L Thompson <[email protected]>

jeremylt · 2025-07-31T18:01:01Z

Minor: Makefile changes to clean up properly

diff --git a/Makefile b/Makefile
index 3a03310e..3b63519c 100644
--- a/Makefile
+++ b/Makefile
@@ -915,6 +915,7 @@ cln clean :
        $(call quiet,MAKE) -C examples clean NEK5K_DIR="$(abspath $(NEK5K_DIR))"
        $(call quiet,MAKE) -C python/tests clean
        $(RM) benchmarks/*output.txt
+       $(RM) -f temp_*
 
 distclean : clean
        $(RM) -r doc/html doc/sphinx/build $(CONFIG)
diff --git a/examples/Makefile b/examples/Makefile
index d32f406f..f220748b 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -53,10 +53,12 @@ fluids:
 
 solids:
        make CEED_DIR=$(CEED_DIR) PETSC_DIR=$(PETSC_DIR) PETSC_ARCH=$(PETSC_ARCH) -C solids all
+
 clean:
        +make -C ceed clean
        +make -C mfem clean
        +make -C nek clean
+       +make -C rust-qfunctions clean
        +make -C petsc clean
        +make -C fluids clean
        +make -C solids clean
diff --git a/examples/rust-qfunctions/Makefile b/examples/rust-qfunctions/Makefile
index 32fb5b02..324297c5 100644
--- a/examples/rust-qfunctions/Makefile
+++ b/examples/rust-qfunctions/Makefile
@@ -31,4 +31,5 @@ ex1-volume: ex1-volume.c
 
 clean:
        rm -f *~ $(EXAMPLES)
+       rm -f temp_*
        rm -rf *.dSYM *.TVD.*breakpoints

jeremylt · 2025-07-31T18:41:15Z

Ok, I think we'll have to merge to a staging branch to get CI to run

* Switch compiler to clang (not portable) * test add_num * compile with llvm tools * not working linking * not fixed * Update ex1-volume.h * update * remove global path * changes * changes 2 * crate works * basic gpu rust compilation * still not working * rust source roots basic support * nvrtc/clang selection * cleanup * update example (not working) * add rust example * fix merge issue * delete temp files * cleanup * rust qfunc 2d array (needs doc) * cleanup * more cleanup * downgrade back to c++11 * format * final draft cleanup * formatting + CUDA_CLANG -> GPU_CLANG * Update cuda CEED_QFUNCTION_RUST * fix python * fix python and format * format fr * update comment * fix python fr * Apply error suggestions from code review * update errors to libceed format * Apply suggestions from code review * add optimization flag * remove line breaks * Apply suggestions from code review * avoid python in macro better * add rust example * format * Apply suggestions from code review * move rust example to own directory * Simplify python exclusion logic * re-fix python * Update python/build_ceed_cffi.py * change names and simplify makefile * Revert "change names and simplify makefile" This reverts commit 96e762f. * Apply Jeremy's diff * Simplify CeedCallSystem * use rust-install llvm tools * add gitignores * update paths * example absolute path * fix comments * apply partial diff * add newline * add makefile * Update examples/rust-qfunctions/Makefile * update makefile --------- Co-authored-by: Allen MacFarland <alma4974@noether>

Rust and cuda clang support (#1873)

Allen MacFarland and others added 21 commits June 25, 2025 14:35

Switch compiler to clang (not portable)

6023bab

test add_num

857b77a

compile with llvm tools

909af6b

not working linking

ad100b2

not fixed

ba3b719

Update ex1-volume.h

b71448f

update

d47fb3d

remove global path

24b03c2

changes

b089501

changes 2

111c301

crate works

86b2eb5

basic gpu rust compilation

0b4c4bf

still not working

0c427df

rust source roots basic support

db640a8

nvrtc/clang selection

142f1c0

cleanup

151965a

update example (not working)

789a422

add rust example

8b9b002

Merge branch 'main' into allen-rust-jit

40f43ad

Merge pull request #1 from SirAlienTheGreat/allen-rust-jit

3230820

Allen rust jit

fix merge issue

a6c33a2

jrwrigh reviewed Jul 22, 2025

View reviewed changes

examples/ceed/bruhh.rs Outdated Show resolved Hide resolved

Allen MacFarland added 2 commits July 21, 2025 20:14

delete temp files

321e426

cleanup

19ab554

jeremylt reviewed Jul 22, 2025

View reviewed changes

Makefile Outdated Show resolved Hide resolved

jeremylt reviewed Jul 22, 2025

View reviewed changes

backends/cuda/ceed-cuda-compile.cpp Outdated Show resolved Hide resolved

jeremylt reviewed Jul 22, 2025

View reviewed changes

backends/cuda/ceed-cuda-compile.cpp Show resolved Hide resolved

jeremylt reviewed Jul 22, 2025

View reviewed changes

backends/cuda/ceed-cuda-compile.cpp Outdated Show resolved Hide resolved

Allen MacFarland added 2 commits July 22, 2025 15:55

rust qfunc 2d array (needs doc)

c4fac45

cleanup

07beb34

Allen MacFarland added 2 commits July 30, 2025 12:40

Revert "change names and simplify makefile"

4ea74b2

This reverts commit 96e762f.

Apply Jeremy's diff

91a4609

Allen MacFarland added 3 commits July 30, 2025 13:22

Simplify CeedCallSystem

87760ac

use rust-install llvm tools

ae4cbba

add gitignores

2e43fc9

jeremylt reviewed Jul 30, 2025

View reviewed changes

examples/ceed/Makefile Outdated Show resolved Hide resolved

jeremylt reviewed Jul 30, 2025

View reviewed changes

examples/rust-qfunctions/ex1-volume-rust.c Outdated Show resolved Hide resolved

Allen MacFarland added 3 commits July 31, 2025 09:38

update paths

46392e4

example absolute path

b655e22

fix comments

d147647

jeremylt reviewed Jul 31, 2025

View reviewed changes

examples/rust-qfunctions/ex1-volume.c Outdated Show resolved Hide resolved

SirAlienTheGreat and others added 2 commits July 31, 2025 11:22

apply partial diff

da3ab3c

add newline

8c8bef2

add makefile

af83de8

jeremylt reviewed Jul 31, 2025

View reviewed changes

examples/rust-qfunctions/Makefile Outdated Show resolved Hide resolved

Update examples/rust-qfunctions/Makefile

cd5d2b9

Co-authored-by: Jeremy L Thompson <[email protected]>

update makefile

b2d0003

jeremylt changed the base branch from main to SirAlienTheGreat/rust-qfunctions July 31, 2025 18:42

jeremylt merged commit db0428d into CEED:SirAlienTheGreat/rust-qfunctions Jul 31, 2025
24 checks passed

jeremylt mentioned this pull request Jul 31, 2025

Rust and cuda clang support (#1873) #1881

Merged

jeremylt added a commit that referenced this pull request Aug 8, 2025

Merge pull request #1881 from CEED/SirAlienTheGreat/rust-qfunctions

9b9f611

Rust and cuda clang support (#1873)

Rust and cuda clang support #1873

Rust and cuda clang support #1873

Uh oh!

Conversation

SirAlienTheGreat commented Jul 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

jeremylt commented Jul 30, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jeremylt commented Jul 30, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

jeremylt commented Jul 31, 2025

Uh oh!

jeremylt commented Jul 31, 2025

Uh oh!

jeremylt commented Jul 31, 2025

Uh oh!

Uh oh!

jeremylt commented Jul 31, 2025

Uh oh!

jeremylt commented Jul 31, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

SirAlienTheGreat commented Jul 21, 2025 •

edited

Loading

jeremylt commented Jul 30, 2025 •

edited

Loading

jeremylt commented Jul 30, 2025 •

edited

Loading