diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index 50e099dc..b94e4e59 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 
 #include "Stream.h"
diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index 14ddd752..a7f7b161 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -1,15 +1,37 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
+// Copyright (c) 2020 Tom Deakin, 2025 Bernhard Manfred Gruber
+// University of Bristol HPC, NVIDIA
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
 
+// Thrust fails to compile if the following macro is defined:
+#undef THRUST
+
 #include "ThrustStream.h"
 #include <thrust/inner_product.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/version.h>
 #include <thrust/zip_function.h>
 
+#if defined(MANAGED)
+#include <thrust/universal_vector.h>
+#else
+#include <thrust/device_vector.h>
+#endif
+
+template <typename T>
+using vector =
+#if defined(MANAGED)
+    thrust::universal_vector<T>;
+#else
+    thrust::device_vector<T>;
+#endif
+
+template <typename T>
+struct ThrustStream<T>::Impl{
+  vector<T> a, b, c;
+};
+
 static inline void synchronise()
 {
   // rocThrust doesn't synchronise between thrust calls
@@ -19,9 +41,8 @@ static inline void synchronise()
 }
 
 template <class T>
-ThrustStream<T>::ThrustStream(BenchId bs, const intptr_t array_size, const int device,
-    T initA, T initB, T initC)
-  : array_size{array_size}, a(array_size), b(array_size), c(array_size) {
+ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
+  : array_size{array_size}, impl(new Impl{vector<T>(array_size), vector<T>(array_size), vector<T>(array_size)}) {
   std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
   std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
@@ -50,39 +71,30 @@ ThrustStream<T>::ThrustStream(BenchId bs, const intptr_t array_size, const int d
   init_arrays(initA, initB, initC);
 }
 
+template <class T>
+ThrustStream<T>::~ThrustStream() = default;
+
 template <class T>
 void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  thrust::fill(a.begin(), a.end(), initA);
-  thrust::fill(b.begin(), b.end(), initB);
-  thrust::fill(c.begin(), c.end(), initC);
+  thrust::fill(impl->a.begin(), impl->a.end(), initA);
+  thrust::fill(impl->b.begin(), impl->b.end(), initB);
+  thrust::fill(impl->c.begin(), impl->c.end(), initC);
   synchronise();
 }
 
 template <class T>
-void ThrustStream<T>::get_arrays(T const*& a_, T const*& b_, T const*& c_)
-{
-  #if defined(MANAGED)
-  a_ = &*a.data();
-  b_ = &*b.data();
-  c_ = &*c.data();
-  #else
-  h_a.resize(array_size);
-  h_b.resize(array_size);
-  h_c.resize(array_size);
-  thrust::copy(a.begin(), a.end(), h_a.begin());
-  thrust::copy(b.begin(), b.end(), h_b.begin());
-  thrust::copy(c.begin(), c.end(), h_c.begin());
-  a_ = h_a.data();
-  b_ = h_b.data();
-  c_ = h_c.data();
-  #endif
+void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  thrust::copy(impl->a.begin(), impl->a.end(), h_a.begin());
+  thrust::copy(impl->b.begin(), impl->b.end(), h_b.begin());
+  thrust::copy(impl->c.begin(), impl->c.end(), h_c.begin());
 }
 
 template <class T>
 void ThrustStream<T>::copy()
 {
-  thrust::copy(a.begin(), a.end(),c.begin());
+  thrust::copy(impl->a.begin(), impl->a.end(),impl->c.begin());
   synchronise();
 }
 
@@ -91,9 +103,9 @@ void ThrustStream<T>::mul()
 {
   const T scalar = startScalar;
   thrust::transform(
-    c.begin(),
-    c.end(),
-    b.begin(),
+    impl->c.begin(),
+    impl->c.end(),
+    impl->b.begin(),
     [=] __device__ __host__ (const T &ci){
       return ci * scalar;
     }
@@ -105,10 +117,10 @@ template <class T>
 void ThrustStream<T>::add()
 {
   thrust::transform(
-    a.begin(),
-    a.end(),
-    b.begin(),
-    c.begin(),
+    impl->a.begin(),
+    impl->a.end(),
+    impl->b.begin(),
+    impl->c.begin(),
     [] __device__ __host__ (const T& ai, const T& bi){
       return ai + bi;
     }
@@ -121,10 +133,10 @@ void ThrustStream<T>::triad()
 {
   const T scalar = startScalar;
   thrust::transform(
-    b.begin(),
-    b.end(),
-    c.begin(),
-    a.begin(),
+    impl->b.begin(),
+    impl->b.end(),
+    impl->c.begin(),
+    impl->a.begin(),
     [=] __device__ __host__ (const T& bi, const T& ci){
       return bi + scalar * ci;
     }
@@ -137,9 +149,9 @@ void ThrustStream<T>::nstream()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
-    a.begin(),
+    thrust::make_zip_iterator(impl->a.begin(), impl->b.begin(), impl->c.begin()),
+    thrust::make_zip_iterator(impl->a.end(), impl->b.end(), impl->c.end()),
+    impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
         return ai + bi + scalar * ci;
@@ -151,7 +163,7 @@ void ThrustStream<T>::nstream()
 template <class T>
 T ThrustStream<T>::dot()
 {
-  return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
+  return thrust::inner_product(impl->a.begin(), impl->a.end(), impl->b.begin(), T{});
 }
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
@@ -167,7 +179,7 @@ T ThrustStream<T>::dot()
 # error Unsupported compiler for Thrust
 #endif
 
-void check_error(void)
+void check_error()
 {
   IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
   if (err != IMPL_FN__(Success))
@@ -177,7 +189,7 @@ void check_error(void)
   }
 }
 
-void listDevices(void)
+void listDevices()
 {
   // Get number of devices
   int count;
@@ -225,7 +237,7 @@ std::string getDeviceDriver(const int device)
 
 #else
 
-void listDevices(void)
+void listDevices()
 {
   std::cout << "0: CPU" << std::endl;
 }
diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h
index 676ecaeb..4c4999fe 100644
--- a/src/thrust/ThrustStream.h
+++ b/src/thrust/ThrustStream.h
@@ -1,5 +1,5 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
+// Copyright (c) 2020 Tom Deakin, 2025 Bernhard Manfred Gruber
+// University of Bristol HPC, NVIDIA
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
@@ -8,11 +8,8 @@
 
 #include <iostream>
 #include <stdexcept>
-#if defined(MANAGED)
-#include <thrust/universal_vector.h>
-#else
-#include <thrust/device_vector.h>
-#endif
+#include <memory>
+#include <vector>
 
 #include "Stream.h"
 
@@ -22,20 +19,13 @@ template <class T>
 class ThrustStream : public Stream<T>
 {
   protected:
-    // Size of arrays
+    struct Impl;
+    std::unique_ptr<Impl> impl; // avoid thrust vectors leaking into non-CUDA translation units
     intptr_t array_size;
-    #if defined(MANAGED)
-    thrust::universal_vector<T> a, b, c;
-    #else
-    thrust::device_vector<T> a, b, c;
-    std::vector<T> h_a, h_b, h_c;
-    #endif
-
   public:
-    ThrustStream(BenchId bs, const intptr_t array_size, const int device_id,
-                 T initA, T initB, T initC);
-    ~ThrustStream() = default;
+    ThrustStream(intptr_t array_size, int device);
+    ~ThrustStream();
 
     void copy() override;
     void add() override;
diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake
index 23627c11..f96ea6c5 100644
--- a/src/thrust/model.cmake
+++ b/src/thrust/model.cmake
@@ -1,13 +1,13 @@
 register_flag_optional(THRUST_IMPL
         "Which Thrust implementation to use, supported options include:
-         - CUDA (via https://github.com/NVIDIA/thrust)
+         - CUDA (via https://github.com/NVIDIA/CCCL (CUDA Core Compute Libraries))
          - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
         "
         "CUDA")
 
 register_flag_optional(SDK_DIR
-        "Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)"
+        "Path to the installation prefix for CCCL or Thrust (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/24.5/cuda/12.4/lib64/cmake` for NVHPC, or `/usr/local/cuda-13.0/lib64/cmake` for nvcc, or `/opt/rocm` for ROCm)"
         "")
 
 register_flag_optional(BACKEND
@@ -18,7 +18,8 @@ register_flag_optional(BACKEND
         "
         "CUDA")
 
-register_flag_optional(MANAGED "Enabled managed memory mode." "OFF")
+register_flag_optional(MANAGED "Enabled managed memory mode."
+        "OFF")
 
 register_flag_optional(CMAKE_CUDA_COMPILER
         "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
@@ -33,52 +34,53 @@ register_flag_optional(CUDA_EXTRA_FLAGS
         "[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
         "")
 
+option(FETCH_CCCL "Fetch (download) the CCCL library. This uses CMake's FetchContent feature.
+    Specify version by setting FETCH_CCCL_VERSION" OFF)
+set(FETCH_CCCL_VERSION "v3.1.0" CACHE STRING "Specify version of CCCL to use if FETCH_CCCL is ON")
+
 macro(setup)
-    set(CMAKE_CXX_STANDARD 14)
+    set(CMAKE_CXX_STANDARD 17)
 
     if (MANAGED)
         register_definitions(MANAGED)
     endif ()
 
     if (${THRUST_IMPL} STREQUAL "CUDA")
-
-        # see CUDA.cmake, we're only adding a few Thrust related libraries here
-
         if (POLICY CMP0104)
             cmake_policy(SET CMP0104 NEW)
         endif ()
-
         set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-        # add -forward-unknown-to-host-compiler for compatibility reasons
         set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS})
         enable_language(CUDA)
-        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
-        # appended later
+        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG appended later
         wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
 
         message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
-
-        # XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake`
-        # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
-        # same thing for thrust
+        # append SDK_DIR to help finding CCCL
         if (SDK_DIR)
+            # CMake tries several subdirectories below SDK_DIR, see documentation:
+            # https://cmake.org/cmake/help/latest/command/find_package.html#config-mode-search-procedure
            list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR})
-            find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
-            find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
-        else ()
-            find_package(CUB REQUIRED CONFIG)
-            find_package(Thrust REQUIRED CONFIG)
         endif ()
 
-        message(STATUS "Using Thrust backend: ${BACKEND}")
-
-        # this creates the interface that we can link to
-        thrust_create_target(Thrust${BACKEND}
-                HOST CPP
-                DEVICE ${BACKEND})
-
-        register_link_library(Thrust${BACKEND})
+        # append CUDA Toolkit cmake config dir to help finding CCCL
+        find_package(CUDAToolkit REQUIRED)
+        list(APPEND CMAKE_PREFIX_PATH "${CUDAToolkit_LIBRARY_DIR}/cmake")
+
+        set(CCCL_THRUST_DEVICE_SYSTEM ${BACKEND} CACHE STRING "" FORCE)
+
+        # fetch CCCL if user wants to, otherwise just try to find it
+        if (FETCH_CCCL)
+            FetchContent_Declare(
+                    CCCL
+                    GIT_REPOSITORY https://github.com/nvidia/cccl.git
+                    GIT_TAG "${FETCH_CCCL_VERSION}"
+            )
+            FetchContent_MakeAvailable(CCCL)
+        else()
+            find_package(CCCL CONFIG REQUIRED)
+        endif()
+        register_link_library(CCCL::CCCL)
     elseif (${THRUST_IMPL} STREQUAL "ROCM")
        if (SDK_DIR)
             find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)