
CUDA 12.9 compilation error #5

@fengwang

Description

It seems the nvcc toolchain's behavior keeps changing with every version upgrade. I tried to compile test_gpt2cu with the following CUDA version:

$ nvcc --version                                                                                                                                                                                                         
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Tue_May_27_02:21:03_PDT_2025
Cuda compilation tools, release 12.9, V12.9.86
Build cuda_12.9.r12.9/compiler.36037853_0

And I encountered the following errors:

make test_gpt2cu
NICE Compiling with OpenMP support
nvcc -O3 --use_fast_math test_gpt2.cu -lcublas -lcublasLt -o test_gpt2cu
nvcc warning : Support for offline compilation for architectures prior to '<compute/sm/lto>_75' will be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
train_gpt2.cu(795): error: __host__ or __device__ annotation on lambda requires --extended-lambda nvcc flag
      (cudaCheck(cub::DeviceFor::Bulk(B * T * C, [=] __attribute__((device))(int idx) { auto [b, t, c] = i2n(idx, C, T); out_md(b, t, c) = wte_md(inp_md(b, t), c) + wpe_md(t, c); }), "train_gpt2.cu", 795))
                                                                            ^

train_gpt2.cu(795): error: calling a __device__ function("_Z3i2n1?1?1?") from a __host__ function("operator()") is not allowed
      (cudaCheck(cub::DeviceFor::Bulk(B * T * C, [=] __attribute__((device))(int idx) { auto [b, t, c] = i2n(idx, C, T); out_md(b, t, c) = wte_md(inp_md(b, t), c) + wpe_md(t, c); }), "train_gpt2.cu", 795))
                                                                                                         ^

train_gpt2.cu(945): error: __host__ or __device__ annotation on lambda requires --extended-lambda nvcc flag
                       [=] __attribute__((device))(int idx) {
                                                  ^

train_gpt2.cu(971): error: __host__ or __device__ annotation on lambda requires --extended-lambda nvcc flag
          thrust::make_counting_iterator(0), [=] __attribute__((host)) __attribute__((device))(int idx) {
                                                                                              ^

train_gpt2.cu(986): error: __host__ or __device__ annotation on lambda requires --extended-lambda nvcc flag
      thrust::transform(thrust::cuda::par_nosync, inp, inp + N, out, [] __attribute__((device))(float xi) {
                                                                                               ^

test_gpt2.cu(51): error: initial value of reference to non-const must be an lvalue
      gpt2_build_from_checkpoint(&model, "gpt2_124M.bin");
                                 ^

test_gpt2.cu(73): error: too many arguments in function call
      float* expected_grads_memory = malloc_and_point_parameters(&expected_grads, model.param_sizes, 0);
                                                                                                     ^

test_gpt2.cu(73): error: no suitable conversion function from "thrust::THRUST_200802_SM_520_NS::device_vector<float, thrust::THRUST_200802_SM_520_NS::device_allocator<float>>" to "float *" exists
      float* expected_grads_memory = malloc_and_point_parameters(&expected_grads, model.param_sizes, 0);
                                     ^

test_gpt2.cu(74): error: too many arguments in function call
      float* calculated_grads_memory = malloc_and_point_parameters(&calculated_grads, model.param_sizes, 0);
                                                                                                         ^

test_gpt2.cu(74): error: no suitable conversion function from "thrust::THRUST_200802_SM_520_NS::device_vector<float, thrust::THRUST_200802_SM_520_NS::device_allocator<float>>" to "float *" exists
      float* calculated_grads_memory = malloc_and_point_parameters(&calculated_grads, model.param_sizes, 0);
                                       ^

test_gpt2.cu(94): error: initial value of reference to non-const must be an lvalue
      gpt2_forward(&model, x, 
                   ^

test_gpt2.cu(119): error: initial value of reference to non-const must be an lvalue
          gpt2_forward(&model, x, y, B, T);
                       ^

test_gpt2.cu(120): error: initial value of reference to non-const must be an lvalue
          gpt2_zero_grad(&model);
                         ^

test_gpt2.cu(121): error: initial value of reference to non-const must be an lvalue
          gpt2_backward(&model);
                        ^

test_gpt2.cu(175): error: no suitable conversion function from "thrust::THRUST_200802_SM_520_NS::device_vector<float, thrust::THRUST_200802_SM_520_NS::device_allocator<float>>" to "const void *" exists
              cudaMemcpy(calculated_grads_memory, model.grads_memory, model.num_parameters * sizeof(float), cudaMemcpyDeviceToHost);
                                                  ^

test_gpt2.cu(220): error: identifier "gpt2_free" is undefined
      gpt2_free(&model);
      ^

16 errors detected in the compilation of "test_gpt2.cu".
make: *** [Makefile:59: test_gpt2cu] Error 2
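For reference, the first group of errors (train_gpt2.cu lines 795, 945, 971, and 986) is the one nvcc itself diagnoses: __device__ and __host__ __device__ annotations on lambdas are only accepted when nvcc is invoked with --extended-lambda. A minimal sketch of the fix, assuming the compile line shown in the log above (the one run from Makefile:59) is the one that needs the flag:

    nvcc -O3 --use_fast_math --extended-lambda test_gpt2.cu -lcublas -lcublasLt -o test_gpt2cu

The -Wno-deprecated-gpu-targets flag mentioned in the warning is independent of this; it only silences the pre-sm_75 deprecation notice.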
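The remaining errors all come from test_gpt2.cu and look like API drift against the refactored train_gpt2.cu rather than a toolchain change: the "initial value of reference to non-const must be an lvalue" messages suggest gpt2_build_from_checkpoint, gpt2_forward, gpt2_zero_grad, and gpt2_backward now take the model by reference instead of by pointer; malloc_and_point_parameters has a different arity; gpt2_free no longer exists; and model.grads_memory is now a thrust::device_vector<float>, which does not convert implicitly to a raw pointer. A minimal sketch of the two mechanical fixes, with names taken from the log and the by-reference signatures assumed rather than confirmed:

    // A thrust::device_vector<float> hands cudaMemcpy a raw device pointer
    // via thrust::raw_pointer_cast (available through <thrust/device_vector.h>):
    cudaMemcpy(calculated_grads_memory,
               thrust::raw_pointer_cast(model.grads_memory.data()),
               model.num_parameters * sizeof(float),
               cudaMemcpyDeviceToHost);

    // If the signatures did change from GPT2* to GPT2&, the lvalue errors
    // go away once the call sites drop the address-of operator:
    gpt2_forward(model, x, y, B, T);
    gpt2_zero_grad(model);
    gpt2_backward(model);

The malloc_and_point_parameters and gpt2_free errors would need the updated declarations from train_gpt2.cu to resolve, so no fix is sketched for them here.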
