Commit d166674

Cuda13 linux nvshmem (PaddlePaddle#75557)

* nvshmem cuda13
* cuda13
* template bypass

1 parent a9af5ed · commit d166674

3 files changed: 344 additions & 1 deletion

File tree

.github/workflows/CheckPRTemplate.yml
cmake/external/nvshmem.cmake
patches/nvshmem/nvshmem_cuda13.patch

.github/workflows/CheckPRTemplate.yml (8 additions, 0 deletions)

@@ -16,7 +16,15 @@ jobs:
       - name: Clone paddle
         uses: actions/checkout@v4
 
+      - name: Check bypass
+        id: check-bypass
+        uses: ./.github/actions/check-bypass
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          workflow-name: template
+
       - name: Check PR Template
+        if: steps.check-bypass.outputs.can-skip != 'true'
         env:
           AGILE_PULL_ID: ${{ github.event.pull_request.number }}
           AGILE_COMPILE_BRANCH: ${{ github.base_ref }}

cmake/external/nvshmem.cmake (6 additions, 1 deletion)

@@ -53,7 +53,12 @@ else()
     extern_nvshmem)
 endif()
 
-set(NVSHMEM_PATCH_PATH ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem.patch)
+if(CUDA_VERSION VERSION_GREATER_EQUAL 13)
+  set(NVSHMEM_PATCH_PATH
+      ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem_cuda13.patch)
+else()
+  set(NVSHMEM_PATCH_PATH ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem.patch)
+endif()
 set(NVSHMEM_PATCH_COMMAND
     git init && git config --global --add safe.directory ${NVSHMEM_SOURCE_DIR}
     && git config user.name "PaddlePaddle" && git config user.email
patches/nvshmem/nvshmem_cuda13.patch (new file, 330 additions, 0 deletions)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cba899b..88f291d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -213,8 +213,8 @@ set_target_properties(nvshmem nvshmem_host
                       PROPERTIES POSITION_INDEPENDENT_CODE ON
                       CXX_STANDARD_REQUIRED ON
                       CUDA_STANDARD_REQUIRED ON
-                      CXX_STANDARD 11
-                      CUDA_STANDARD 11
+                      CXX_STANDARD 17
+                      CUDA_STANDARD 17
                       CUDA_SEPARABLE_COMPILATION ON
                       LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/lib"
                       ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/lib"
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index 8b8a263..080a8fe 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -46,6 +46,8 @@
         qp_man.tx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID;  \
         qp_man.tx_wq.get_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID;  \
         qp_man.tx_wq.get_tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID;  \
+        qp_man.rx_wq.resv_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
+        qp_man.rx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID;  \
         qp_man.ibuf.head = NVSHMEMI_IBGDA_ULSCALAR_INVALID;       \
         qp_man.ibuf.tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID;       \
     } while (0);
@@ -168,14 +170,18 @@ typedef struct {
         uint64_t get_head;  // last wqe idx + 1 with a "fetch" operation (g, get, amo_fetch)
         uint64_t get_tail;  // last wqe idx + 1 polled with cst; get_tail > get_head is possible
     } tx_wq;
+    struct {
+        uint64_t resv_head;  // last reserved wqe idx + 1
+        uint64_t cons_idx;   // polled wqe idx + 1 (consumer index + 1)
+    } rx_wq;
     struct {
         uint64_t head;
         uint64_t tail;
     } ibuf;
     char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
 } __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 96,
-              "ibgda_device_qp_management_v1 must be 96 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 112,
+              "ibgda_device_qp_management_v1 must be 112 bytes.");
 
 typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
 
@@ -199,9 +205,19 @@ typedef struct nvshmemi_ibgda_device_qp {
         // May point to mvars.prod_idx or internal prod_idx
         uint64_t *prod_idx;
     } tx_wq;
+    struct {
+        uint16_t nwqes;
+        uint64_t tail;
+        void *wqe;
+        __be32 *dbrec;
+        void *bf;
+        nvshmemi_ibgda_device_cq_t *cq;
+        // May point to mvars.prod_idx or internal prod_idx
+        uint64_t *prod_idx;
+    } rx_wq;
     nvshmemi_ibgda_device_qp_management_v1 mvars;  // management variables
 } nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 184, "ibgda_device_qp_v1 must be 184 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 256, "ibgda_device_qp_v1 must be 256 bytes.");
 
 typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
 
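The updated static_assert values line up with the added fields: the two new rx_wq counters add 16 bytes to the management struct (96 + 16 = 112), and the device QP struct grows by that 16 plus 56 bytes for its own rx_wq block (184 + 56 + 16 = 256). A standalone sketch of that arithmetic on an LP64 target (stand-in types only, not the real header; __be32 replaced by uint32_t):

// Size check mirroring only the *new* members; the real structs carry more fields.
#include <cstdint>

struct rx_wq_mgmt {      // added to nvshmemi_ibgda_device_qp_management_v1
    uint64_t resv_head;
    uint64_t cons_idx;
};
static_assert(sizeof(rx_wq_mgmt) == 16, "96 + 16 == 112");

struct rx_wq_qp {        // added to nvshmemi_ibgda_device_qp_v1
    uint16_t nwqes;      // padded to 8 bytes before the uint64_t below
    uint64_t tail;
    void *wqe;
    uint32_t *dbrec;     // __be32 in the source; a pointer either way
    void *bf;
    void *cq;            // nvshmemi_ibgda_device_cq_t * in the source
    uint64_t *prod_idx;
};
static_assert(sizeof(rx_wq_qp) == 56, "184 + 56 + 16 (mvars growth) == 256");
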
diff --git a/src/modules/transport/common/transport_ib_common.cpp b/src/modules/transport/common/transport_ib_common.cpp
index c89f408..f99018a 100644
--- a/src/modules/transport/common/transport_ib_common.cpp
+++ b/src/modules/transport/common/transport_ib_common.cpp
@@ -26,6 +26,9 @@ int nvshmemt_ib_common_nv_peer_mem_available() {
     if (access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == 0) {
         return NVSHMEMX_SUCCESS;
     }
+    if (access("/sys/module/nvidia_peermem/version", F_OK) == 0) {
+        return NVSHMEMX_SUCCESS;
+    }
 
     return NVSHMEMX_ERROR_INTERNAL;
 }
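For reference, the patched probe simply accepts either sysfs path; a condensed, standalone restatement (plain bool in place of the NVSHMEMX_* status codes):

// nvidia-peermem may be reported under the legacy memory_peers node or,
// presumably with newer driver packagings, as a plain kernel module.
#include <unistd.h>

static bool nv_peer_mem_available() {
    return access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == 0 ||
           access("/sys/module/nvidia_peermem/version", F_OK) == 0;
}
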
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index ef325cd..bc339c5 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -198,6 +198,7 @@ struct ibgda_ep {
     off_t dbr_offset;
 
     struct ibgda_cq *send_cq;
+    struct ibgda_cq *recv_cq;
     struct ibv_ah *ah;
 
     uint32_t user_index;
@@ -1066,7 +1067,7 @@ static inline void ibgda_nic_control_free(struct ibgda_mem_object *mobject) {
     ibgda_host_mem_free(mobject);
 }
 
-static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device) {
+static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device, int cc = 1) {
     int status = 0;
 
     struct ibgda_cq *gcq = NULL;
@@ -1117,7 +1118,7 @@ static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device)
     cq_context = DEVX_ADDR_OF(create_cq_in, cmd_in, cq_context);
     DEVX_SET(cqc, cq_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE);
     DEVX_SET(cqc, cq_context, cqe_sz, MLX5_CQE_SIZE_64B);
-    DEVX_SET(cqc, cq_context, cc, 0x1);  // Use collapsed CQ
+    DEVX_SET(cqc, cq_context, cc, cc);   // Use collapsed CQ
     DEVX_SET(cqc, cq_context, oi, 0x1);  // Allow overrun
     DEVX_SET(cqc, cq_context, dbr_umem_id, dbr_umem->umem_id);
     DEVX_SET(cqc, cq_context, log_cq_size, IBGDA_ILOG2_OR0(num_cqe));
@@ -1538,7 +1539,8 @@ static int ibgda_create_cq_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
 
     struct ibv_context *context = device->context;
 
-    unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes;
+    // Each RC qp has one send CQ and one recv CQ.
+    unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes * 2;
 
     assert(ibgda_qp_depth > 0);
     size_t num_cqe = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
@@ -1701,7 +1703,8 @@ static int ibgda_create_qp_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
     }
 
     // Allocate and map WQ buffer for all QPs.
-    wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB;  // num_wqebb is always a power of 2
+    // Todo: reduce the size of wq buffer.
+    wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB * 2;  // num_wqebb is always a power of 2
     wq_buf_size = wq_buf_size_per_qp * num_eps;
     status = ibgda_nic_control_alloc(&wq_mobject, wq_buf_size, IBGDA_GPAGE_SIZE);
     NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "cannot allocate wq buf.\n");
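Worked numbers for the two shared-object changes above (all sizes are placeholders; the real values come from the device configuration): with 8 DCIs, 2 RC QPs per PE, and 4 PEs, the CQ count rises from 8 + 8 = 16 to 8 + 16 = 24, one extra recv CQ per RC QP, and each QP's WQ slice doubles so it can hold a receive queue alongside the send queue:

// Illustrative arithmetic only; dci_num_eps etc. are made-up stand-ins.
#include <cassert>
#include <cstddef>

int main() {
    const unsigned dci_num_eps = 8, rc_num_eps_per_pe = 2, n_pes = 4;
    unsigned num_cqs_before = dci_num_eps + rc_num_eps_per_pe * n_pes;     // 16
    unsigned num_cqs_after = dci_num_eps + rc_num_eps_per_pe * n_pes * 2;  // 24
    assert(num_cqs_after - num_cqs_before == rc_num_eps_per_pe * n_pes);

    const size_t num_wqebb = 1024, send_wqe_bb = 64;  // MLX5_SEND_WQE_BB is 64 B
    size_t wq_buf_size_per_qp = num_wqebb * send_wqe_bb * 2;  // doubled by the patch
    assert(wq_buf_size_per_qp == 128 * 1024);
    return 0;
}
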
@@ -1882,8 +1885,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
     int cqe_version = 0;
 
     struct ibgda_cq *send_cq = NULL;
+    struct ibgda_cq *recv_cq = NULL;
 
     size_t num_wqebb = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
+    size_t num_recv_wqe = ibgda_qp_depth;
+    size_t recv_wqe_size = 16;
 
     int status = 0;
 
@@ -1911,6 +1917,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
     status = ibgda_create_cq(&send_cq, device);
     NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
 
+    if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+        status = ibgda_create_cq(&recv_cq, device);
+        NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
+    }
+
     ep = (struct ibgda_ep *)calloc(1, sizeof(struct ibgda_ep));
     NVSHMEMI_NULL_ERROR_JMP(ep, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out,
                             "Unable to allocate mem for ep.\n");
@@ -1939,12 +1950,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
     DEVX_SET(qpc, qp_context, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
     DEVX_SET(qpc, qp_context, pd, device->qp_shared_object.pdn);
     DEVX_SET(qpc, qp_context, uar_page, uar_mobject->uar->page_id);  // BF register
-    DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE);  // Shared Receive Queue
-    DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
     DEVX_SET(qpc, qp_context, cqn_snd, send_cq->cqn);
-    DEVX_SET(qpc, qp_context, cqn_rcv, device->qp_shared_object.rcqn);
+    DEVX_SET(qpc, qp_context, cqn_rcv, qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC ? recv_cq->cqn : device->qp_shared_object.rcqn);
     DEVX_SET(qpc, qp_context, log_sq_size, IBGDA_ILOG2_OR0(num_wqebb));
-    DEVX_SET(qpc, qp_context, log_rq_size, 0);
     DEVX_SET(qpc, qp_context, cs_req, 0);  // Disable CS Request
     DEVX_SET(qpc, qp_context, cs_res, 0);  // Disable CS Response
     DEVX_SET(qpc, qp_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE);  // Enable dbr_umem_id
@@ -1953,6 +1961,15 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
     DEVX_SET(qpc, qp_context, dbr_umem_id, dbr_umem->umem_id);  // DBR buffer
     DEVX_SET(qpc, qp_context, user_index, qp_idx);
     DEVX_SET(qpc, qp_context, page_offset, 0);
+    if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+        DEVX_SET(qpc, qp_context, rq_type, 0);  // Regular recv queue
+        DEVX_SET(qpc, qp_context, log_rq_size, IBGDA_ILOG2(num_recv_wqe));  // 4 wqe
+        DEVX_SET(qpc, qp_context, log_rq_stride, IBGDA_ILOG2(recv_wqe_size) - 4);  // max recv wqe size = 16B
+    } else {
+        DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE);  // Shared Receive Queue, DC must use this.
+        DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
+        DEVX_SET(qpc, qp_context, log_rq_size, 0);
+    }
 
     ep->devx_qp = mlx5dv_devx_obj_create(context, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out));
     NVSHMEMI_NULL_ERROR_JMP(ep->devx_qp, status, NVSHMEMX_ERROR_INTERNAL, out,
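A note on the RQ geometry set above: mlx5 encodes log_rq_stride relative to a 16-byte base unit (stride = 16 * 2^log_rq_stride), so a 16-byte recv WQE yields stride 0, and log_rq_size is just log2 of the WQE count. A quick standalone check (IBGDA_ILOG2 replaced by a local helper; the depth is a placeholder):

#include <cassert>

static unsigned ilog2(unsigned long long v) {  // stand-in for IBGDA_ILOG2
    unsigned r = 0;
    while (v >>= 1) ++r;
    return r;
}

int main() {
    const unsigned recv_wqe_size = 16;  // bytes, as in ibgda_create_qp
    const unsigned num_recv_wqe = 128;  // placeholder for ibgda_qp_depth
    assert(ilog2(recv_wqe_size) - 4 == 0);  // log_rq_stride: 16 B is the base unit
    assert(ilog2(num_recv_wqe) == 7);       // log_rq_size
    return 0;
}
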
@@ -1962,9 +1979,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
     ep->portid = portid;
 
     ep->sq_cnt = num_wqebb;
-    ep->sq_buf_offset = 0;
+    ep->sq_buf_offset = num_recv_wqe * recv_wqe_size;
 
-    ep->rq_cnt = 0;
+    ep->rq_cnt = num_recv_wqe;
     ep->rq_buf_offset = 0;
 
     ep->wq_mobject = device->qp_shared_object.wq_mobject;
@@ -1978,6 +1995,7 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
     ep->uar_mobject = uar_mobject;
 
     ep->send_cq = send_cq;
+    ep->recv_cq = recv_cq;
 
     ep->qp_type = qp_type;
 
@@ -1989,6 +2007,7 @@ out:
     if (status) {
         if (uar_mobject) ibgda_unmap_and_free_qp_uar(uar_mobject);
         if (send_cq) ibgda_destroy_cq(send_cq);
+        if (recv_cq) ibgda_destroy_cq(recv_cq);
         if (ep) free(ep);
     }
 
@@ -2287,6 +2306,10 @@ static int ibgda_destroy_ep(struct ibgda_ep *ep) {
         ibgda_destroy_cq(ep->send_cq);
     }
 
+    if (ep->recv_cq) {
+        ibgda_destroy_cq(ep->recv_cq);
+    }
+
     if (ep->ah) {
         ftable.destroy_ah(ep->ah);
     }
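The offsets above imply that each QP's WQ slice is laid out as [recv queue | send queue]: rq_buf_offset stays 0 while sq_buf_offset skips past the RQ. A minimal sketch of that bookkeeping (placeholder depth):

#include <cassert>
#include <cstddef>

int main() {
    const size_t ibgda_qp_depth = 128;           // placeholder
    const size_t num_recv_wqe = ibgda_qp_depth;  // as in ibgda_create_qp
    const size_t recv_wqe_size = 16;

    size_t rq_buf_offset = 0;                             // ep->rq_buf_offset
    size_t sq_buf_offset = num_recv_wqe * recv_wqe_size;  // ep->sq_buf_offset
    assert(rq_buf_offset == 0 && sq_buf_offset == 2048);
    // ibgda_get_device_qp later adds sq_buf_offset to tx_wq.wqe and
    // rq_buf_offset to rx_wq.wqe, on top of the per-QP wq_offset.
    return 0;
}
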
@@ -2318,7 +2341,7 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
     dev_qp->qpn = ep->qpn;
 
     assert(ep->wq_mobject->has_gpu_mapping);
-    dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset);
+    dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->sq_buf_offset);
 
     if (ibgda_nic_handler == IBGDA_NIC_HANDLER_GPU) {
         assert(ep->dbr_mobject->has_gpu_mapping);
@@ -2330,6 +2353,12 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
     }
 
     dev_qp->tx_wq.nwqes = ep->sq_cnt;
+    if (ep->qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+        dev_qp->rx_wq.nwqes = ep->rq_cnt;
+        dev_qp->rx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->rq_buf_offset);
+        dev_qp->rx_wq.dbrec = (__be32 *)((uintptr_t)ep->dbr_mobject->aligned.gpu_ptr + ep->dbr_offset);
+        dev_qp->rx_wq.bf = (void *)ep->uar_mobject->aligned.gpu_ptr;
+    }
 
     ibuf_dci_start = (uintptr_t)device->qp_shared_object.internal_buf.mem_object->aligned.gpu_ptr;
     ibuf_rc_start = ibuf_dci_start + (size_per_dci * device->dci.num_eps);
@@ -2379,6 +2408,9 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
     nvshmemi_ibgda_device_cq_t *cq_d = NULL;
     nvshmemi_ibgda_device_cq_t *cq_h = NULL;
 
+    nvshmemi_ibgda_device_cq_t *recv_cq_d = NULL;
+    nvshmemi_ibgda_device_cq_t *recv_cq_h = NULL;
+
     uint8_t *qp_group_switches_d = NULL;
 
     const size_t mvars_offset = offsetof(nvshmemi_ibgda_device_qp_t, mvars);
@@ -2386,6 +2418,8 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
     const size_t cons_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.cons_idx);
     const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
     const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
+    const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
+    const size_t rx_cons_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.cons_idx);
 
     nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
     nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
@@ -2421,7 +2455,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
         num_dct_handles += device->dct.num_eps * n_pes;
         num_dci_handles += device->dci.num_eps;
         num_rc_handles += device->rc.num_eps_per_pe * n_pes;
-        num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1));
+        num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1) * 2);
         num_shared_dci_handles += device->dci.num_shared_eps;
     }
     assert(num_dci_handles - num_shared_dci_handles >= 0);
@@ -2456,6 +2490,10 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
     for (int i = 0; i < num_cq_handles; i++) {
         nvshmemi_init_ibgda_device_cq(cq_h[i]);
     }
+
+    recv_cq_h = (nvshmemi_ibgda_device_cq_t *)calloc(1, sizeof(*recv_cq_h));
+    NVSHMEMI_NULL_ERROR_JMP(recv_cq_h, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out, "recv_cq calloc err.");
+    nvshmemi_init_ibgda_device_cq(recv_cq_h[0]);
     /* allocate host memory for dct, rc, cq, dci end */
 
     /* allocate device memory for dct, rc, cq, dci start */
@@ -2559,6 +2597,15 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
             }
 
             ++cq_idx;
+
+            rc_h[arr_idx].rx_wq.cq = &cq_d[cq_idx];
+
+            ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
+            cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+            cq_h[cq_idx].cons_idx = (uint64_t *)(base_mvars_d_addr + rx_cons_offset);
+            cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
+            cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
+            ++cq_idx;
         }
     }
 }
@@ -2936,17 +2983,20 @@ int nvshmemt_ibgda_connect_endpoints(nvshmem_transport_t t, int *selected_dev_id
     INFO(ibgda_state->log_level, "Creating %d RC QPs", device->rc.num_eps_per_pe);
     for (int i = 0; i < num_rc_eps; ++i) {
         // Do not create loopback to self
-        if (i / device->rc.num_eps_per_pe == mype) {
+        int dst_pe = (i + 1 + mype) % n_pes;
+        int offset = i / n_pes;
+        int mapped_i = dst_pe * device->rc.num_eps_per_pe + offset;
+        if (dst_pe == mype) {
             continue;
         }
-        status = ibgda_create_qp(&device->rc.eps[i], device, portid, i,
+        status = ibgda_create_qp(&device->rc.eps[mapped_i], device, portid, mapped_i,
                                  NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC);
         NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
-                              "ibgda_create_dci failed on RC #%d.", i);
+                              "ibgda_create_dci failed on RC #%d.", mapped_i);
 
-        status = ibgda_get_rc_handle(&local_rc_handles[i], device->rc.eps[i], device);
+        status = ibgda_get_rc_handle(&local_rc_handles[mapped_i], device->rc.eps[mapped_i], device);
         NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
-                              "ibgda_get_rc_handle failed on RC #%d.", i);
+                              "ibgda_get_rc_handle failed on RC #%d.", mapped_i);
     }
 
     if (num_rc_eps) {
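The remapped loop changes both the iteration order and the slot each RC QP occupies: iteration i targets dst_pe = (i + 1 + mype) % n_pes, starting from the next PE instead of PE 0, and the QP is stored at dst_pe * num_eps_per_pe + offset, so the eps array stays grouped by destination PE while creation round-robins across peers. A small standalone sketch (hypothetical sizes) that prints the mapping:

#include <cstdio>

int main() {
    const int n_pes = 4, mype = 1, num_eps_per_pe = 2;  // placeholder sizes
    const int num_rc_eps = num_eps_per_pe * n_pes;
    for (int i = 0; i < num_rc_eps; ++i) {
        int dst_pe = (i + 1 + mype) % n_pes;
        int offset = i / n_pes;
        int mapped_i = dst_pe * num_eps_per_pe + offset;
        if (dst_pe == mype) continue;  // still no loopback QP to self
        printf("i=%d -> dst_pe=%d slot=%d\n", i, dst_pe, mapped_i);
    }
    // Every PE other than mype ends up owning slots [pe*2, pe*2+1].
    return 0;
}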
