Skip to content

Commit 4299389

Browse files
committed
prov/efa: Fix use-after-free in DC packet completion handling
DC packets had a race condition where RECEIPT acknowledgments could arrive before all TX completions, causing premature txe release and use-after-free errors when remaining TX completions accessed the freed memory. Fix by using efa_outstanding_tx_ops for completion tracking: - DC packets use efa_outstanding_tx_ops to track TX completion status - RECEIPT handler writes completion immediately (preserves DC semantics) - txe release delayed until both TX completions and receipt received - Added EFA_RDM_TXE_RECEIPT_RECEIVED flag to track receipt arrival - Added efa_rdm_txe_dc_ready_for_release() with proper documentation This prevents the race condition while maintaining existing DC completion behavior for applications. Signed-off-by: Shi Jin <sjina@amazon.com>
1 parent 765272e commit 4299389

File tree

7 files changed

+152
-17
lines changed

7 files changed

+152
-17
lines changed

prov/efa/src/rdm/efa_rdm_ope.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1971,7 +1971,7 @@ ssize_t efa_rdm_ope_repost_ope_queued_before_handshake(struct efa_rdm_ope *ope)
19711971
}
19721972
}
19731973

1974-
int efa_rdm_ope_process_queued_ope(struct efa_rdm_ope *ope, uint16_t flag)
1974+
int efa_rdm_ope_process_queued_ope(struct efa_rdm_ope *ope, uint32_t flag)
19751975
{
19761976
int ret = 0;
19771977

prov/efa/src/rdm/efa_rdm_ope.h

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ struct efa_rdm_ope {
103103
* This flag is different from #cq_entry.flags, which is
104104
* applied to CQ entry's returned to user.
105105
*/
106-
uint16_t internal_flags;
106+
uint32_t internal_flags;
107107

108108
size_t iov_count;
109109
struct iovec iov[EFA_RDM_IOV_LIMIT];
@@ -281,13 +281,18 @@ void efa_rdm_rxe_release_internal(struct efa_rdm_ope *rxe);
281281
* @brief flag to indicate that the ope was created
282282
* for internal operations, so it should not generate
283283
* any cq entry or err entry.
284-
* NOTICE: the ope->internal_flags is uint16_t, so
285-
* to introduce more bits for internal flags, the
286-
* internal_flags needs to be changed to uint32_t
287-
* or larger.
288284
*/
289285
#define EFA_RDM_OPE_INTERNAL BIT_ULL(15)
290286

287+
/**
288+
* @brief flag to indicate that a DC txe has received its receipt packet
289+
*
290+
* This flag is used to track when a delivery complete operation has
291+
* received acknowledgment from the receiver, preventing premature
292+
* completion before all TX operations finish.
293+
*/
294+
#define EFA_RDM_TXE_RECEIPT_RECEIVED BIT_ULL(16)
295+
291296
#define EFA_RDM_OPE_QUEUED_FLAGS (EFA_RDM_OPE_QUEUED_RNR | EFA_RDM_OPE_QUEUED_CTRL | EFA_RDM_OPE_QUEUED_READ | EFA_RDM_OPE_QUEUED_BEFORE_HANDSHAKE)
292297

293298
void efa_rdm_ope_try_fill_desc(struct efa_rdm_ope *ope, int mr_iov_start, uint64_t access);
@@ -311,6 +316,27 @@ void efa_rdm_ope_handle_recv_completed(struct efa_rdm_ope *ope);
311316

312317
void efa_rdm_ope_handle_send_completed(struct efa_rdm_ope *ope);
313318

319+
/**
320+
* @brief Check if a delivery complete (DC) TXE is ready for release
321+
*
322+
* @details
323+
* For DC packets, this function prevents use-after-free race conditions by
324+
* ensuring the TXE is only released when both conditions are met:
325+
* 1. All TX operations have completed (efa_outstanding_tx_ops == 0)
326+
* 2. Receipt packet has been received (EFA_RDM_TXE_RECEIPT_RECEIVED flag set)
327+
*
328+
* This dual-condition check ensures proper synchronization between send
329+
* completions and receipt acknowledgments in the delivery complete protocol.
330+
*
331+
* @param[in] txe TX operation entry to check
332+
* @return true if TXE is ready for release, false otherwise
333+
*/
334+
static inline bool efa_rdm_txe_dc_ready_for_release(struct efa_rdm_ope *txe)
335+
{
336+
return (txe->efa_outstanding_tx_ops == 0) &&
337+
(txe->internal_flags & EFA_RDM_TXE_RECEIPT_RECEIVED);
338+
}
339+
314340
int efa_rdm_ope_prepare_to_post_read(struct efa_rdm_ope *ope);
315341

316342
void efa_rdm_ope_prepare_to_post_write(struct efa_rdm_ope *ope);
@@ -342,6 +368,6 @@ ssize_t efa_rdm_ope_repost_ope_queued_before_handshake(struct efa_rdm_ope *ope);
342368

343369
ssize_t efa_rdm_txe_prepare_local_read_pkt_entry(struct efa_rdm_ope *txe);
344370

345-
int efa_rdm_ope_process_queued_ope(struct efa_rdm_ope *ope, uint16_t flag);
371+
int efa_rdm_ope_process_queued_ope(struct efa_rdm_ope *ope, uint32_t flag);
346372

347373
#endif

prov/efa/src/rdm/efa_rdm_pke_cmd.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -646,16 +646,16 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry)
646646
case EFA_RDM_DC_LONGCTS_MSGRTM_PKT:
647647
case EFA_RDM_DC_LONGCTS_TAGRTM_PKT:
648648
case EFA_RDM_DC_LONGCTS_RTW_PKT:
649-
/* no action to be taken here
650-
* For non-dc version of these packet types,
651-
* this is the place to increase bytes_acked or
652-
* write tx completion.
653-
* For dc, tx completion will always be
654-
* written upon receving the receipt packet
655-
* Moreoever, because receipt can arrive
656-
* before send completion, we cannot take
657-
* any action on txe here.
649+
/* For DC packets, use efa_outstanding_tx_ops to track TX completions
650+
* instead of bytes_acked to avoid issues with unset payload_size.
651+
* Note: efa_rdm_ep_record_tx_op_completed() above decrements efa_outstanding_tx_ops,
652+
* so this check must come after that call.
653+
* Only release TXE when both TX ops complete and receipt is received.
658654
*/
655+
assert(pkt_entry->ope);
656+
if (efa_rdm_txe_dc_ready_for_release(pkt_entry->ope))
657+
efa_rdm_txe_release(pkt_entry->ope);
658+
break;
659659
case EFA_RDM_READ_NACK_PKT:
660660
/* no action needed for NACK packet */
661661
break;

prov/efa/src/rdm/efa_rdm_pke_nonreq.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -783,7 +783,16 @@ void efa_rdm_pke_handle_receipt_recv(struct efa_rdm_pke *pkt_entry)
783783
return;
784784
}
785785

786-
efa_rdm_ope_handle_send_completed(txe);
786+
/* Write send completion immediately to preserve DC semantics */
787+
efa_rdm_txe_report_completion(txe);
788+
789+
/* Set receipt received flag for DC operations */
790+
txe->internal_flags |= EFA_RDM_TXE_RECEIPT_RECEIVED;
791+
792+
/* Only release txe if both conditions are met */
793+
if (efa_rdm_txe_dc_ready_for_release(txe))
794+
efa_rdm_txe_release(txe);
795+
787796
efa_rdm_pke_release_rx(pkt_entry);
788797
}
789798

prov/efa/test/efa_unit_test_ope.c

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
33

44
#include "efa_unit_tests.h"
5+
#include "rdm/efa_rdm_pke_cmd.h"
6+
#include "rdm/efa_rdm_pke_nonreq.h"
57

68
typedef void (*efa_rdm_ope_handle_error_func_t)(struct efa_rdm_ope *ope, int err, int prov_errno);
79

@@ -1176,3 +1178,97 @@ void test_efa_rdm_atomic_compare_desc_persistence(struct efa_resource **state)
11761178
efa_unit_test_buff_destruct(&result_buff);
11771179
efa_unit_test_buff_destruct(&compare_buff);
11781180
}
1181+
/**
1182+
* @brief Common helper for DC packet TXE release testing
1183+
*
1184+
* Sets up test environment and packet entries for DC packet testing.
1185+
* Tests the specified completion order and verifies TXE release behavior.
1186+
*
1187+
* @param[in] resource Test resource structure
1188+
* @param[in] send_first If true, send completion happens first; if false, receipt first
1189+
*/
1190+
static void test_efa_rdm_txe_dc_release_common(struct efa_resource *resource, bool send_first)
1191+
{
1192+
struct efa_rdm_ep *efa_rdm_ep;
1193+
struct efa_rdm_ope *txe;
1194+
struct efa_rdm_pke *dc_pkt_entry, *receipt_pkt_entry;
1195+
1196+
efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
1197+
1198+
efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
1199+
1200+
/* Allocate TXE and set up for DC operation */
1201+
txe = efa_unit_test_alloc_txe(resource, ofi_op_msg);
1202+
assert_non_null(txe);
1203+
txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED;
1204+
txe->efa_outstanding_tx_ops = 1;
1205+
1206+
/* Create fake DC packet entry */
1207+
dc_pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_tx_pkt_pool, EFA_RDM_PKE_FROM_EFA_TX_POOL);
1208+
assert_non_null(dc_pkt_entry);
1209+
dc_pkt_entry->ope = txe;
1210+
dc_pkt_entry->ep = efa_rdm_ep;
1211+
dc_pkt_entry->peer = txe->peer;
1212+
/* Set DC packet type in wiredata */
1213+
struct efa_rdm_base_hdr *base_hdr = (struct efa_rdm_base_hdr *)dc_pkt_entry->wiredata;
1214+
base_hdr->type = EFA_RDM_DC_EAGER_MSGRTM_PKT;
1215+
1216+
/* Create fake receipt packet entry */
1217+
receipt_pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL);
1218+
assert_non_null(receipt_pkt_entry);
1219+
receipt_pkt_entry->ope = txe;
1220+
receipt_pkt_entry->ep = efa_rdm_ep;
1221+
1222+
/* Verify TXE is not ready for release initially */
1223+
assert_false(efa_rdm_txe_dc_ready_for_release(txe));
1224+
assert_int_equal(efa_unit_test_get_dlist_length(&efa_rdm_ep->txe_list), 1);
1225+
1226+
if (send_first) {
1227+
/* Send completion first - should not release TXE yet */
1228+
efa_rdm_pke_handle_send_completion(dc_pkt_entry);
1229+
assert_int_equal(efa_unit_test_get_dlist_length(&efa_rdm_ep->txe_list), 1);
1230+
assert_false(efa_rdm_txe_dc_ready_for_release(txe));
1231+
1232+
/* Receipt handling - should now release TXE */
1233+
efa_rdm_pke_handle_receipt_recv(receipt_pkt_entry);
1234+
} else {
1235+
/* Receipt handling first - should not release TXE yet */
1236+
efa_rdm_pke_handle_receipt_recv(receipt_pkt_entry);
1237+
assert_int_equal(efa_unit_test_get_dlist_length(&efa_rdm_ep->txe_list), 1);
1238+
assert_true(txe->internal_flags & EFA_RDM_TXE_RECEIPT_RECEIVED);
1239+
assert_false(efa_rdm_txe_dc_ready_for_release(txe));
1240+
1241+
/* Send completion - should now release TXE */
1242+
efa_rdm_pke_handle_send_completion(dc_pkt_entry);
1243+
}
1244+
1245+
/* Verify TXE is released */
1246+
assert_int_equal(efa_unit_test_get_dlist_length(&efa_rdm_ep->txe_list), 0);
1247+
}
1248+
1249+
/**
1250+
* @brief Test DC packet TXE release with send completion first
1251+
*
1252+
* This test verifies the DC (Delivery Complete) TXE release logic when
1253+
* send completion arrives before receipt acknowledgment.
1254+
*
1255+
* @param[in] state cmocka state variable
1256+
*/
1257+
void test_efa_rdm_txe_dc_send_first(struct efa_resource **state)
1258+
{
1259+
test_efa_rdm_txe_dc_release_common(*state, true);
1260+
}
1261+
1262+
/**
1263+
* @brief Test DC packet TXE release with receipt completion first
1264+
*
1265+
* This test verifies the race condition fix where receipt acknowledgment
1266+
* arrives before send completion. The TXE should only be released when
1267+
* both conditions are met, regardless of order.
1268+
*
1269+
* @param[in] state cmocka state variable
1270+
*/
1271+
void test_efa_rdm_txe_dc_receipt_first(struct efa_resource **state)
1272+
{
1273+
test_efa_rdm_txe_dc_release_common(*state, false);
1274+
}

prov/efa/test/efa_unit_tests.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,8 @@ int main(void)
333333
cmocka_unit_test_setup_teardown(test_efa_rdm_ope_eor_packet_failed_posting, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
334334
cmocka_unit_test_setup_teardown(test_efa_rdm_ope_eor_packet_tracking_unresponsive_wait_send, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
335335
cmocka_unit_test_setup_teardown(test_efa_rdm_atomic_compare_desc_persistence, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
336+
cmocka_unit_test_setup_teardown(test_efa_rdm_txe_dc_send_first, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
337+
cmocka_unit_test_setup_teardown(test_efa_rdm_txe_dc_receipt_first, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
336338
/* end of efa_unit_test_ope.c */
337339

338340
cmocka_unit_test_setup_teardown(test_efa_rdm_msg_send_to_local_peer_with_null_desc, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),

prov/efa/test/efa_unit_tests.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,8 @@ void test_efa_rdm_ope_eor_packet_tracking_wait_send();
291291
void test_efa_rdm_ope_eor_packet_failed_posting();
292292
void test_efa_rdm_ope_eor_packet_tracking_unresponsive_wait_send();
293293
void test_efa_rdm_atomic_compare_desc_persistence();
294+
void test_efa_rdm_txe_dc_send_first();
295+
void test_efa_rdm_txe_dc_receipt_first();
294296

295297

296298
/* end of efa_unit_test_ope.c */

0 commit comments

Comments
 (0)