
Commit a0b093c

Alan Jowett and co-author authored
Enable add/remove CPU's from epoch consensus (microsoft#3771)
Signed-off-by: Alan Jowett <[email protected]> Co-authored-by: Alan Jowett <[email protected]>
1 parent 6e8f47a commit a0b093c

File tree

4 files changed (+224, -34 lines)


libs/runtime/ebpf_epoch.c

Lines changed: 195 additions & 27 deletions
@@ -17,7 +17,23 @@
  * 1) Each CPU determines the minimum epoch of all threads on the CPU.
  * 2) The minimum epoch is committed as the release epoch and any memory that is older than the release epoch is
  * released.
- * 3) The epoch_computation_in_progress flag is cleared which allows the epoch computation to be initiated again.
+ * 3) The epoch_computation_in_progress flag is cleared which allows the epoch computation to be initiated again.
+ *
+ * Note:
+ * CPUs can be in one of three states:
+ * 1) Inactive: The CPU is not actively participating in epoch computation.
+ *    Active flag is false.
+ * 2) Activating: The CPU is in the process of activating and is not yet active.
+ *    Active flag is true and current_epoch == EBPF_EPOCH_UNKNOWN_EPOCH.
+ * 3) Active: The CPU is actively participating in epoch computation.
+ *    Active flag is true and current_epoch != EBPF_EPOCH_UNKNOWN_EPOCH.
+ *
+ * All CPUs except CPU 0 are in the inactive state at initialization. CPU 0 is always active.
+ *
+ * CPUs transition between states as follows:
+ * 1) Inactive -> Activating: The CPU is activated when a thread enters an epoch and the CPU is not active.
+ * 2) Activating -> Active: The CPU is active when it is notified of the current epoch value.
+ * 3) Active -> Inactive: The CPU is deactivated when there are no threads in the epoch and the free list is empty.
  */
 
 /**
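
As an aside (not part of this commit): the three CPU states described in the comment above are encoded by two of the new per-CPU fields. The following sketch shows that mapping only for illustration; the enum and helper names are hypothetical.

typedef enum _epoch_cpu_state
{
    EPOCH_CPU_INACTIVE,   // active flag is false.
    EPOCH_CPU_ACTIVATING, // active flag is true and current_epoch == EBPF_EPOCH_UNKNOWN_EPOCH.
    EPOCH_CPU_ACTIVE,     // active flag is true and current_epoch != EBPF_EPOCH_UNKNOWN_EPOCH.
} epoch_cpu_state_t;

static epoch_cpu_state_t
_epoch_cpu_state(const ebpf_epoch_cpu_entry_t* cpu_entry)
{
    if (!cpu_entry->active) {
        return EPOCH_CPU_INACTIVE;
    }
    return (cpu_entry->current_epoch == EBPF_EPOCH_UNKNOWN_EPOCH) ? EPOCH_CPU_ACTIVATING : EPOCH_CPU_ACTIVE;
}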
@@ -30,6 +46,16 @@
  */
 #define EBPF_NANO_SECONDS_PER_FILETIME_TICK 100
 
+/**
+ * @brief A sentinel value used to indicate that the epoch is unknown.
+ */
+#define EBPF_EPOCH_UNKNOWN_EPOCH 0
+
+/**
+ * @brief The first valid epoch value.
+ */
+#define EBPF_EPOCH_FIRST_EPOCH 1
+
 #define EBPF_EPOCH_FAIL_FAST(REASON, ASSERTION) \
     if (!(ASSERTION)) {                         \
         ebpf_assert(!#ASSERTION);               \
@@ -51,9 +77,19 @@ typedef __declspec(align(EBPF_CACHE_LINE_SIZE)) struct _ebpf_epoch_cpu_entry
     int timer_armed : 1;                   ///< Set if the flush timer is armed.
     int rundown_in_progress : 1;           ///< Set if rundown is in progress.
     int epoch_computation_in_progress : 1; ///< Set if epoch computation is in progress.
-    ebpf_timed_work_queue_t* work_queue;   ///< Work queue used to schedule work items.
+    int active : 1;              ///< CPU is active in epoch computation. Only accessed under _ebpf_epoch_cpu_active_lock.
+    int work_queue_assigned : 1; ///< Work queue is assigned to this CPU.
+    ebpf_timed_work_queue_t* work_queue; ///< Work queue used to schedule work items.
 } ebpf_epoch_cpu_entry_t;
 
+static_assert(
+    sizeof(ebpf_epoch_cpu_entry_t) % EBPF_CACHE_LINE_SIZE == 0, "ebpf_epoch_cpu_entry_t is not cache aligned");
+
+/**
+ * @brief Lock to ensure a consistent view of the active CPUs.
+ */
+static ebpf_lock_t _ebpf_epoch_cpu_active_lock; ///< Lock to protect the active CPU list.
+
 /**
  * @brief Table of per-CPU state.
  */
@@ -116,12 +152,12 @@ typedef struct _ebpf_epoch_cpu_message
 {
     struct
     {
-        uint64_t current_epoch;          ///< The new current epoch.
-        uint64_t proposed_release_epoch; ///< Minimum epoch of all threads on the CPU.
+        int64_t current_epoch;          ///< The new current epoch.
+        int64_t proposed_release_epoch; ///< Minimum epoch of all threads on the CPU.
     } propose_epoch;
     struct
     {
-        uint64_t released_epoch; ///< The newest epoch that can be released.
+        int64_t released_epoch; ///< The newest epoch that can be released.
     } commit_epoch;
     struct
     {
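
One plausible motivation for moving these fields from uint64_t to int64_t (the commit message does not state it, so this is an inference): the commit step below computes released_epoch - 1, and with the new sentinel EBPF_EPOCH_UNKNOWN_EPOCH == 0 that expression needs to yield a value older than every valid epoch rather than wrapping around. A minimal standalone demonstration, not code from the commit:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    // With a signed epoch, 0 - 1 is -1, which is older than EBPF_EPOCH_FIRST_EPOCH (1),
    // so nothing is released while a CPU is still activating.
    int64_t signed_released = (int64_t)0 - 1;

    // With an unsigned epoch, 0 - 1 wraps to UINT64_MAX, which would look newer than
    // every valid epoch and release everything prematurely.
    uint64_t unsigned_released = (uint64_t)0 - 1;

    printf("signed: %lld, unsigned: %llu\n", (long long)signed_released, (unsigned long long)unsigned_released);
    return 0;
}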
@@ -224,6 +260,15 @@ static _IRQL_requires_(DISPATCH_LEVEL) void _ebpf_epoch_arm_timer_if_needed(ebpf
 static void
 _ebpf_epoch_work_item_callback(_In_ cxplat_preemptible_work_item_t* preemptible_work_item, void* context);
 
+static void
+_ebpf_epoch_activate_cpu(uint32_t cpu_id);
+
+static void
+_ebpf_epoch_deactivate_cpu(uint32_t cpu_id);
+
+uint32_t
+_ebpf_epoch_next_active_cpu(uint32_t cpu_id);
+
 /**
  * @brief Raise the CPU's IRQL to DISPATCH_LEVEL if it is below DISPATCH_LEVEL.
  * First check if the IRQL is below DISPATCH_LEVEL to avoid the overhead of
@@ -278,12 +323,13 @@ ebpf_epoch_initiate()
         goto Error;
     }
 
+    ebpf_lock_create(&_ebpf_epoch_cpu_active_lock);
+
     ebpf_assert(EBPF_CACHE_ALIGN_POINTER(_ebpf_epoch_cpu_table) == _ebpf_epoch_cpu_table);
 
     // Initialize the per-CPU state.
     for (uint32_t cpu_id = 0; cpu_id < _ebpf_epoch_cpu_count; cpu_id++) {
         ebpf_epoch_cpu_entry_t* cpu_entry = &_ebpf_epoch_cpu_table[cpu_id];
-        cpu_entry->current_epoch = 1;
         ebpf_list_initialize(&cpu_entry->epoch_state_list);
         ebpf_list_initialize(&cpu_entry->free_list);
     }
@@ -302,6 +348,12 @@ ebpf_epoch_initiate()
         }
     }
 
+    // CPU 0 is always active.
+    _ebpf_epoch_activate_cpu(0);
+
+    // Set the current epoch for CPU 0.
+    _ebpf_epoch_cpu_table[0].current_epoch = EBPF_EPOCH_FIRST_EPOCH;
+
     KeInitializeDpc(&_ebpf_epoch_timer_dpc, _ebpf_epoch_timer_worker, NULL);
     KeSetTargetProcessorDpc(&_ebpf_epoch_timer_dpc, 0);
 
@@ -358,6 +410,7 @@ ebpf_epoch_terminate()
     cxplat_free(
         _ebpf_epoch_cpu_table, CXPLAT_POOL_FLAG_NON_PAGED | CXPLAT_POOL_FLAG_CACHE_ALIGNED, EBPF_POOL_TAG_EPOCH);
     _ebpf_epoch_cpu_table = NULL;
+
     EBPF_RETURN_VOID();
 }
 
@@ -376,6 +429,10 @@ ebpf_epoch_enter(_Out_ ebpf_epoch_state_t* epoch_state)
     ebpf_list_insert_tail(&cpu_entry->epoch_state_list, &epoch_state->epoch_list_entry);
 
     _ebpf_epoch_lower_to_previous_irql(epoch_state->irql_at_enter);
+
+    if (!cpu_entry->active) {
+        _ebpf_epoch_activate_cpu(epoch_state->cpu_id);
+    }
 }
 #pragma warning(pop)
 
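For context, a typical caller-side use of the epoch API is sketched below. This is illustrative only, not code from the commit, and it assumes an ebpf_epoch_exit() counterpart that takes the same ebpf_epoch_state_t (the exit path is not shown in this diff).

ebpf_epoch_state_t epoch_state;

ebpf_epoch_enter(&epoch_state); // With this change, first use on an inactive CPU activates it.
// ... access epoch-protected memory ...
ebpf_epoch_exit(&epoch_state);  // Assumed counterpart; not part of this diff.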
@@ -650,6 +707,10 @@ _ebpf_epoch_insert_in_free_list(_In_ ebpf_epoch_allocation_header_t* header)
     uint32_t cpu_id = ebpf_get_current_cpu();
     ebpf_epoch_cpu_entry_t* cpu_entry = &_ebpf_epoch_cpu_table[cpu_id];
 
+    if (!cpu_entry->active) {
+        _ebpf_epoch_activate_cpu(cpu_id);
+    }
+
     if (cpu_entry->rundown_in_progress) {
         KeLowerIrql(old_irql);
         switch (header->entry_type) {
@@ -747,8 +808,6 @@ void
 _ebpf_epoch_messenger_propose_release_epoch(
     _Inout_ ebpf_epoch_cpu_entry_t* cpu_entry, _Inout_ ebpf_epoch_cpu_message_t* message, uint32_t current_cpu)
 {
-    // Walk over each thread_entry in the epoch_state_list and compute the minimum epoch.
-    ebpf_list_entry_t* entry = cpu_entry->epoch_state_list.Flink;
     ebpf_epoch_state_t* epoch_state;
     uint32_t next_cpu;
 
@@ -760,32 +819,43 @@ _ebpf_epoch_messenger_propose_release_epoch(
     }
     // Other CPUs update the current epoch.
     else {
+        // If the epoch was unknown, then update the freed_epoch for all items in the free list now that we know the
+        // current epoch. This occurs when the CPU is activated and continues until the first epoch is proposed.
+        if (cpu_entry->current_epoch == EBPF_EPOCH_UNKNOWN_EPOCH) {
+            for (ebpf_list_entry_t* entry = cpu_entry->free_list.Flink; entry != &cpu_entry->free_list;
+                 entry = entry->Flink) {
+                ebpf_epoch_allocation_header_t* header =
+                    CONTAINING_RECORD(entry, ebpf_epoch_allocation_header_t, list_entry);
+                ebpf_assert(header->freed_epoch == EBPF_EPOCH_UNKNOWN_EPOCH);
+                header->freed_epoch = cpu_entry->current_epoch;
+            }
+        }
+
         cpu_entry->current_epoch = message->message.propose_epoch.current_epoch;
     }
 
     // Put a memory barrier here to ensure that the write is not re-ordered.
     MemoryBarrier();
 
     // Previous CPU's minimum epoch.
-    uint64_t minimum_epoch = message->message.propose_epoch.proposed_release_epoch;
+    int64_t minimum_epoch = message->message.propose_epoch.proposed_release_epoch;
 
-    while (entry != &cpu_entry->epoch_state_list) {
+    // Walk over each thread_entry in the epoch_state_list and compute the minimum epoch.
+    for (ebpf_list_entry_t* entry = &cpu_entry->epoch_state_list; entry != &cpu_entry->epoch_state_list;
+         entry = entry->Flink) {
         epoch_state = CONTAINING_RECORD(entry, ebpf_epoch_state_t, epoch_list_entry);
         minimum_epoch = min(minimum_epoch, epoch_state->epoch);
-        entry = entry->Flink;
     }
 
     // Set the proposed release epoch to the minimum epoch seen so far.
     message->message.propose_epoch.proposed_release_epoch = minimum_epoch;
 
+    next_cpu = _ebpf_epoch_next_active_cpu(current_cpu);
+
     // If this is the last CPU, then send a message to the first CPU to commit the release epoch.
-    if (current_cpu == _ebpf_epoch_cpu_count - 1) {
+    if (next_cpu == 0) {
         message->message.commit_epoch.released_epoch = minimum_epoch;
         message->message_type = EBPF_EPOCH_CPU_MESSAGE_TYPE_COMMIT_RELEASE_EPOCH;
-        next_cpu = 0;
-    } else {
-        // Send the message to the next CPU.
-        next_cpu = current_cpu + 1;
     }
 
     _ebpf_epoch_send_message_async(message, next_cpu);
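
The hunk above and the ones below replace the old "current_cpu + 1 until the last CPU" walk with a walk over only the active CPUs, where wrapping around to CPU 0 marks the end of a cycle. The following is a simplified, single-threaded model of that traversal, for illustration only; the array and count are hypothetical stand-ins for the real per-CPU table and its lock.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical stand-in for the per-CPU table: which CPUs are currently active.
static const bool active[] = {true, false, true, true, false};
static const uint32_t cpu_count = 5;

static uint32_t
next_active_cpu(uint32_t cpu_id)
{
    for (uint32_t next = cpu_id + 1; next < cpu_count; next++) {
        if (active[next]) {
            return next;
        }
    }
    return 0; // Wrap to CPU 0 (always active); returning 0 also marks the end of a cycle.
}

int
main(void)
{
    // Visit every active CPU exactly once, in order, starting and ending at CPU 0.
    uint32_t cpu = 0;
    do {
        printf("visit CPU %u\n", cpu); // Each active CPU would fold its minimum epoch into the message here.
        cpu = next_active_cpu(cpu);
    } while (cpu != 0);
    return 0;
}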
@@ -813,22 +883,41 @@ _ebpf_epoch_messenger_commit_release_epoch(
 {
     uint32_t next_cpu;
 
+    // If any epoch_states are in EBPF_EPOCH_UNKNOWN_EPOCH, then activation of a CPU is in progress.
+    bool other_cpus_are_activating = (message->message.commit_epoch.released_epoch == EBPF_EPOCH_UNKNOWN_EPOCH);
+
+    // If this CPU is in EBPF_EPOCH_UNKNOWN_EPOCH, then activation of this CPU is in progress.
+    bool this_cpu_is_activating = (cpu_entry->current_epoch == EBPF_EPOCH_UNKNOWN_EPOCH);
+
     cpu_entry->timer_armed = false;
     // Set the released_epoch to the value computed by the EBPF_EPOCH_CPU_MESSAGE_TYPE_PROPOSE_RELEASE_EPOCH message.
     cpu_entry->released_epoch = message->message.commit_epoch.released_epoch - 1;
 
+    next_cpu = _ebpf_epoch_next_active_cpu(current_cpu);
+
     // If this is the last CPU, send the message to the first CPU to complete the cycle.
-    if (current_cpu != _ebpf_epoch_cpu_count - 1) {
-        // Send the message to the next CPU.
-        next_cpu = current_cpu + 1;
-    } else {
+    if (next_cpu == 0) {
         message->message_type = EBPF_EPOCH_CPU_MESSAGE_TYPE_PROPOSE_EPOCH_COMPLETE;
-        next_cpu = 0;
     }
 
     _ebpf_epoch_send_message_async(message, next_cpu);
 
+    // Wait for all the CPUs to transition to an active state.
+    if (other_cpus_are_activating || this_cpu_is_activating) {
+        // One or more CPUs are still activating. Rearm the timer and wait for the next message.
+        _ebpf_epoch_arm_timer_if_needed(cpu_entry);
+        return;
+    }
+
+    // All CPUs have transitioned to an active state and the epoch computation was successfully completed.
+    // Release any memory that is associated with expired epochs.
     _ebpf_epoch_release_free_list(cpu_entry, cpu_entry->released_epoch);
+
+    // Check if this CPU is idle and deactivate it if it is (CPU 0 is always active).
+    if ((current_cpu != 0) && ebpf_list_is_empty(&cpu_entry->free_list) &&
+        ebpf_list_is_empty(&cpu_entry->epoch_state_list)) {
+        _ebpf_epoch_deactivate_cpu(current_cpu);
+    }
 }
 
 /**
@@ -894,15 +983,13 @@ _ebpf_epoch_messenger_rundown_in_progress(
 {
     uint32_t next_cpu;
     cpu_entry->rundown_in_progress = true;
+
+    next_cpu = _ebpf_epoch_next_active_cpu(current_cpu);
+
     // If this is the last CPU, then stop.
-    if (current_cpu != _ebpf_epoch_cpu_count - 1) {
-        // Send the message to the next CPU.
-        next_cpu = current_cpu + 1;
-    } else {
+    if (next_cpu == 0) {
         // Signal the caller that rundown is complete.
         KeSetEvent(&message->completion_event, 0, FALSE);
-        // Set next_cpu to UINT32_MAX to make code analysis happy.
-        next_cpu = UINT32_MAX;
         return;
     }
 
@@ -1028,3 +1115,84 @@ _ebpf_epoch_work_item_callback(_In_ cxplat_preemptible_work_item_t* preemptible_
 
     cxplat_release_rundown_protection(&_ebpf_epoch_work_item_rundown_ref);
 }
+
+/**
+ * @brief Add the CPU to the next active CPU table.
+ *
+ * @param[in] cpu_id CPU to add.
+ */
+static void
+_ebpf_epoch_activate_cpu(uint32_t cpu_id)
+{
+    EBPF_LOG_ENTRY();
+
+    EBPF_LOG_MESSAGE_UINT64(EBPF_TRACELOG_LEVEL_INFO, EBPF_TRACELOG_KEYWORD_EPOCH, "Activating CPU", cpu_id);
+
+    ebpf_epoch_cpu_entry_t* cpu_entry = &_ebpf_epoch_cpu_table[cpu_id];
+    ebpf_lock_state_t state = ebpf_lock_lock(&_ebpf_epoch_cpu_active_lock);
+
+    cpu_entry->active = true;
+    // When the CPU is activated, the current epoch is not known.
+    // Memory freed before the current epoch is set will have its epoch set to EBPF_EPOCH_UNKNOWN_EPOCH and have its
+    // epoch set when the current epoch is known (i.e., when the next epoch is proposed).
+    cpu_entry->current_epoch = EBPF_EPOCH_UNKNOWN_EPOCH;
+
+    if (!cpu_entry->work_queue_assigned) {
+        ebpf_result_t result = ebpf_timed_work_queue_set_cpu_id(cpu_entry->work_queue, cpu_id);
+        if (result != EBPF_SUCCESS) {
+            // This is a fatal error. The epoch system is in an inconsistent state.
+            __fastfail(FAST_FAIL_INVALID_ARG);
+        }
+        cpu_entry->work_queue_assigned = 1;
+    }
+
+    ebpf_lock_unlock(&_ebpf_epoch_cpu_active_lock, state);
+    EBPF_LOG_EXIT();
+}
+
+/**
+ * @brief Remove the CPU from the next active CPU table.
+ *
+ * @param[in] cpu_id CPU to remove.
+ */
+static void
+_ebpf_epoch_deactivate_cpu(uint32_t cpu_id)
+{
+    EBPF_LOG_ENTRY();
+
+    EBPF_LOG_MESSAGE_UINT64(EBPF_TRACELOG_LEVEL_INFO, EBPF_TRACELOG_KEYWORD_EPOCH, "Deactivating CPU", cpu_id);
+
+    ebpf_epoch_cpu_entry_t* cpu_entry = &_ebpf_epoch_cpu_table[cpu_id];
+    ebpf_lock_state_t state = ebpf_lock_lock(&_ebpf_epoch_cpu_active_lock);
+    cpu_entry->active = false;
+    ebpf_lock_unlock(&_ebpf_epoch_cpu_active_lock, state);
+
+    EBPF_LOG_EXIT();
+}
+
+/**
+ * @brief Given the current CPU, return the next active CPU.
+ *
+ * @param[in] cpu_id The current CPU.
+ * @return The next active CPU.
+ */
+uint32_t
+_ebpf_epoch_next_active_cpu(uint32_t cpu_id)
+{
+    uint32_t next_active_cpu;
+    ebpf_lock_state_t state = ebpf_lock_lock(&_ebpf_epoch_cpu_active_lock);
+
+    for (next_active_cpu = cpu_id + 1; next_active_cpu < _ebpf_epoch_cpu_count; next_active_cpu++) {
+        if (_ebpf_epoch_cpu_table[next_active_cpu].active) {
+            break;
+        }
+    }
+
+    if (next_active_cpu == _ebpf_epoch_cpu_count) {
+        next_active_cpu = 0;
+    }
+
+    ebpf_lock_unlock(&_ebpf_epoch_cpu_active_lock, state);
+
+    return next_active_cpu;
+}

libs/runtime/ebpf_epoch.h

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ extern "C"
     typedef struct _ebpf_epoch_state
     {
         LIST_ENTRY epoch_list_entry; /// List entry for the epoch list.
-        uint64_t epoch;              /// The epoch when this entry was added to the list.
+        int64_t epoch;               /// The epoch when this entry was added to the list.
         uint32_t cpu_id;             /// The CPU on which this entry was added to the list.
        KIRQL irql_at_enter;          /// The IRQL when this entry was added to the list.
    } ebpf_epoch_state_t;
