 * 1) Each CPU determines the minimum epoch of all threads on the CPU.
 * 2) The minimum epoch is committed as the release epoch and any memory that is older than the release epoch is
 * released.
- * 3) The epoch_computation_in_progress flag is cleared which allows the epoch computation to be initiated again.
+ * 3) The epoch_computation_in_progress flag is cleared which allows the epoch computation to be initiated again.
+ *
+ * Note:
+ * CPUs can be in one of three states:
+ * 1) Inactive: The CPU is not actively participating in epoch computation.
+ *    Active flag is false.
+ * 2) Activating: The CPU is in the process of activating and is not yet active.
+ *    Active flag is true and current_epoch == EBPF_EPOCH_UNKNOWN_EPOCH.
+ * 3) Active: The CPU is actively participating in epoch computation.
+ *    Active flag is true and current_epoch != EBPF_EPOCH_UNKNOWN_EPOCH.
+ *
+ * All CPUs except CPU 0 are in the inactive state at initialization. CPU 0 is always active.
+ *
+ * CPUs transition between states as follows:
+ * 1) Inactive -> Activating: The CPU is activated when a thread enters an epoch and the CPU is not active.
+ * 2) Activating -> Active: The CPU becomes active when it is notified of the current epoch value.
+ * 3) Active -> Inactive: The CPU is deactivated when there are no threads in the epoch and the free list is empty.
 */
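To make the state table above concrete, here is a minimal, self-contained sketch (not part of the patch; all example_* and EXAMPLE_* names are hypothetical) that derives the three states from the two per-CPU fields this change relies on, the active flag and current_epoch:

// Illustrative sketch only: classify a CPU from its active flag and current epoch.
#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_EPOCH_UNKNOWN_EPOCH 0 // Mirrors EBPF_EPOCH_UNKNOWN_EPOCH defined below.

typedef enum _example_epoch_cpu_state
{
    EXAMPLE_EPOCH_CPU_INACTIVE,   // Not participating in epoch computation.
    EXAMPLE_EPOCH_CPU_ACTIVATING, // Activated, but the current epoch is not yet known.
    EXAMPLE_EPOCH_CPU_ACTIVE,     // Fully participating in epoch computation.
} example_epoch_cpu_state_t;

static example_epoch_cpu_state_t
example_epoch_cpu_state(bool active, int64_t current_epoch)
{
    if (!active) {
        return EXAMPLE_EPOCH_CPU_INACTIVE;
    }
    return (current_epoch == EXAMPLE_EPOCH_UNKNOWN_EPOCH) ? EXAMPLE_EPOCH_CPU_ACTIVATING : EXAMPLE_EPOCH_CPU_ACTIVE;
}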

/**
 * ...
 */
#define EBPF_NANO_SECONDS_PER_FILETIME_TICK 100

+/**
+ * @brief A sentinel value used to indicate that the epoch is unknown.
+ */
+#define EBPF_EPOCH_UNKNOWN_EPOCH 0
+
+/**
+ * @brief The first valid epoch value.
+ */
+#define EBPF_EPOCH_FIRST_EPOCH 1
+
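A hedged sketch of how the two sentinels above interact (illustrative only; the example_* types and helpers are hypothetical, not part of the patch): memory freed while a CPU is still activating is tagged with EBPF_EPOCH_UNKNOWN_EPOCH and re-stamped once the first real epoch arrives.

// Illustrative sketch: epoch-stamping freed memory around CPU activation.
#include <stdint.h>

typedef struct _example_allocation_header
{
    int64_t freed_epoch; // Epoch in which the allocation was freed.
} example_allocation_header_t;

// At free time on an activating CPU, the CPU's current epoch may still be the sentinel.
static void
example_tag_on_free(example_allocation_header_t* header, int64_t cpu_current_epoch)
{
    header->freed_epoch = cpu_current_epoch; // May be EBPF_EPOCH_UNKNOWN_EPOCH (0).
}

// When the first propose-epoch message delivers a real epoch, re-stamp the entry.
static void
example_fix_up_on_first_epoch(example_allocation_header_t* header, int64_t first_known_epoch)
{
    if (header->freed_epoch == 0 /* EBPF_EPOCH_UNKNOWN_EPOCH */) {
        header->freed_epoch = first_known_epoch;
    }
}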
#define EBPF_EPOCH_FAIL_FAST(REASON, ASSERTION) \
    if (!(ASSERTION)) {                         \
        ebpf_assert(!#ASSERTION);               \
@@ -51,9 +77,19 @@ typedef __declspec(align(EBPF_CACHE_LINE_SIZE)) struct _ebpf_epoch_cpu_entry
    int timer_armed : 1;                   ///< Set if the flush timer is armed.
    int rundown_in_progress : 1;           ///< Set if rundown is in progress.
    int epoch_computation_in_progress : 1; ///< Set if epoch computation is in progress.
-    ebpf_timed_work_queue_t* work_queue;  ///< Work queue used to schedule work items.
+    int active : 1;              ///< CPU is active in epoch computation. Only accessed under _ebpf_epoch_cpu_active_lock.
+    int work_queue_assigned : 1; ///< Work queue is assigned to this CPU.
+    ebpf_timed_work_queue_t* work_queue; ///< Work queue used to schedule work items.
} ebpf_epoch_cpu_entry_t;

+static_assert(
+    sizeof(ebpf_epoch_cpu_entry_t) % EBPF_CACHE_LINE_SIZE == 0, "ebpf_epoch_cpu_entry_t is not cache aligned");
+
+/**
+ * @brief Lock to ensure a consistent view of the active CPUs.
+ */
+static ebpf_lock_t _ebpf_epoch_cpu_active_lock; ///< Lock to protect the active CPU list.
+
/**
 * @brief Table of per-CPU state.
 */
@@ -116,12 +152,12 @@ typedef struct _ebpf_epoch_cpu_message
{
    struct
    {
-        uint64_t current_epoch;          ///< The new current epoch.
-        uint64_t proposed_release_epoch; ///< Minimum epoch of all threads on the CPU.
+        int64_t current_epoch;           ///< The new current epoch.
+        int64_t proposed_release_epoch;  ///< Minimum epoch of all threads on the CPU.
    } propose_epoch;
    struct
    {
-        uint64_t released_epoch; ///< The newest epoch that can be released.
+        int64_t released_epoch;  ///< The newest epoch that can be released.
    } commit_epoch;
    struct
    {
@@ -224,6 +260,15 @@ static _IRQL_requires_(DISPATCH_LEVEL) void _ebpf_epoch_arm_timer_if_needed(ebpf
static void
_ebpf_epoch_work_item_callback(_In_ cxplat_preemptible_work_item_t* preemptible_work_item, void* context);

+static void
+_ebpf_epoch_activate_cpu(uint32_t cpu_id);
+
+static void
+_ebpf_epoch_deactivate_cpu(uint32_t cpu_id);
+
+uint32_t
+_ebpf_epoch_next_active_cpu(uint32_t cpu_id);
+
/**
 * @brief Raise the CPU's IRQL to DISPATCH_LEVEL if it is below DISPATCH_LEVEL.
 * First check if the IRQL is below DISPATCH_LEVEL to avoid the overhead of
@@ -278,12 +323,13 @@ ebpf_epoch_initiate()
        goto Error;
    }

+    ebpf_lock_create(&_ebpf_epoch_cpu_active_lock);
+
    ebpf_assert(EBPF_CACHE_ALIGN_POINTER(_ebpf_epoch_cpu_table) == _ebpf_epoch_cpu_table);

    // Initialize the per-CPU state.
    for (uint32_t cpu_id = 0; cpu_id < _ebpf_epoch_cpu_count; cpu_id++) {
        ebpf_epoch_cpu_entry_t* cpu_entry = &_ebpf_epoch_cpu_table[cpu_id];
-        cpu_entry->current_epoch = 1;
        ebpf_list_initialize(&cpu_entry->epoch_state_list);
        ebpf_list_initialize(&cpu_entry->free_list);
    }
@@ -302,6 +348,12 @@ ebpf_epoch_initiate()
        }
    }

+    // CPU 0 is always active.
+    _ebpf_epoch_activate_cpu(0);
+
+    // Set the current epoch for CPU 0.
+    _ebpf_epoch_cpu_table[0].current_epoch = EBPF_EPOCH_FIRST_EPOCH;
+
    KeInitializeDpc(&_ebpf_epoch_timer_dpc, _ebpf_epoch_timer_worker, NULL);
    KeSetTargetProcessorDpc(&_ebpf_epoch_timer_dpc, 0);

@@ -358,6 +410,7 @@ ebpf_epoch_terminate()
    cxplat_free(
        _ebpf_epoch_cpu_table, CXPLAT_POOL_FLAG_NON_PAGED | CXPLAT_POOL_FLAG_CACHE_ALIGNED, EBPF_POOL_TAG_EPOCH);
    _ebpf_epoch_cpu_table = NULL;
+
    EBPF_RETURN_VOID();
}

@@ -376,6 +429,10 @@ ebpf_epoch_enter(_Out_ ebpf_epoch_state_t* epoch_state)
    ebpf_list_insert_tail(&cpu_entry->epoch_state_list, &epoch_state->epoch_list_entry);

    _ebpf_epoch_lower_to_previous_irql(epoch_state->irql_at_enter);
+
+    if (!cpu_entry->active) {
+        _ebpf_epoch_activate_cpu(epoch_state->cpu_id);
+    }
}
#pragma warning(pop)

@@ -650,6 +707,10 @@ _ebpf_epoch_insert_in_free_list(_In_ ebpf_epoch_allocation_header_t* header)
    uint32_t cpu_id = ebpf_get_current_cpu();
    ebpf_epoch_cpu_entry_t* cpu_entry = &_ebpf_epoch_cpu_table[cpu_id];

+    if (!cpu_entry->active) {
+        _ebpf_epoch_activate_cpu(cpu_id);
+    }
+
    if (cpu_entry->rundown_in_progress) {
        KeLowerIrql(old_irql);
        switch (header->entry_type) {
_ebpf_epoch_messenger_propose_release_epoch(
    _Inout_ ebpf_epoch_cpu_entry_t* cpu_entry, _Inout_ ebpf_epoch_cpu_message_t* message, uint32_t current_cpu)
{
-    // Walk over each thread_entry in the epoch_state_list and compute the minimum epoch.
-    ebpf_list_entry_t* entry = cpu_entry->epoch_state_list.Flink;
    ebpf_epoch_state_t* epoch_state;
    uint32_t next_cpu;
@@ -760,32 +819,43 @@ _ebpf_epoch_messenger_propose_release_epoch(
    }
    // Other CPUs update the current epoch.
    else {
+        // If the epoch was unknown, then update the freed_epoch for all items in the free list now that we know the
+        // current epoch. This occurs when the CPU is activated and continues until the first epoch is proposed.
+        if (cpu_entry->current_epoch == EBPF_EPOCH_UNKNOWN_EPOCH) {
+            for (ebpf_list_entry_t* entry = cpu_entry->free_list.Flink; entry != &cpu_entry->free_list;
+                 entry = entry->Flink) {
+                ebpf_epoch_allocation_header_t* header =
+                    CONTAINING_RECORD(entry, ebpf_epoch_allocation_header_t, list_entry);
+                ebpf_assert(header->freed_epoch == EBPF_EPOCH_UNKNOWN_EPOCH);
+                header->freed_epoch = message->message.propose_epoch.current_epoch;
+            }
+        }
+
        cpu_entry->current_epoch = message->message.propose_epoch.current_epoch;
    }

    // Put a memory barrier here to ensure that the write is not re-ordered.
    MemoryBarrier();

    // Previous CPU's minimum epoch.
-    uint64_t minimum_epoch = message->message.propose_epoch.proposed_release_epoch;
+    int64_t minimum_epoch = message->message.propose_epoch.proposed_release_epoch;

-    while (entry != &cpu_entry->epoch_state_list) {
+    // Walk over each thread_entry in the epoch_state_list and compute the minimum epoch.
+    for (ebpf_list_entry_t* entry = cpu_entry->epoch_state_list.Flink; entry != &cpu_entry->epoch_state_list;
+         entry = entry->Flink) {
        epoch_state = CONTAINING_RECORD(entry, ebpf_epoch_state_t, epoch_list_entry);
        minimum_epoch = min(minimum_epoch, epoch_state->epoch);
-        entry = entry->Flink;
    }

    // Set the proposed release epoch to the minimum epoch seen so far.
    message->message.propose_epoch.proposed_release_epoch = minimum_epoch;

+    next_cpu = _ebpf_epoch_next_active_cpu(current_cpu);
+
    // If this is the last CPU, then send a message to the first CPU to commit the release epoch.
-    if (current_cpu == _ebpf_epoch_cpu_count - 1) {
+    if (next_cpu == 0) {
        message->message.commit_epoch.released_epoch = minimum_epoch;
        message->message_type = EBPF_EPOCH_CPU_MESSAGE_TYPE_COMMIT_RELEASE_EPOCH;
-        next_cpu = 0;
-    } else {
-        // Send the message to the next CPU.
-        next_cpu = current_cpu + 1;
    }

    _ebpf_epoch_send_message_async(message, next_cpu);
@@ -813,22 +883,41 @@ _ebpf_epoch_messenger_commit_release_epoch(
{
    uint32_t next_cpu;

+    // If any epoch_states are in EBPF_EPOCH_UNKNOWN_EPOCH, then activation of a CPU is in progress.
+    bool other_cpus_are_activating = (message->message.commit_epoch.released_epoch == EBPF_EPOCH_UNKNOWN_EPOCH);
+
+    // If this CPU is in EBPF_EPOCH_UNKNOWN_EPOCH, then activation of this CPU is in progress.
+    bool this_cpu_is_activating = (cpu_entry->current_epoch == EBPF_EPOCH_UNKNOWN_EPOCH);
+
    cpu_entry->timer_armed = false;
    // Set the released_epoch to the value computed by the EBPF_EPOCH_CPU_MESSAGE_TYPE_PROPOSE_RELEASE_EPOCH message.
    cpu_entry->released_epoch = message->message.commit_epoch.released_epoch - 1;

+    next_cpu = _ebpf_epoch_next_active_cpu(current_cpu);
+
    // If this is the last CPU, send the message to the first CPU to complete the cycle.
-    if (current_cpu != _ebpf_epoch_cpu_count - 1) {
-        // Send the message to the next CPU.
-        next_cpu = current_cpu + 1;
-    } else {
+    if (next_cpu == 0) {
        message->message_type = EBPF_EPOCH_CPU_MESSAGE_TYPE_PROPOSE_EPOCH_COMPLETE;
-        next_cpu = 0;
    }

    _ebpf_epoch_send_message_async(message, next_cpu);

+    // Wait for all the CPUs to transition to an active state.
+    if (other_cpus_are_activating || this_cpu_is_activating) {
+        // One or more CPUs are still activating. Rearm the timer and wait for the next message.
+        _ebpf_epoch_arm_timer_if_needed(cpu_entry);
+        return;
+    }
+
+    // All CPUs have transitioned to an active state and the epoch computation was successfully completed.
+    // Release any memory that is associated with expired epochs.
    _ebpf_epoch_release_free_list(cpu_entry, cpu_entry->released_epoch);
+
+    // Check if this CPU is idle and deactivate it if it is (CPU 0 is always active).
+    if ((current_cpu != 0) && ebpf_list_is_empty(&cpu_entry->free_list) &&
+        ebpf_list_is_empty(&cpu_entry->epoch_state_list)) {
+        _ebpf_epoch_deactivate_cpu(current_cpu);
+    }
}

/**
@@ -894,15 +983,13 @@ _ebpf_epoch_messenger_rundown_in_progress(
{
    uint32_t next_cpu;
    cpu_entry->rundown_in_progress = true;
+
+    next_cpu = _ebpf_epoch_next_active_cpu(current_cpu);
+
    // If this is the last CPU, then stop.
-    if (current_cpu != _ebpf_epoch_cpu_count - 1) {
-        // Send the message to the next CPU.
-        next_cpu = current_cpu + 1;
-    } else {
+    if (next_cpu == 0) {
        // Signal the caller that rundown is complete.
        KeSetEvent(&message->completion_event, 0, FALSE);
-        // Set next_cpu to UINT32_MAX to make code analysis happy.
-        next_cpu = UINT32_MAX;
        return;
    }

@@ -1028,3 +1115,84 @@ _ebpf_epoch_work_item_callback(_In_ cxplat_preemptible_work_item_t* preemptible_

    cxplat_release_rundown_protection(&_ebpf_epoch_work_item_rundown_ref);
}
+
+/**
+ * @brief Add the CPU to the next active CPU table.
+ *
+ * @param[in] cpu_id CPU to add.
+ */
+static void
+_ebpf_epoch_activate_cpu(uint32_t cpu_id)
+{
+    EBPF_LOG_ENTRY();
+
+    EBPF_LOG_MESSAGE_UINT64(EBPF_TRACELOG_LEVEL_INFO, EBPF_TRACELOG_KEYWORD_EPOCH, "Activating CPU", cpu_id);
+
+    ebpf_epoch_cpu_entry_t* cpu_entry = &_ebpf_epoch_cpu_table[cpu_id];
+    ebpf_lock_state_t state = ebpf_lock_lock(&_ebpf_epoch_cpu_active_lock);
+
+    cpu_entry->active = true;
+    // When the CPU is activated, the current epoch is not known.
+    // Memory freed before the current epoch is set will have its freed_epoch set to EBPF_EPOCH_UNKNOWN_EPOCH and
+    // will be updated once the current epoch is known (i.e., when the next epoch is proposed).
+    cpu_entry->current_epoch = EBPF_EPOCH_UNKNOWN_EPOCH;
+
+    if (!cpu_entry->work_queue_assigned) {
+        ebpf_result_t result = ebpf_timed_work_queue_set_cpu_id(cpu_entry->work_queue, cpu_id);
+        if (result != EBPF_SUCCESS) {
+            // This is a fatal error. The epoch system is in an inconsistent state.
+            __fastfail(FAST_FAIL_INVALID_ARG);
+        }
+        cpu_entry->work_queue_assigned = 1;
+    }
+
+    ebpf_lock_unlock(&_ebpf_epoch_cpu_active_lock, state);
+    EBPF_LOG_EXIT();
+}
+
+/**
+ * @brief Remove the CPU from the next active CPU table.
+ *
+ * @param[in] cpu_id CPU to remove.
+ */
+static void
+_ebpf_epoch_deactivate_cpu(uint32_t cpu_id)
+{
+    EBPF_LOG_ENTRY();
+
+    EBPF_LOG_MESSAGE_UINT64(EBPF_TRACELOG_LEVEL_INFO, EBPF_TRACELOG_KEYWORD_EPOCH, "Deactivating CPU", cpu_id);
+
+    ebpf_epoch_cpu_entry_t* cpu_entry = &_ebpf_epoch_cpu_table[cpu_id];
+    ebpf_lock_state_t state = ebpf_lock_lock(&_ebpf_epoch_cpu_active_lock);
+    cpu_entry->active = false;
+    ebpf_lock_unlock(&_ebpf_epoch_cpu_active_lock, state);
+
+    EBPF_LOG_EXIT();
+}
+
+/**
+ * @brief Given the current CPU, return the next active CPU.
+ *
+ * @param[in] cpu_id The current CPU.
+ * @return The next active CPU.
+ */
+uint32_t
+_ebpf_epoch_next_active_cpu(uint32_t cpu_id)
+{
+    uint32_t next_active_cpu;
+    ebpf_lock_state_t state = ebpf_lock_lock(&_ebpf_epoch_cpu_active_lock);
+
+    for (next_active_cpu = cpu_id + 1; next_active_cpu < _ebpf_epoch_cpu_count; next_active_cpu++) {
+        if (_ebpf_epoch_cpu_table[next_active_cpu].active) {
+            break;
+        }
+    }
+
+    if (next_active_cpu == _ebpf_epoch_cpu_count) {
+        next_active_cpu = 0;
+    }
+
+    ebpf_lock_unlock(&_ebpf_epoch_cpu_active_lock, state);
+
+    return next_active_cpu;
+}
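For illustration, here is a self-contained sketch (not part of the patch; the example_* names are hypothetical) of the same traversal over a plain bool array, showing the wrap-around to CPU 0 that the messenger routines above use to detect the end of a message cycle:

#include <stdbool.h>
#include <stdint.h>

// Return the next active CPU after cpu_id, wrapping to CPU 0 when none is found.
static uint32_t
example_next_active_cpu(const bool* active, uint32_t cpu_count, uint32_t cpu_id)
{
    uint32_t next = cpu_id + 1;
    while (next < cpu_count && !active[next]) {
        next++;
    }
    // Wrap to CPU 0 when no higher-numbered CPU is active; CPU 0 is always active.
    return (next == cpu_count) ? 0 : next;
}

// Example: with active = {true, false, false, true},
// example_next_active_cpu(active, 4, 0) == 3 and
// example_next_active_cpu(active, 4, 3) == 0, which ends the cycle.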