11use chrono:: { DateTime , Local } ;
2+ use itertools:: Either ;
23use miette:: { Diagnostic , Result , miette} ;
34use opencue_proto:: {
45 host:: HardwareState ,
@@ -64,17 +65,14 @@ impl FrameManager {
6465
6566 // **Attention**: If an error happens between here and spawning a frame, the resources
6667 // reserved need to be released.
67- //
68- // Cuebot unfortunatelly uses a hardcoded frame environment variable to signal if
69- // a frame is hyperthreaded. Rqd should only reserve cores if a frame is hyperthreaded.
70- let hyperthreaded = run_frame
71- . environment
72- . get ( "CUE_THREADABLE" )
73- . is_some_and ( |v| v == "1" ) ;
68+
7469 let num_cores = ( run_frame. num_cores as u32 ) . div_ceil ( self . config . machine . core_multiplier ) ;
70+
71+ // Reserving cores will always yield a list of reserved thread_ids. If hyperthreading is off,
72+ // the list should be ignored
7573 let thread_ids = self
7674 . machine
77- . reserve_cores ( num_cores as usize , run_frame. resource_id ( ) , hyperthreaded )
75+ . reserve_cores ( Either :: Left ( num_cores as usize ) , run_frame. resource_id ( ) )
7876 . await
7977 . map_err ( |err| {
8078 FrameManagerError :: Aborted ( format ! (
@@ -90,7 +88,13 @@ impl FrameManager {
9088 let reserved_res = self . machine . reserve_gpus ( run_frame. num_gpus as u32 ) . await ;
9189 if reserved_res. is_err ( ) {
9290 // Release cores reserved on the last step
93- self . machine . release_cores ( num_cores, & thread_ids) . await ;
91+ if let Err ( err) = self . machine . release_cores ( & run_frame. resource_id ( ) ) . await {
92+ warn ! (
93+ "Failed to release cores reserved for {} during gpu reservation failure. {}" ,
94+ & run_frame. resource_id( ) ,
95+ err
96+ )
97+ } ;
9498 }
9599 Some ( reserved_res. map_err ( |err| {
96100 FrameManagerError :: Aborted ( format ! (
@@ -101,20 +105,35 @@ impl FrameManager {
101105 }
102106 } ;
103107
108+ // Cuebot unfortunately uses a hardcoded frame environment variable to signal if
109+ // a frame is hyperthreaded. Rqd should only reserve cores if a frame is hyperthreaded.
110+ let hyperthreaded = run_frame
111+ . environment
112+ . get ( "CUE_THREADABLE" )
113+ . is_some_and ( |v| v == "1" ) ;
114+ // Ignore the list of allocated threads if hyperthreading is off
115+ let thread_ids = hyperthreaded. then_some ( thread_ids) ;
116+
117+ let resource_id = run_frame. resource_id ( ) ;
104118 let running_frame = Arc :: new ( RunningFrame :: init (
105119 run_frame,
106120 uid,
107121 self . config . runner . clone ( ) ,
108- thread_ids. clone ( ) ,
122+ thread_ids,
109123 gpu_list,
110124 self . machine . get_host_name ( ) . await ,
111125 ) ) ;
112126
113127 if self . config . runner . run_on_docker {
114128 self . spawn_docker_frame ( running_frame, false ) ;
115129 } else if self . spawn_running_frame ( running_frame, false ) . is_err ( ) {
116- // Release cores reserved on the last step
117- self . machine . release_cores ( num_cores, & thread_ids) . await ;
130+ // Release cores reserved if spawning the frame failed
131+ if let Err ( err) = self . machine . release_cores ( & resource_id) . await {
132+ warn ! (
133+ "Failed to release cores reserved for {} during spawn failure. {}" ,
134+ & resource_id, err
135+ ) ;
136+ }
118137 }
119138
120139 Ok ( ( ) )
@@ -161,33 +180,38 @@ impl FrameManager {
161180 Ok ( running_frame) => {
162181 // Update reservations. If a thread_ids list exists, the frame was booked using affinity
163182 if let Err ( err) = match & running_frame. thread_ids {
164- Some ( thread_ids) => self
165- . machine
166- . reserve_cores_by_id ( thread_ids, running_frame. request . resource_id ( ) )
167- . await
168- . map ( Some ) ,
183+ Some ( thread_ids) => {
184+ self . machine
185+ . reserve_cores (
186+ Either :: Right ( thread_ids. clone ( ) ) ,
187+ running_frame. request . resource_id ( ) ,
188+ )
189+ . await
190+ }
169191 None => {
170192 let num_cores = ( running_frame. request . num_cores as u32 )
171193 . div_ceil ( self . config . machine . core_multiplier ) ;
172194 self . machine
173195 . reserve_cores (
174- num_cores as usize ,
196+ Either :: Left ( num_cores as usize ) ,
175197 running_frame. request . resource_id ( ) ,
176- false ,
177198 )
178199 . await
179200 }
180201 } {
181202 errors. push ( err. to_string ( ) ) ;
182203 }
183204
184- let num_cores = ( running_frame. request . num_cores as u32 )
185- . div_ceil ( self . config . machine . core_multiplier ) ;
186- let thread_ids = & running_frame. thread_ids . clone ( ) ;
205+ let resource_id = running_frame. request . resource_id ( ) ;
187206 if self . config . runner . run_on_docker {
188207 todo ! ( "Recovering frames when running on docker is not yet supported" )
189208 } else if self . spawn_running_frame ( running_frame, true ) . is_err ( ) {
190- self . machine . release_cores ( num_cores, thread_ids) . await ;
209+ if let Err ( err) = self . machine . release_cores ( & resource_id) . await {
210+ warn ! (
211+ "Failed to release cores reserved for {} during recover spawn error. {}" ,
212+ & resource_id, err
213+ ) ;
214+ }
191215 }
192216 }
193217 Err ( err) => {
0 commit comments