@@ -345,9 +345,17 @@ int fork_child_main(int ipc_fd,
345345/* Arguments passed to the worker pthread. Allocated by sys_clone_thread, freed
346346 * by the worker after vCPU creation and register setup.
347347 */
348+ typedef struct {
349+ pthread_mutex_t lock ;
350+ pthread_cond_t cond ;
351+ bool ready ;
352+ int startup_rc ;
353+ } thread_startup_t ;
354+
348355typedef struct {
349356 thread_entry_t * thread ;
350357 guest_t * guest ;
358+ thread_startup_t * startup ;
351359 bool verbose ;
352360 uint64_t child_stack , flags , tls ;
353361 /* Parent system regs to copy into the new vCPU */
@@ -398,6 +406,11 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
398406 uint64_t ctid_gva ,
399407 bool verbose )
400408{
409+ thread_startup_t startup = {
410+ .lock = PTHREAD_MUTEX_INITIALIZER ,
411+ .cond = PTHREAD_COND_INITIALIZER ,
412+ };
413+
401414 /* Allocate guest TID */
402415 int64_t child_tid = proc_alloc_pid ();
403416
@@ -446,11 +459,14 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
446459 thread_create_args_t * tca = calloc (1 , sizeof (thread_create_args_t ));
447460 if (!tca ) {
448461 thread_deactivate (t );
462+ pthread_cond_destroy (& startup .cond );
463+ pthread_mutex_destroy (& startup .lock );
449464 return - LINUX_ENOMEM ;
450465 }
451466
452467 tca -> thread = t ;
453468 tca -> guest = g ;
469+ tca -> startup = & startup ;
454470 tca -> verbose = verbose ;
455471 tca -> child_stack = child_stack ;
456472 tca -> flags = flags ;
@@ -474,6 +490,8 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
474490 if (guest_write_small (g , ptid_gva , & tid32 , sizeof (tid32 )) < 0 ) {
475491 free (tca );
476492 thread_deactivate (t );
493+ pthread_cond_destroy (& startup .cond );
494+ pthread_mutex_destroy (& startup .lock );
477495 return - LINUX_EFAULT ;
478496 }
479497 }
@@ -491,6 +509,8 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
491509 if (guest_write_small (g , ctid_gva , & tid32 , sizeof (tid32 )) < 0 ) {
492510 free (tca );
493511 thread_deactivate (t );
512+ pthread_cond_destroy (& startup .cond );
513+ pthread_mutex_destroy (& startup .lock );
494514 return - LINUX_EFAULT ;
495515 }
496516 }
@@ -510,11 +530,49 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
510530 log_error ("clone_thread: pthread_create failed: %s" , strerror (err ));
511531 free (tca );
512532 thread_deactivate (t );
533+ pthread_cond_destroy (& startup .cond );
534+ pthread_mutex_destroy (& startup .lock );
535+ /* Roll back any SETTID writes done before pthread_create. Same
536+ * rationale as the post-handshake failure path: clone(2) does not
537+ * leave live-looking TIDs behind for a thread that never started.
538+ */
539+ if (flags & LINUX_CLONE_PARENT_SETTID ) {
540+ int32_t zero = 0 ;
541+ (void ) guest_write_small (g , ptid_gva , & zero , sizeof (zero ));
542+ }
543+ if (flags & LINUX_CLONE_CHILD_SETTID ) {
544+ int32_t zero = 0 ;
545+ (void ) guest_write_small (g , ctid_gva , & zero , sizeof (zero ));
546+ }
513547 return - LINUX_EAGAIN ;
514548 }
515549
516550 t -> host_thread = host_thread ;
517551
552+ pthread_mutex_lock (& startup .lock );
553+ while (!startup .ready )
554+ pthread_cond_wait (& startup .cond , & startup .lock );
555+ pthread_mutex_unlock (& startup .lock );
556+ pthread_cond_destroy (& startup .cond );
557+ pthread_mutex_destroy (& startup .lock );
558+ if (startup .startup_rc < 0 ) {
559+ /* Worker failed during HVF bring-up after the SETTID writes had
560+ * already populated the guest TID slots. Linux clone(2) does not
561+ * leave a live-looking TID behind for a thread that never started,
562+ * so zero the slots before the parent sees the error.
563+ */
564+ pthread_join (host_thread , NULL );
565+ if (flags & LINUX_CLONE_PARENT_SETTID ) {
566+ int32_t zero = 0 ;
567+ (void ) guest_write_small (g , ptid_gva , & zero , sizeof (zero ));
568+ }
569+ if (flags & LINUX_CLONE_CHILD_SETTID ) {
570+ int32_t zero = 0 ;
571+ (void ) guest_write_small (g , ctid_gva , & zero , sizeof (zero ));
572+ }
573+ return startup .startup_rc ;
574+ }
575+
518576 log_debug ("clone_thread: child tid=%lld created" , (long long ) child_tid );
519577
520578 return child_tid ;
@@ -530,6 +588,7 @@ static void *thread_create_and_run(void *arg)
530588 thread_create_args_t * tca = (thread_create_args_t * ) arg ;
531589 thread_entry_t * t = tca -> thread ;
532590 guest_t * g = tca -> guest ;
591+ thread_startup_t * startup = tca -> startup ;
533592
534593 /* Create vCPU on THIS thread (HVF requirement) */
535594 hv_vcpu_t vcpu ;
@@ -538,6 +597,11 @@ static void *thread_create_and_run(void *arg)
538597 if (r != HV_SUCCESS ) {
539598 log_error ("thread tid=%lld: hv_vcpu_create failed: %d" ,
540599 (long long ) t -> guest_tid , (int ) r );
600+ pthread_mutex_lock (& startup -> lock );
601+ startup -> startup_rc = - LINUX_EIO ;
602+ startup -> ready = true;
603+ pthread_cond_broadcast (& startup -> cond );
604+ pthread_mutex_unlock (& startup -> lock );
541605 free (tca );
542606 thread_deactivate (t );
543607 return NULL ;
@@ -546,50 +610,105 @@ static void *thread_create_and_run(void *arg)
546610 t -> vcpu = vcpu ;
547611 t -> vexit = vexit ;
548612
613+ /* Sysreg setup uses checked calls instead of HV_CHECK so the parent's
614+ * startup handshake can roll back cleanly rather than tearing down the
615+ * whole process on a transient HVF failure here.
616+ */
617+ #define WORKER_HV (call ) \
618+ do { \
619+ hv_return_t _r = (call); \
620+ if (_r != HV_SUCCESS) { \
621+ log_error("thread tid=%lld: %s failed: %d", \
622+ (long long) t->guest_tid, #call, (int) _r); \
623+ goto startup_failed; \
624+ } \
625+ } while (0)
626+
549627 /* Copy system registers from parent (shared page tables, same MMU config)
550628 */
551- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_VBAR_EL1 , tca -> vbar ));
552- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_MAIR_EL1 , tca -> mair ));
553- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_TCR_EL1 , tca -> tcr ));
554- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_TTBR0_EL1 , tca -> ttbr0 ));
555- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_CPACR_EL1 , tca -> cpacr ));
629+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_VBAR_EL1 , tca -> vbar ));
630+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_MAIR_EL1 , tca -> mair ));
631+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_TCR_EL1 , tca -> tcr ));
632+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_TTBR0_EL1 , tca -> ttbr0 ));
633+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_CPACR_EL1 , tca -> cpacr ));
556634
557635 /* MMU already on, so set SCTLR with M=1 directly (page tables exist) */
558- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_SCTLR_EL1 , tca -> sctlr ));
636+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_SCTLR_EL1 , tca -> sctlr ));
559637
560638 /* Per-thread SP_EL1 (each vCPU needs its own EL1 exception stack) */
561- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_SP_EL1 , tca -> sp_el1 ));
639+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_SP_EL1 , tca -> sp_el1 ));
562640
563641 /* SP_EL0 = child_stack (provided by clone caller) */
564- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_SP_EL0 , tca -> child_stack ));
642+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_SP_EL0 , tca -> child_stack ));
565643
566644 /* TPIDR_EL0 = thread-local storage pointer (if CLONE_SETTLS) */
567645 if (tca -> flags & LINUX_CLONE_SETTLS ) {
568- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_TPIDR_EL0 , tca -> tls ));
646+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_TPIDR_EL0 , tca -> tls ));
569647 } else {
570- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_TPIDR_EL0 , tca -> tpidr ));
648+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_TPIDR_EL0 , tca -> tpidr ));
571649 }
572650
573651 /* ELR_EL1 = clone return point (same as parent) */
574- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_ELR_EL1 , tca -> elr ));
575- HV_CHECK (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_SPSR_EL1 , tca -> spsr ));
576-
577- /* Copy all 31 GPRs from parent, then set X0=0 (child clone return) */
578- vcpu_restore_gprs (vcpu , tca -> gprs );
579- vcpu_set_gpr (vcpu , 0 , 0 );
652+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_ELR_EL1 , tca -> elr ));
653+ WORKER_HV (hv_vcpu_set_sys_reg (vcpu , HV_SYS_REG_SPSR_EL1 , tca -> spsr ));
654+
655+ /* Copy all 31 GPRs from parent, then set X0=0 (child clone return).
656+ * The vcpu_restore_gprs / vcpu_restore_simd helpers in hvutil.h abort
657+ * the whole process on failure via HV_CHECK, which would defeat the
658+ * handshake rollback. Open-code the restore here so transient HVF
659+ * failures fall into the same startup_failed path as the sysreg writes.
660+ */
661+ for (unsigned i = 0 ; i < 31 ; i ++ )
662+ WORKER_HV (hv_vcpu_set_reg (vcpu , HV_REG_X0 + i , tca -> gprs [i ]));
663+ WORKER_HV (hv_vcpu_set_reg (vcpu , HV_REG_X0 , 0 ));
580664
581- vcpu_restore_simd (vcpu , & tca -> simd_state );
665+ for (int i = 0 ; i < 32 ; i ++ )
666+ WORKER_HV (hv_vcpu_set_simd_fp_reg (vcpu , HV_SIMD_FP_REG_Q0 + i ,
667+ tca -> simd_state .v [i ]));
668+ WORKER_HV (hv_vcpu_set_reg (vcpu , HV_REG_FPSR , tca -> simd_state .fpsr ));
669+ WORKER_HV (hv_vcpu_set_reg (vcpu , HV_REG_FPCR , tca -> simd_state .fpcr ));
582670
583671 /* Start at clone return point in EL0 (not shim entry) */
584- HV_CHECK (hv_vcpu_set_reg (vcpu , HV_REG_PC , tca -> elr ));
585- HV_CHECK (hv_vcpu_set_reg (vcpu , HV_REG_CPSR , 0 )); /* EL0t */
672+ WORKER_HV (hv_vcpu_set_reg (vcpu , HV_REG_PC , tca -> elr ));
673+ WORKER_HV (hv_vcpu_set_reg (vcpu , HV_REG_CPSR , 0 )); /* EL0t */
674+ #undef WORKER_HV
586675
587676 bool verbose = tca -> verbose ;
588677 free (tca );
589678
590679 /* Set per-thread TLS pointer and enter worker run loop */
591680 current_thread = t ;
592681
682+ pthread_mutex_lock (& startup -> lock );
683+ startup -> startup_rc = 0 ;
684+ startup -> ready = true;
685+ pthread_cond_broadcast (& startup -> cond );
686+ pthread_mutex_unlock (& startup -> lock );
687+
688+ goto startup_ok ;
689+
690+ startup_failed :
691+ /* HVF sysreg/GPR setup failed after vCPU creation. Drop the thread slot
692+ * before tearing the vCPU down: thread_interrupt_all() scans the active
693+ * set and calls hv_vcpus_exit() on each t->vcpu without a null check, so
694+ * clearing t->vcpu while the slot is still active would let a concurrent
695+ * exit_group hand a zero handle to HVF. Deactivating first removes the
696+ * slot from iteration. Then destroy the vCPU on its owning thread (the
697+ * only thread allowed to do so), free args, and finally signal the
698+ * parent so it observes a fully torn-down state.
699+ */
700+ thread_deactivate (t );
701+ hv_vcpu_destroy (vcpu );
702+ free (tca );
703+ pthread_mutex_lock (& startup -> lock );
704+ startup -> startup_rc = - LINUX_EIO ;
705+ startup -> ready = true;
706+ pthread_cond_broadcast (& startup -> cond );
707+ pthread_mutex_unlock (& startup -> lock );
708+ return NULL ;
709+
710+ startup_ok :;
711+
593712 log_debug ("thread tid=%lld starting on vCPU" , (long long ) t -> guest_tid );
594713
595714 vcpu_run_loop (vcpu , vexit , g , verbose , 0 );
0 commit comments