Skip to content

Commit ad15853

Browse files
committed
Tighten FUSE wait semantics
Correctness gaps remained in the guest-internal FUSE transport and the CLONE_THREAD path after the initial landing. Multi-model review flagged them; this change closes them. FUSE wait-path EINTR (src/syscall/fuse.c, src/syscall/signal.{c,h}): - A new helper signal_pending_interruption(restart_out) inspects every unblocked pending bit and reports whether the effective delivery is non-disruptive for every signal in the set. A signal is non-disruptive when its handler is SIG_IGN, when SIG_DFL resolves to default-ignore (SIGCHLD, SIGURG, SIGWINCH), or when a user handler has SA_RESTART set. Any other signal forces the caller to treat the wait as interrupted, so a SIGTERM hiding behind an ignored SIGCHLD cannot stay invisible to the caller. - fuse_request_locked detaches the request and returns -EINTR only when the deliverable set contains a disruptive signal. SA_RESTART and ignored signals let the wait continue until the daemon replies, matching the application-visible contract of those handlers and avoiding a useless FUSE_INTERRUPT for work the guest still wants. FUSE_FORGET reference-count integrity (src/syscall/fuse.c): - fuse_walk_path_locked drops the previous component's lookup hold on any error return so partial-walk failures (e.g. ENOENT on a deep component) no longer leak a reference per surviving prefix. - fuse_release_common_locked emits a compensating FUSE_FORGET on the O_PATH path. O_PATH opens skip FUSE_OPEN but still consume an nlookup during the walk; the prior early-return left that ref hanging on the daemon. - fuse_lookup_locked issues a single compensating FUSE_FORGET when the per-session ref table is full so the daemon's nlookup view stays balanced even when elfuse runs out of local capacity. - The per-session ref table cap rises from 256 to 4096 so realistic recursive walks no longer hit the compensating-FORGET path. CLONE_THREAD startup-readiness (src/runtime/forkipc.c): - sys_clone_thread waits on a thread_startup_t condvar until the worker reports current_thread publication or an explicit -EIO failure. The worker's HVF bring-up (hv_vcpu_create plus every sysreg, GPR, SIMD, and PC write) goes through a checked WORKER_HV macro, so a transient HVF error rolls back instead of aborting the process via HV_CHECK. - The startup_failed cleanup path drops the thread slot before destroying the vCPU, so a concurrent thread_interrupt_all cannot observe a slot whose t->vcpu has just been cleared. - Both failure paths (pthread_create EAGAIN and post-handshake -EIO) roll back PARENT_SETTID and CHILD_SETTID guest writes so the caller never observes a live-looking TID for a thread that never started.
1 parent 0cfb9cd commit ad15853

5 files changed

Lines changed: 648 additions & 65 deletions

File tree

src/runtime/forkipc.c

Lines changed: 138 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -345,9 +345,17 @@ int fork_child_main(int ipc_fd,
345345
/* Arguments passed to the worker pthread. Allocated by sys_clone_thread, freed
346346
* by the worker after vCPU creation and register setup.
347347
*/
348+
typedef struct {
349+
pthread_mutex_t lock;
350+
pthread_cond_t cond;
351+
bool ready;
352+
int startup_rc;
353+
} thread_startup_t;
354+
348355
typedef struct {
349356
thread_entry_t *thread;
350357
guest_t *guest;
358+
thread_startup_t *startup;
351359
bool verbose;
352360
uint64_t child_stack, flags, tls;
353361
/* Parent system regs to copy into the new vCPU */
@@ -398,6 +406,11 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
398406
uint64_t ctid_gva,
399407
bool verbose)
400408
{
409+
thread_startup_t startup = {
410+
.lock = PTHREAD_MUTEX_INITIALIZER,
411+
.cond = PTHREAD_COND_INITIALIZER,
412+
};
413+
401414
/* Allocate guest TID */
402415
int64_t child_tid = proc_alloc_pid();
403416

@@ -446,11 +459,14 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
446459
thread_create_args_t *tca = calloc(1, sizeof(thread_create_args_t));
447460
if (!tca) {
448461
thread_deactivate(t);
462+
pthread_cond_destroy(&startup.cond);
463+
pthread_mutex_destroy(&startup.lock);
449464
return -LINUX_ENOMEM;
450465
}
451466

452467
tca->thread = t;
453468
tca->guest = g;
469+
tca->startup = &startup;
454470
tca->verbose = verbose;
455471
tca->child_stack = child_stack;
456472
tca->flags = flags;
@@ -474,6 +490,8 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
474490
if (guest_write_small(g, ptid_gva, &tid32, sizeof(tid32)) < 0) {
475491
free(tca);
476492
thread_deactivate(t);
493+
pthread_cond_destroy(&startup.cond);
494+
pthread_mutex_destroy(&startup.lock);
477495
return -LINUX_EFAULT;
478496
}
479497
}
@@ -491,6 +509,8 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
491509
if (guest_write_small(g, ctid_gva, &tid32, sizeof(tid32)) < 0) {
492510
free(tca);
493511
thread_deactivate(t);
512+
pthread_cond_destroy(&startup.cond);
513+
pthread_mutex_destroy(&startup.lock);
494514
return -LINUX_EFAULT;
495515
}
496516
}
@@ -510,11 +530,49 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
510530
log_error("clone_thread: pthread_create failed: %s", strerror(err));
511531
free(tca);
512532
thread_deactivate(t);
533+
pthread_cond_destroy(&startup.cond);
534+
pthread_mutex_destroy(&startup.lock);
535+
/* Roll back any SETTID writes done before pthread_create. Same
536+
* rationale as the post-handshake failure path: clone(2) does not
537+
* leave live-looking TIDs behind for a thread that never started.
538+
*/
539+
if (flags & LINUX_CLONE_PARENT_SETTID) {
540+
int32_t zero = 0;
541+
(void) guest_write_small(g, ptid_gva, &zero, sizeof(zero));
542+
}
543+
if (flags & LINUX_CLONE_CHILD_SETTID) {
544+
int32_t zero = 0;
545+
(void) guest_write_small(g, ctid_gva, &zero, sizeof(zero));
546+
}
513547
return -LINUX_EAGAIN;
514548
}
515549

516550
t->host_thread = host_thread;
517551

552+
pthread_mutex_lock(&startup.lock);
553+
while (!startup.ready)
554+
pthread_cond_wait(&startup.cond, &startup.lock);
555+
pthread_mutex_unlock(&startup.lock);
556+
pthread_cond_destroy(&startup.cond);
557+
pthread_mutex_destroy(&startup.lock);
558+
if (startup.startup_rc < 0) {
559+
/* Worker failed during HVF bring-up after the SETTID writes had
560+
* already populated the guest TID slots. Linux clone(2) does not
561+
* leave a live-looking TID behind for a thread that never started,
562+
* so zero the slots before the parent sees the error.
563+
*/
564+
pthread_join(host_thread, NULL);
565+
if (flags & LINUX_CLONE_PARENT_SETTID) {
566+
int32_t zero = 0;
567+
(void) guest_write_small(g, ptid_gva, &zero, sizeof(zero));
568+
}
569+
if (flags & LINUX_CLONE_CHILD_SETTID) {
570+
int32_t zero = 0;
571+
(void) guest_write_small(g, ctid_gva, &zero, sizeof(zero));
572+
}
573+
return startup.startup_rc;
574+
}
575+
518576
log_debug("clone_thread: child tid=%lld created", (long long) child_tid);
519577

520578
return child_tid;
@@ -530,6 +588,7 @@ static void *thread_create_and_run(void *arg)
530588
thread_create_args_t *tca = (thread_create_args_t *) arg;
531589
thread_entry_t *t = tca->thread;
532590
guest_t *g = tca->guest;
591+
thread_startup_t *startup = tca->startup;
533592

534593
/* Create vCPU on THIS thread (HVF requirement) */
535594
hv_vcpu_t vcpu;
@@ -538,6 +597,11 @@ static void *thread_create_and_run(void *arg)
538597
if (r != HV_SUCCESS) {
539598
log_error("thread tid=%lld: hv_vcpu_create failed: %d",
540599
(long long) t->guest_tid, (int) r);
600+
pthread_mutex_lock(&startup->lock);
601+
startup->startup_rc = -LINUX_EIO;
602+
startup->ready = true;
603+
pthread_cond_broadcast(&startup->cond);
604+
pthread_mutex_unlock(&startup->lock);
541605
free(tca);
542606
thread_deactivate(t);
543607
return NULL;
@@ -546,50 +610,105 @@ static void *thread_create_and_run(void *arg)
546610
t->vcpu = vcpu;
547611
t->vexit = vexit;
548612

613+
/* Sysreg setup uses checked calls instead of HV_CHECK so the parent's
614+
* startup handshake can roll back cleanly rather than tearing down the
615+
* whole process on a transient HVF failure here.
616+
*/
617+
#define WORKER_HV(call) \
618+
do { \
619+
hv_return_t _r = (call); \
620+
if (_r != HV_SUCCESS) { \
621+
log_error("thread tid=%lld: %s failed: %d", \
622+
(long long) t->guest_tid, #call, (int) _r); \
623+
goto startup_failed; \
624+
} \
625+
} while (0)
626+
549627
/* Copy system registers from parent (shared page tables, same MMU config)
550628
*/
551-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_VBAR_EL1, tca->vbar));
552-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, tca->mair));
553-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, tca->tcr));
554-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, tca->ttbr0));
555-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CPACR_EL1, tca->cpacr));
629+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_VBAR_EL1, tca->vbar));
630+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, tca->mair));
631+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, tca->tcr));
632+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, tca->ttbr0));
633+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CPACR_EL1, tca->cpacr));
556634

557635
/* MMU already on, so set SCTLR with M=1 directly (page tables exist) */
558-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, tca->sctlr));
636+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, tca->sctlr));
559637

560638
/* Per-thread SP_EL1 (each vCPU needs its own EL1 exception stack) */
561-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, tca->sp_el1));
639+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, tca->sp_el1));
562640

563641
/* SP_EL0 = child_stack (provided by clone caller) */
564-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, tca->child_stack));
642+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, tca->child_stack));
565643

566644
/* TPIDR_EL0 = thread-local storage pointer (if CLONE_SETTLS) */
567645
if (tca->flags & LINUX_CLONE_SETTLS) {
568-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, tca->tls));
646+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, tca->tls));
569647
} else {
570-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, tca->tpidr));
648+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, tca->tpidr));
571649
}
572650

573651
/* ELR_EL1 = clone return point (same as parent) */
574-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, tca->elr));
575-
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, tca->spsr));
576-
577-
/* Copy all 31 GPRs from parent, then set X0=0 (child clone return) */
578-
vcpu_restore_gprs(vcpu, tca->gprs);
579-
vcpu_set_gpr(vcpu, 0, 0);
652+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, tca->elr));
653+
WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, tca->spsr));
654+
655+
/* Copy all 31 GPRs from parent, then set X0=0 (child clone return).
656+
* The vcpu_restore_gprs / vcpu_restore_simd helpers in hvutil.h abort
657+
* the whole process on failure via HV_CHECK, which would defeat the
658+
* handshake rollback. Open-code the restore here so transient HVF
659+
* failures fall into the same startup_failed path as the sysreg writes.
660+
*/
661+
for (unsigned i = 0; i < 31; i++)
662+
WORKER_HV(hv_vcpu_set_reg(vcpu, HV_REG_X0 + i, tca->gprs[i]));
663+
WORKER_HV(hv_vcpu_set_reg(vcpu, HV_REG_X0, 0));
580664

581-
vcpu_restore_simd(vcpu, &tca->simd_state);
665+
for (int i = 0; i < 32; i++)
666+
WORKER_HV(hv_vcpu_set_simd_fp_reg(vcpu, HV_SIMD_FP_REG_Q0 + i,
667+
tca->simd_state.v[i]));
668+
WORKER_HV(hv_vcpu_set_reg(vcpu, HV_REG_FPSR, tca->simd_state.fpsr));
669+
WORKER_HV(hv_vcpu_set_reg(vcpu, HV_REG_FPCR, tca->simd_state.fpcr));
582670

583671
/* Start at clone return point in EL0 (not shim entry) */
584-
HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_PC, tca->elr));
585-
HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_CPSR, 0)); /* EL0t */
672+
WORKER_HV(hv_vcpu_set_reg(vcpu, HV_REG_PC, tca->elr));
673+
WORKER_HV(hv_vcpu_set_reg(vcpu, HV_REG_CPSR, 0)); /* EL0t */
674+
#undef WORKER_HV
586675

587676
bool verbose = tca->verbose;
588677
free(tca);
589678

590679
/* Set per-thread TLS pointer and enter worker run loop */
591680
current_thread = t;
592681

682+
pthread_mutex_lock(&startup->lock);
683+
startup->startup_rc = 0;
684+
startup->ready = true;
685+
pthread_cond_broadcast(&startup->cond);
686+
pthread_mutex_unlock(&startup->lock);
687+
688+
goto startup_ok;
689+
690+
startup_failed:
691+
/* HVF sysreg/GPR setup failed after vCPU creation. Drop the thread slot
692+
* before tearing the vCPU down: thread_interrupt_all() scans the active
693+
* set and calls hv_vcpus_exit() on each t->vcpu without a null check, so
694+
* clearing t->vcpu while the slot is still active would let a concurrent
695+
* exit_group hand a zero handle to HVF. Deactivating first removes the
696+
* slot from iteration. Then destroy the vCPU on its owning thread (the
697+
* only thread allowed to do so), free args, and finally signal the
698+
* parent so it observes a fully torn-down state.
699+
*/
700+
thread_deactivate(t);
701+
hv_vcpu_destroy(vcpu);
702+
free(tca);
703+
pthread_mutex_lock(&startup->lock);
704+
startup->startup_rc = -LINUX_EIO;
705+
startup->ready = true;
706+
pthread_cond_broadcast(&startup->cond);
707+
pthread_mutex_unlock(&startup->lock);
708+
return NULL;
709+
710+
startup_ok:;
711+
593712
log_debug("thread tid=%lld starting on vCPU", (long long) t->guest_tid);
594713

595714
vcpu_run_loop(vcpu, vexit, g, verbose, 0);

0 commit comments

Comments
 (0)