Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,12 @@ $(BUILD_DIR)/test-scm-creds: tests/test-scm-creds.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-fault-signal-mt spawns pthreads that each take recoverable SIGSEGVs to
# stress synchronous-fault delivery routing in a multi-threaded guest.
$(BUILD_DIR)/test-fault-signal-mt: tests/test-fault-signal-mt.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-shim-cred-race spawns a pthread reader while the main thread
# toggles setresuid; the reader spins on the identity fast path.
$(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR)
Expand Down
32 changes: 23 additions & 9 deletions src/syscall/proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1454,8 +1454,9 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
uint64_t esr;
hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_ESR_EL1, &esr);
signal_set_fault_info(LINUX_SEGV_ACCERR, far, esr);
signal_queue(LINUX_SIGSEGV);
int sig_ret = signal_deliver(vcpu, g, &exit_code);
int sig_ret =
signal_deliver_fault(vcpu, g, LINUX_SIGSEGV,
&exit_code);
if (sig_ret < 0)
running = false;
break;
Expand Down Expand Up @@ -1538,7 +1539,6 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_ESR_EL1, &brk_esr);
signal_set_fault_info(LINUX_TRAP_BRKPT, brk_pc,
brk_esr);
signal_queue(LINUX_SIGTRAP);
if (verbose) {
uint64_t thread_blocked =
current_thread ? current_thread->blocked
Expand All @@ -1550,7 +1550,9 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
(unsigned long long) signal_get_state()
->pending);
}
int sig_ret = signal_deliver(vcpu, g, &exit_code);
int sig_ret =
signal_deliver_fault(vcpu, g, LINUX_SIGTRAP,
&exit_code);
if (verbose)
log_debug("%s: signal_deliver returned %d", prefix,
sig_ret);
Expand Down Expand Up @@ -1612,8 +1614,9 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
prefix, (unsigned long long) elr_addr,
(unsigned long long) esr, fault_ec);
signal_set_fault_info(LINUX_ILL_ILLOPC, elr_addr, esr);
signal_queue(LINUX_SIGILL);
int sig_ret = signal_deliver(vcpu, g, &exit_code);
int sig_ret =
signal_deliver_fault(vcpu, g, LINUX_SIGILL,
&exit_code);
/* HVC #11 consumes X8 as the post-fault TLBI opcode.
* signal_deliver_fault() may leave it unchanged when no
* handler is materialized, or set the syscall-path
Expand Down Expand Up @@ -1681,8 +1684,8 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
(unsigned long long) esr, fsc, code_name);
}
signal_set_fault_info(si_code, far_addr, esr);
signal_queue(LINUX_SIGSEGV);
int sig_ret = signal_deliver(vcpu, g, &exit_code);
int sig_ret =
signal_deliver_fault(vcpu, g, LINUX_SIGSEGV, &exit_code);
/* HVC #11 consumes X8 as the post-fault TLBI opcode.
* signal_deliver_fault() may leave it unchanged when no
* handler is materialized, or set the syscall-path
Expand Down Expand Up @@ -1941,11 +1944,22 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
exit_code = 128;
running = false;
}
} else if (vexit->reason == HV_EXIT_REASON_CANCELED) {
} else if (vexit->reason == HV_EXIT_REASON_CANCELED ||
vexit->reason == HV_EXIT_REASON_UNKNOWN) {
/* Canceled by hv_vcpus_exit(). Can be: alarm timeout,
* exit_group from another thread, or signal preemption
* (signal_queue called hv_vcpus_exit to deliver a signal
* while the guest was in a tight loop).
*
* HV_EXIT_REASON_UNKNOWN is the same event seen from the other
* side of a race: when a host signal (e.g. the SIGUSR2 used by the
* cross-process guest-signal transport) is delivered to this thread
* while it is actively executing guest code inside hv_vcpu_run, the
* run aborts with UNKNOWN instead of the clean CANCELED that
* hv_vcpus_exit() produces for a vCPU caught between runs. The
* pending guest signal has already been drained and queued, so it
* is fully deliverable -- fall through to the same handling and
* resume rather than treating it as a fatal unexpected exit.
*/
if (is_main && g_timed_out) {
/* Timeout already handled above the exception switch --
Expand Down
131 changes: 101 additions & 30 deletions src/syscall/signal.c
Original file line number Diff line number Diff line change
Expand Up @@ -1317,37 +1317,27 @@ static void build_sigcontext_reserved(uint8_t *reserved,
memset(reserved + off, 0, 8);
}

int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)
/* Build and install the rt_sigframe for `signum` on the current thread, with
* sig_lock held on entry and released on every return path. Shared by
* signal_deliver() (signal selected from the process-wide pending set) and
* signal_deliver_fault() (synchronous fault forced onto the faulting thread).
* rt_info supplies si_code/si_pid/sigval when no thread-local pending_fault is
* set; the pending_fault is consumed (one-shot) when valid. Returns 1 if a
* handler frame was installed, 0 if the signal was ignored, and -1 (with
* *exit_code set) when the default disposition terminates the guest.
*/
static int deliver_signal_locked(hv_vcpu_t vcpu,
guest_t *g,
int signum,
signal_rt_info_t rt_info,
int *exit_code)
{
pthread_mutex_lock(&sig_lock);
uint64_t *blocked = thread_blocked_ptr();
uint64_t *saved_ptr = thread_saved_blocked_ptr();
bool *valid_ptr = thread_saved_valid_ptr();
uint64_t deliverable = sig_state.pending & ~*blocked;
if (deliverable == 0) {
pthread_mutex_unlock(&sig_lock);
return 0;
}

/* Find lowest pending unblocked signal */
int signum = bit_ctz64(deliverable) + 1;
signal_rt_info_t rt_info = signal_default_info(signum);

/* Dequeue: for RT signals, decrement count and only clear the
* pending bit when the queue is empty. Standard signals are
* always cleared (single instance, bitmask semantics).
*/
if (signum >= LINUX_SIGRTMIN) {
signal_rt_dequeue_locked(signum, &rt_info);
} else {
rt_info = signal_standard_peek_locked(signum);
sig_state.std_info_valid[signum - 1] = false;
sig_state.pending &= ~sig_bit(signum);
}

/* signum is bit_ctz64(deliverable) + 1, bounded 1..64 by the 64-bit
* pending mask. The static analyzer cannot see the bound, so gate the
* array access defensively.
/* signum is 1..64 from the caller; the static analyzer cannot see the
* bound, so gate the array access defensively.
*/
int idx = signum - 1;
if (!RANGE_CHECK(idx, 0, LINUX_NSIG)) {
Expand Down Expand Up @@ -1386,14 +1376,35 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)

/* Deliver to user handler: build rt_sigframe on guest stack */

/* 1. Save current vCPU state */
/* 1. Save current vCPU state.
*
* ELR_EL1/SPSR_EL1 hold the interrupted EL0 return state only while the
* guest is unwinding a syscall (it is at EL1 in the shim, about to ERET).
* When the vCPU was preempted while executing EL0 code -- a tight compute
* loop interrupted by SIGALRM, or the cross-process guest-signal transport
* (SIGUSR2) firing mid-execution -- the live interrupted state is in
* HV_REG_PC / HV_REG_CPSR and ELR_EL1 is stale from the previous syscall.
* Redirecting via ELR_EL1 alone is then a no-op because the resume uses
* HV_REG_PC, so the handler never runs and the X0..X2 writes below clobber
* the interrupted registers instead. Detect the EL0-preemption case from
* the live PSTATE (M[3:0]==0 => EL0t) and use PC for both save and
* redirect.
*/
uint64_t saved_regs[31];
uint64_t saved_sp, saved_pc, saved_pstate;
uint64_t cur_cpsr = 0;
hv_vcpu_get_reg(vcpu, HV_REG_CPSR, &cur_cpsr);
bool el0_preempt = (cur_cpsr & 0xfULL) == 0;

vcpu_snapshot_gprs(vcpu, saved_regs);
saved_sp = vcpu_get_sysreg(vcpu, HV_SYS_REG_SP_EL0);
saved_pc = vcpu_get_sysreg(vcpu, HV_SYS_REG_ELR_EL1);
saved_pstate = vcpu_get_sysreg(vcpu, HV_SYS_REG_SPSR_EL1);
if (el0_preempt) {
hv_vcpu_get_reg(vcpu, HV_REG_PC, &saved_pc);
saved_pstate = cur_cpsr;
} else {
saved_pc = vcpu_get_sysreg(vcpu, HV_SYS_REG_ELR_EL1);
saved_pstate = vcpu_get_sysreg(vcpu, HV_SYS_REG_SPSR_EL1);
}

/* 1b. rseq abort: if the thread is in a restartable sequence critical
* section, abort it. Linux does this on every signal delivery.
Expand Down Expand Up @@ -1549,6 +1560,16 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)
/* SPSR_EL1: EL0t (user mode) */
hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, 0);

/* EL0-preemption delivery: the resume runs from HV_REG_PC, not via an
* ERET that consumes ELR_EL1, so redirect the live PC/PSTATE directly.
* The ELR_EL1/SPSR_EL1 writes above still cover the rt_sigreturn path,
* which unwinds back to EL0 through the shim ERET.
*/
if (el0_preempt) {
hv_vcpu_set_reg(vcpu, HV_REG_PC, act->sa_handler);
hv_vcpu_set_reg(vcpu, HV_REG_CPSR, 0); /* EL0t */
}

/* X0 = signal number */
hv_vcpu_set_reg(vcpu, HV_REG_X0, (uint64_t) signum);

Expand Down Expand Up @@ -1590,13 +1611,63 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)
* shim still has the interrupted syscall frame on its EL1 stack. Tell it
* to drop that frame so the handler PC/SP/LR/args installed above are not
* overwritten before ERET. Fault/BRK delivery paths ignore this marker.
* The EL0-preemption path resumes straight into the handler at EL0 with
* no shim frame to drop, so the marker is neither needed nor consulted.
*/
hv_vcpu_set_reg(vcpu, HV_REG_X8, 2);
if (!el0_preempt)
hv_vcpu_set_reg(vcpu, HV_REG_X8, 2);

pthread_mutex_unlock(&sig_lock);
return 1;
}

/* Deliver one asynchronous signal to the current thread.
 *
 * Selects the lowest-numbered signal that is pending process-wide and not
 * blocked by this thread, removes it from the pending state, and passes it to
 * deliver_signal_locked(). Returns 0 without dequeuing anything when no
 * signal is deliverable; otherwise returns deliver_signal_locked()'s result.
 *
 * sig_lock is acquired here. On the early-out it is released locally; on the
 * delivery path it is still held when deliver_signal_locked() is called,
 * which is documented to release it on every return path.
 */
int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)
{
    pthread_mutex_lock(&sig_lock);

    uint64_t *thread_mask = thread_blocked_ptr();
    uint64_t ready = sig_state.pending & ~*thread_mask;
    if (!ready) {
        /* Nothing pending that this thread is willing to take. */
        pthread_mutex_unlock(&sig_lock);
        return 0;
    }

    /* Signal numbers are 1-based; bit 0 of the mask is signal 1. */
    int signum = bit_ctz64(ready) + 1;
    signal_rt_info_t info = signal_default_info(signum);

    if (signum < LINUX_SIGRTMIN) {
        /* Standard signal: a single instance tracked by the bitmask, so
         * take its recorded info and clear the pending bit outright.
         */
        info = signal_standard_peek_locked(signum);
        sig_state.std_info_valid[signum - 1] = false;
        sig_state.pending &= ~sig_bit(signum);
    } else {
        /* RT signal: queued; the dequeue helper decrements the count and
         * only clears the pending bit once the queue is empty.
         */
        signal_rt_dequeue_locked(signum, &info);
    }

    return deliver_signal_locked(vcpu, g, signum, info, exit_code);
}

int signal_deliver_fault(hv_vcpu_t vcpu, guest_t *g, int signum, int *exit_code)
{
/* Synchronous faults (SIGSEGV/SIGBUS/SIGILL/SIGFPE/SIGTRAP) are specific to
* the thread that triggered them and must be delivered to that thread with
* the thread-local fault info set by signal_set_fault_info(). Routing them
* through the process-wide pending bitmask (signal_queue + signal_deliver)
* is racy: another vCPU thread can dequeue the bit and deliver it with no
* fault info (si_code becomes SI_USER, which makes a JVM treat a recoverable
* implicit null-check as a fatal external signal), and two threads faulting
* on the same signal collapse into one bit so one fault is lost. Deliver
* directly here, never touching sig_state.pending. The blocked mask is
* intentionally ignored: a synchronous fault cannot be postponed.
*/
pthread_mutex_lock(&sig_lock);
signal_rt_info_t rt_info = signal_default_info(signum);
return deliver_signal_locked(vcpu, g, signum, rt_info, exit_code);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment two lines up calls this "intentionally ignored: a synchronous fault cannot be postponed" -- that's only half the Linux contract. force_sig_info_to_task in kernel/signal.c resets the disposition to SIG_DFL and unblocks the signum before applying default, so a SIGSEGV/SIGBUS/SIGILL/SIGFPE/SIGTRAP that's blocked OR set to SIG_IGN terminates the process. This PR's behavior diverges in two ways:

  1. SIG_IGN on synchronous fault (preexisting): deliver_signal_locked at signal.c:1352 returns 0 for SIG_IGN. The vCPU loop resumes, PC unchanged, the same instruction re-faults forever. Linux would terminate.
  2. Blocked synchronous fault (new in this PR): pre-PR, signal_deliver's deliverable = sig_state.pending & ~*blocked short-circuited and the thread re-faulted infinitely. Post-PR, signal_deliver_fault bypasses the blocked mask entirely and runs the user handler despite the block. Linux would reset to SIG_DFL + unblock + terminate.

The JVM works either way because it never blocks SIGSEGV, but the contract drift is real. A short precheck handles both cases the same way:

/* Deliver a synchronous fault signal (`signum`) to the current thread,
 * following the Linux forced-signal contract: if the signal is ignored or
 * blocked, the disposition is reset to SIG_DFL and the signal is unblocked
 * before delivery, so the default action applies instead of the faulting
 * instruction being resumed. Takes sig_lock and hands it to
 * deliver_signal_locked(), whose result is returned unchanged.
 */
int signal_deliver_fault(hv_vcpu_t vcpu, guest_t *g, int signum, int *exit_code)
{
    pthread_mutex_lock(&sig_lock);

    /* Gate the table access and the sig_bit() shift the same way
     * deliver_signal_locked() gates its own array access: the static
     * analyzer cannot see that callers only pass 1..64. An out-of-range
     * signum skips the precheck and is left to deliver_signal_locked()'s
     * own range gate.
     */
    int idx = signum - 1;
    if (RANGE_CHECK(idx, 0, LINUX_NSIG)) {
        uint64_t *blocked = thread_blocked_ptr();
        linux_sigaction_t *act = &sig_state.actions[idx];
        if (act->sa_handler == LINUX_SIG_IGN ||
            (*blocked & sig_bit(signum))) {
            /* Linux force_sig_info_to_task: forced synchronous faults reset
             * disposition to SIG_DFL, unblock the signum, then apply default. */
            act->sa_handler = LINUX_SIG_DFL;
            *blocked &= ~sig_bit(signum);
        }
    }

    signal_rt_info_t rt_info = signal_default_info(signum);
    return deliver_signal_locked(vcpu, g, signum, rt_info, exit_code);
}

Update the "intentionally ignored" comment accordingly.

}

/* rt_sigreturn. */

int signal_rt_sigreturn(hv_vcpu_t vcpu, guest_t *g)
Expand Down
10 changes: 10 additions & 0 deletions src/syscall/signal.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,16 @@ void signal_set_shim_globals_guest(guest_t *g);
*/
int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code);

/* Deliver a synchronous fault signal directly to the faulting (current) thread,
* bypassing the process-wide pending set. The caller must have set the fault
* info via signal_set_fault_info() immediately before. Same return convention
* as signal_deliver(). Use this for SIGSEGV/SIGBUS/SIGILL/SIGFPE/SIGTRAP raised
* from a guest exception, never signal_queue()+signal_deliver(): a queued fault
* can be stolen by another vCPU thread (delivered as SI_USER, no si_addr) or
* coalesced with another thread's fault into one bitmask bit.
*/
int signal_deliver_fault(hv_vcpu_t vcpu, guest_t *g, int signum, int *exit_code);

/* Handle rt_sigreturn (SYS 139): restore registers from rt_sigframe on
* the guest stack. Returns SYSCALL_EXEC_HAPPENED to skip X0 writeback.
*/
Expand Down
1 change: 1 addition & 0 deletions tests/manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ test-negative # diff=skip

[section] Signal + thread tests
test-signal-thread
test-fault-signal-mt # diff=skip

[section] Fork edge cases
test-clone3 # diff=skip
Expand Down
Loading
Loading