diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 1cea435..52874a4 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -352,6 +352,9 @@ typedef struct { #define LINUX_TIOCSCTTY 0x540E /* -> macOS TIOCSCTTY (same semantics) */ #define LINUX_TIOCGWINSZ 0x5413 /* -> macOS TIOCGWINSZ (same struct) */ #define LINUX_FIONREAD 0x541B /* -> macOS FIONREAD (same semantics) */ +#define LINUX_FIONBIO 0x5421 /* set/clear O_NONBLOCK (arg: int *) */ +#define LINUX_FIONCLEX 0x5450 /* clear close-on-exec on fd */ +#define LINUX_FIOCLEX 0x5451 /* set close-on-exec on fd */ #define LINUX_TIOCNOTTY 0x5422 /* -> macOS TIOCNOTTY (same semantics) */ #define LINUX_TIOCGSID 0x5429 /* -> macOS TIOCGSID (same semantics) */ /* termios2 variant (adds c_ispeed/c_ospeed) */ @@ -705,7 +708,10 @@ typedef struct { typedef struct { int type; /* FD_CLOSED, FD_STDIO, FD_REGULAR, FD_DIR */ int host_fd; /* Underlying macOS file descriptor */ - uint64_t generation; /* Bumped each time this guest fd slot is reused. */ + uint64_t generation; /* Bumped each time this guest fd slot is reused. Lets + * long-lived references (e.g. epoll registrations) + * detect a close+reopen ABA where the slot now holds a + * different open file. */ int linux_flags; /* Linux open flags (for CLOEXEC tracking) */ void *dir; /* DIR* for FD_DIR entries (NULL otherwise) */ char proc_path[FD_VIRTUAL_PATH_MAX]; /* Virtual /proc dir root for *at */ diff --git a/src/syscall/io.c b/src/syscall/io.c index c3d26d2..67f02cd 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -1476,6 +1476,33 @@ int64_t sys_pwritev2(guest_t *g, int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) { + /* FIOCLEX/FIONCLEX are the ioctl form of fcntl(F_SETFD): they set/clear the + * guest close-on-exec flag, which lives in fd_table linux_flags (not the + * host fd's FD_CLOEXEC, which is per-descriptor and would be lost on the + * dup that host_fd_ref hands multi-threaded callers, so mirror the F_SETFD + * path in sys_fcntl). 
They need no host fd, so dispatch them before + * host_fd_ref_open_regular_io(): that helper rejects O_PATH (FD_PATH) fds + * with EBADF, but Linux allows these ioctls -- like fcntl(F_SETFD) -- on + * O_PATH descriptors. Validate the slot and mutate the flag in a single + * fd_lock section so there is no validate-then-mutate window in which a + * concurrent close/reuse could flip CLOEXEC on a different file that took + * the slot. The arg is ignored. */ + if (request == LINUX_FIOCLEX || request == LINUX_FIONCLEX) { + if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) + return -LINUX_EBADF; + pthread_mutex_lock(&fd_lock); + if (fd_table[fd].type == FD_CLOSED) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EBADF; + } + if (request == LINUX_FIOCLEX) + fd_table[fd].linux_flags |= LINUX_O_CLOEXEC; + else + fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC; + pthread_mutex_unlock(&fd_lock); + return 0; + } + host_fd_ref_t host_ref; int64_t err = host_fd_ref_open_regular_io(fd, &host_ref); if (err < 0) @@ -1688,6 +1715,29 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg) return 0; } + case LINUX_FIONBIO: { + /* Set/clear O_NONBLOCK on the fd. Linux FIONBIO takes an int* arg: + * nonzero enables non-blocking, zero disables it. libuv's + * uv__nonblock_ioctl() (its default on Linux) issues this on pipe and + * socket fds at setup; without it the guest's uv_pipe_open() fails with + * ENOTTY and Node's stdio stream construction throws. + */ + int32_t on = 0; + if (guest_read_small(g, arg, &on, sizeof(on)) < 0) { + host_fd_ref_close(&host_ref); + return -LINUX_EFAULT; + } + int flags = fcntl(host_fd, F_GETFL); + if (flags < 0) { + host_fd_ref_close(&host_ref); + return linux_errno(); + } + flags = on ? (flags | O_NONBLOCK) : (flags & ~O_NONBLOCK); + int r = fcntl(host_fd, F_SETFL, flags); + host_fd_ref_close(&host_ref); + return r < 0 ? 
linux_errno() : 0; + } + default: host_fd_ref_close(&host_ref); return -LINUX_ENOTTY; diff --git a/src/syscall/poll.c b/src/syscall/poll.c index 2fe9923..705a902 100644 --- a/src/syscall/poll.c +++ b/src/syscall/poll.c @@ -692,14 +692,19 @@ typedef struct { /* Per-fd registration entry within an epoll instance. */ typedef struct { - uint32_t events; /* Registered EPOLL* events mask */ - uint64_t data; /* User data to return in epoll_wait */ - bool active; /* Registered in this instance */ - bool oneshot_armed; /* EPOLLONESHOT and event already fired, - * waiting for EPOLL_CTL_MOD re-arm. - * kqueue removed the event, so poll emulation prevents - * reporting but allow MOD. - */ + uint32_t events; /* Registered EPOLL* events mask */ + uint64_t data; /* User data to return in epoll_wait */ + uint64_t generation; /* fd_entry_t.generation captured at ADD/MOD. Detects a + * close+reopen ABA: if the guest fd's current + * generation no longer matches, the registered open + * file is gone and this stale entry must not drive + * kevent against the reused host fd. */ + bool active; /* Registered in this instance */ + bool oneshot_armed; /* EPOLLONESHOT and event already fired, + * waiting for EPOLL_CTL_MOD re-arm. + * kqueue removed the event, so poll emulation prevents + * reporting but allows MOD. + */ } epoll_reg_t; /* Per-epoll-instance data, stored in fd_table[epfd].dir. Each instance @@ -781,18 +786,43 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva) return -LINUX_EINVAL; } - host_fd_ref_t target_ref; - if (host_fd_ref_open(fd, &target_ref) < 0) { + /* Validate the target fd and read its persistent host fd in a single + * fd_lock snapshot, so the kqueue knote ident is taken from the same entry + * that was validated. 
A kqueue knote is keyed by the fd number and the + * kernel drops it the moment that fd is closed, so the ident must be the + * persistent host fd from the fd table -- not the dup that + * host_fd_ref_open() hands multi-threaded callers, which + * host_fd_ref_close() closes when the syscall returns (silently tearing the + * registration down). Snapshotting (rather than host_fd_ref_open() + a + * separate fd_to_host()) keeps the validate and the ident read atomic under + * one fd_lock. The snapshot's generation then guards the cross-call ABA + * below. Result mapping uses udata (the guest fd), so the ident only needs + * to stay open and refer to the same open file description. */ + fd_entry_t target_snap; + if (!fd_snapshot(fd, &target_snap)) { host_fd_ref_close(&epoll_ref); return -LINUX_EBADF; } + int target_host_fd = target_snap.host_fd; epoll_reg_t *reg = &inst->regs[fd]; + /* Cross-call ABA guard. If the guest closed this fd and reopened it (or the + * slot was reused) since the registration was stamped, the kernel already + * dropped the original knote when the old host fd closed, yet the guest fd + * number -- and thus reg->active -- still looks live. Acting on it would + * EV_DELETE/EV_MOD the wrong knote on the reused host fd. A mismatched + * generation means the registration is gone: drop it so DEL/MOD report + * ENOENT (matching Linux's auto-removal on close) and ADD starts fresh. 
*/ + if ((reg->active || reg->oneshot_armed) && + reg->generation != target_snap.generation) { + reg->active = false; + reg->oneshot_armed = false; + } + if (op == LINUX_EPOLL_CTL_DEL) { /* Linux returns ENOENT when removing an unregistered fd */ if (!reg->active) { - host_fd_ref_close(&target_ref); host_fd_ref_close(&epoll_ref); return -LINUX_ENOENT; } @@ -804,12 +834,12 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva) int nchanges = 0; { if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) { - EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ, + EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); nchanges++; } if (reg->events & LINUX_EPOLLOUT) { - EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE, + EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); nchanges++; } @@ -819,7 +849,6 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva) /* Clear stale state for potential re-add */ reg->oneshot_armed = false; } - host_fd_ref_close(&target_ref); host_fd_ref_close(&epoll_ref); return 0; } @@ -829,12 +858,10 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva) * (EPOLLONESHOT fired, waiting for re-arm) are still valid for MOD. 
*/ if (op == LINUX_EPOLL_CTL_ADD && reg->active) { - host_fd_ref_close(&target_ref); host_fd_ref_close(&epoll_ref); return -LINUX_EEXIST; } if (op == LINUX_EPOLL_CTL_MOD && !reg->active && !reg->oneshot_armed) { - host_fd_ref_close(&target_ref); host_fd_ref_close(&epoll_ref); return -LINUX_ENOENT; } @@ -842,7 +869,6 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva) /* ADD or MOD: read the epoll_event from guest */ linux_epoll_event_t ev; if (guest_read_small(g, event_gva, &ev, sizeof(ev)) < 0) { - host_fd_ref_close(&target_ref); host_fd_ref_close(&epoll_ref); return -LINUX_EFAULT; } @@ -860,11 +886,11 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva) if (op == LINUX_EPOLL_CTL_MOD && reg->active) { struct kevent del; if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) { - EV_SET(&del, target_ref.fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); + EV_SET(&del, target_host_fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL); } if (reg->events & LINUX_EPOLLOUT) { - EV_SET(&del, target_ref.fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); + EV_SET(&del, target_host_fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL); } } @@ -894,33 +920,34 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva) void *udata = (void *) (uintptr_t) fd; if (ev.events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) { - EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ, kflags, 0, 0, + EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ, kflags, 0, 0, udata); nchanges++; } if (ev.events & LINUX_EPOLLOUT) { - EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE, kflags, 0, 0, + EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE, kflags, 0, 0, udata); nchanges++; } if (nchanges > 0) { if (kevent(epoll_ref.fd, changes, nchanges, NULL, 0, NULL) < 0) { - host_fd_ref_close(&target_ref); host_fd_ref_close(&epoll_ref); return linux_errno(); } } /* 
Store registration data in per-instance table. - * Clear oneshot_armed when MOD successfully re-arms. + * Clear oneshot_armed when MOD successfully re-arms. Stamp the snapshot's + * generation so a later close+reopen of this guest fd is detected as a + * stale registration by the ABA guard above. */ reg->events = ev.events; reg->data = ev.data; + reg->generation = target_snap.generation; reg->active = true; reg->oneshot_armed = false; - host_fd_ref_close(&target_ref); host_fd_ref_close(&epoll_ref); return 0; } diff --git a/tests/manifest.txt b/tests/manifest.txt index 7273505..338b216 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -62,8 +62,11 @@ test-signalfd test-signalfd-hardening test-epoll test-epoll-edge +test-epoll-mt +test-epoll-aba test-timerfd test-large-io-boundary +test-ioctl-cloexec [section] /proc and /dev emulation tests test-proc diff --git a/tests/test-epoll-aba.c b/tests/test-epoll-aba.c new file mode 100644 index 0000000..be44f05 --- /dev/null +++ b/tests/test-epoll-aba.c @@ -0,0 +1,206 @@ +/* epoll_ctl close+reopen ABA regression test + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Companion to test-epoll-mt.c. That test covers the original dup-as-ident + * bug (registrations vanishing across epoll_ctl). This one covers the ABA + * race jserv flagged in review (PR #73, poll.c:723): + * + * A registration in inst->regs[fd] is keyed on the guest fd *number*, but + * the kqueue knote is keyed on the underlying host fd. When the guest + * close()s a registered fd, the kernel drops the knote immediately, yet + * sys_close leaves reg->active set. If the guest then reopens the same fd + * number (a different open file behind the same number -- the A->B->A ABA), + * a later EPOLL_CTL_DEL/MOD would see the stale reg->active and act on the + * *new* file's host fd: EV_DELETE/EV_MOD against the wrong knote, returning + * success for a registration Linux considers already gone. 
+ * + * The fix stamps fd_entry_t.generation into epoll_reg_t at ADD time and + * rejects DEL/MOD when fd_table[fd].generation no longer matches, so a + * close+reopen is detected as a stale registration. The generation counter is + * monotonic, so the reused fd number gets a fresh stamp that never collides + * with the old one. + * + * Asserted behavior (matches Linux, which auto-removes an fd from the epoll + * interest list on close): + * - DEL on a closed-then-reopened fd number -> ENOENT, not success. + * - MOD on it -> ENOENT. + * - A fresh ADD on it succeeds and delivers edges for the *new* file. + * - The guard does not over-fire: DEL on a still-open registration that was + * never closed still succeeds. + * + * A CLONE_THREAD sibling is kept alive throughout so the guest stays + * multi-threaded -- that is the only mode in which host_fd_ref_open() hands + * back a dup, i.e. the exact path the snapshot+generation logic guards. + * + * Syscalls exercised: clone(220), epoll_create1(20), epoll_ctl(21), + * epoll_pwait(22), eventfd2(19), dup3(24), write(64), + * read(63), close(57), futex(98), nanosleep(101), exit(93) + */ + +#include +#include +#include +#include +#include + +#include "test-harness.h" +#include "raw-syscall.h" + +int passes = 0, fails = 0; + +static volatile int child_should_exit = 0; +static char sibling_stack[16384] __attribute__((aligned(16))); + +/* Sibling thread: stays alive so the guest is multi-threaded across the + * parent's epoll_ctl() calls, then exits. Raw syscalls only -- a + * clone(CLONE_THREAD) child has no libc TLS. 
*/ +static int sibling_fn(void *arg) +{ + (void) arg; + struct { + long tv_sec, tv_nsec; + } ts = {0, 5000000}; /* 5ms */ + while (!child_should_exit) + raw_syscall2(__NR_nanosleep, (long) &ts, 0); + raw_syscall1(__NR_exit, 0); + return 0; +} + +/* Reopen a fresh eventfd onto exactly the guest fd number that was just + * closed, so the ABA (same number, new open file, new generation) is exercised + * regardless of fd-allocator policy. The lowest-free allocator normally hands + * back oldfd directly; dup3() forces it otherwise. Returns oldfd, or -1. */ +static int reopen_same_number(int oldfd) +{ + int nf = eventfd(0, EFD_NONBLOCK); + if (nf < 0) + return -1; + if (nf == oldfd) + return oldfd; + /* dup3 with flags=0 == dup2; lands the new open file on oldfd. */ + if (raw_syscall3(__NR_dup3, nf, oldfd, 0) < 0) { + close(nf); + return -1; + } + close(nf); + return oldfd; +} + +/* Register fd for EPOLLIN, make it readable, and assert the edge is observed + * with the expected data.fd within a generous finite budget. Returns 1 on + * success. Leaves the eventfd counter signalled (caller drains/closes). */ +static int add_and_expect_edge(int epfd, int fd) +{ + struct epoll_event ev = {.events = EPOLLIN, .data.fd = fd}; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0) + return 0; + + uint64_t one = 1; + if (write(fd, &one, sizeof(one)) != (ssize_t) sizeof(one)) + return 0; + + struct epoll_event out[4]; + int n = epoll_wait(epfd, out, 4, 2000); + return n == 1 && out[0].data.fd == fd; +} + +static void drain_eventfd(int fd) +{ + uint64_t v; + (void) !read(fd, &v, sizeof(v)); +} + +int main(void) +{ + printf("test-epoll-aba: epoll_ctl close+reopen ABA guard\n"); + + /* Keep the guest multi-threaded for the whole test (see file header). 
*/ + long flags = 0x00000100 | 0x00000200 | 0x00000400 | 0x00000800 | + 0x00010000 | 0x00200000; + volatile uint32_t child_tid = 1; + long ret = raw_syscall5(__NR_clone, flags, + (long) (sibling_stack + sizeof(sibling_stack)), 0, + 0, (long) &child_tid); + if (ret == 0) { + sibling_fn(NULL); + return 0; /* unreachable */ + } + + TEST("clone sibling for multi-threaded context"); + EXPECT_TRUE(ret > 0, "clone failed"); + + int epfd = epoll_create1(EPOLL_CLOEXEC); + TEST("epoll_create1"); + EXPECT_TRUE(epfd >= 0, "epoll_create1 failed"); + + /* Control: the generation guard must not over-fire. A registration on a + * fd that is never closed must still DEL cleanly (generation matches). */ + TEST("DEL on still-open registration succeeds"); + { + int efd = eventfd(0, EFD_NONBLOCK); + struct epoll_event ev = {.events = EPOLLIN, .data.fd = efd}; + int ok = efd >= 0 && epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev) == 0 && + epoll_ctl(epfd, EPOLL_CTL_DEL, efd, NULL) == 0; + EXPECT_TRUE(ok, "valid DEL rejected"); + close(efd); + } + + /* Register file A on a fd number, confirm it really is live, then close + * it and reopen a different file B onto the same number. */ + int efd = eventfd(0, EFD_NONBLOCK); + TEST("ABA setup: register + observe edge on file A"); + EXPECT_TRUE(efd >= 0 && add_and_expect_edge(epfd, efd), + "file A failed to register/deliver"); + drain_eventfd(efd); + close(efd); + + int reused = reopen_same_number(efd); + TEST("ABA setup: reopen reuses the same fd number"); + EXPECT_EQ(reused, efd, "could not reuse fd number"); + + /* The stale registration must be treated as gone. Without the generation + * guard these act on file B's host fd and return success. 
*/ + TEST("DEL after close+reopen -> ENOENT"); + { + struct epoll_event ev = {.events = EPOLLIN, .data.fd = efd}; + (void) ev; + EXPECT_ERRNO(epoll_ctl(epfd, EPOLL_CTL_DEL, efd, NULL), ENOENT, + "stale DEL did not report ENOENT"); + } + + TEST("MOD after close+reopen -> ENOENT"); + { + struct epoll_event ev = {.events = EPOLLIN | EPOLLOUT, .data.fd = efd}; + EXPECT_ERRNO(epoll_ctl(epfd, EPOLL_CTL_MOD, efd, &ev), ENOENT, + "stale MOD did not report ENOENT"); + } + + /* A fresh registration on the reused number must work end to end and + * report the *new* file's readiness -- proving the new knote is keyed on + * file B's host fd, not corrupted by the cleared stale state. */ + TEST("fresh ADD after ABA delivers edge for file B"); + EXPECT_TRUE(add_and_expect_edge(epfd, efd), + "reused fd failed to register/deliver after ABA"); + drain_eventfd(efd); + + TEST("DEL of the fresh registration succeeds"); + EXPECT_TRUE(epoll_ctl(epfd, EPOLL_CTL_DEL, efd, NULL) == 0, + "fresh DEL rejected"); + close(efd); + close(epfd); + + /* Release the sibling and join via the CLONE_CHILD_CLEARTID futex. */ + child_should_exit = 1; + for (int i = 0; i < 200 && child_tid != 0; i++) { + struct { + long tv_sec, tv_nsec; + } ts = {0, 10000000}; /* 10ms */ + raw_syscall6(__NR_futex, (long) &child_tid, 0 /* FUTEX_WAIT */, + child_tid, (long) &ts, 0, 0); + } + + SUMMARY("test-epoll-aba"); + return fails > 0 ? 1 : 0; +} diff --git a/tests/test-epoll-mt.c b/tests/test-epoll-mt.c new file mode 100644 index 0000000..d3e8f76 --- /dev/null +++ b/tests/test-epoll-mt.c @@ -0,0 +1,147 @@ +/* Multi-threaded epoll registration regression test + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Regression for the epoll_ctl host-fd-reference bug: in a multi-threaded + * guest, host_fd_ref_open() hands back a *dup* of the target fd that is + * closed when the syscall returns. 
sys_epoll_ctl() used that transient dup + * as the kqueue knote ident, so the kernel dropped the registration the + * moment epoll_ctl() returned -- and epoll_pwait() never reported readiness + * again. Single-threaded guests borrow the raw fd (no dup, no close) and so + * never hit it; this only reproduces with at least one CLONE_THREAD sibling + * active. Node's libuv DelayedTaskScheduler relied on exactly this path + * (eventfd + epoll for uv_async_send) and hung forever at process exit. + * + * The test keeps a sibling thread alive across the epoll_ctl() call, then + * checks that both a pipe and an eventfd registered while multi-threaded + * still deliver an EPOLLIN edge. + * + * Syscalls exercised: clone(220), epoll_create1(20), epoll_ctl(21), + * epoll_pwait(22), eventfd2(19), pipe2(59), write(64), + * read(63), close(57), futex(98), nanosleep(101), exit(93) + */ + +#include +#include +#include +#include +#include + +#include "test-harness.h" +#include "raw-syscall.h" + +int passes = 0, fails = 0; + +static volatile int child_should_exit = 0; +static char sibling_stack[16384] __attribute__((aligned(16))); + +/* Sibling thread: stays alive (raw nanosleep loop) so the guest is + * multi-threaded for the duration of the parent's epoll_ctl() calls, then + * exits via the raw exit syscall. Uses only raw syscalls because a + * clone(CLONE_THREAD) child has no libc TLS set up. + */ +static int sibling_fn(void *arg) +{ + (void) arg; + struct { + long tv_sec, tv_nsec; + } ts = {0, 5000000}; /* 5ms */ + while (!child_should_exit) + raw_syscall2(__NR_nanosleep, (long) &ts, 0); + raw_syscall1(__NR_exit, 0); + return 0; +} + +/* Register fd for EPOLLIN on epfd, make it readable via make_ready(), + * and assert epoll_pwait() observes the edge within the timeout. Returns 1 + * on success. 
*/ +static int expect_ready_edge(int epfd, int fd, void (*make_ready)(int), int arg) +{ + struct epoll_event ev = {.events = EPOLLIN, .data.fd = fd}; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0) + return 0; + + make_ready(arg); + + struct epoll_event out[4]; + /* 2s budget: the bug manifests as an indefinite miss, so any generous + * finite timeout distinguishes pass from fail without flaking. */ + int n = epoll_wait(epfd, out, 4, 2000); + return n == 1 && out[0].data.fd == fd; +} + +static void poke_eventfd(int fd) +{ + uint64_t one = 1; + (void) !write(fd, &one, sizeof(one)); +} + +static int g_pipe_wr; +static void poke_pipe(int unused) +{ + (void) unused; + (void) !write(g_pipe_wr, "x", 1); +} + +int main(void) +{ + printf("test-epoll-mt: epoll registration under CLONE_THREAD\n"); + + /* Spawn a CLONE_THREAD sibling so host_fd_ref_open() takes the dup path. + * Flags: CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + * CLONE_CHILD_CLEARTID. + */ + long flags = 0x00000100 | 0x00000200 | 0x00000400 | 0x00000800 | + 0x00010000 | 0x00200000; + volatile uint32_t child_tid = 1; + long ret = raw_syscall5(__NR_clone, flags, + (long) (sibling_stack + sizeof(sibling_stack)), 0, + 0, (long) &child_tid); + if (ret == 0) { + sibling_fn(NULL); + return 0; /* unreachable: sibling_fn exits the thread */ + } + + TEST("clone sibling for multi-threaded context"); + EXPECT_TRUE(ret > 0, "clone failed"); + + /* eventfd registered + signalled while multi-threaded (the Node path). */ + TEST("MT epoll: eventfd EPOLLIN edge delivered"); + { + int epfd = epoll_create1(EPOLL_CLOEXEC); + int efd = eventfd(0, EFD_NONBLOCK); + EXPECT_TRUE(epfd >= 0 && efd >= 0, "epoll/eventfd create failed"); + EXPECT_TRUE(expect_ready_edge(epfd, efd, poke_eventfd, efd), + "eventfd registration lost across epoll_ctl"); + close(efd); + close(epfd); + } + + /* Same with a pipe read end, to show the fix is fd-type independent. 
*/ + TEST("MT epoll: pipe EPOLLIN edge delivered"); + { + int epfd = epoll_create1(EPOLL_CLOEXEC); + int pipefd[2]; + EXPECT_TRUE(epfd >= 0 && pipe(pipefd) == 0, "epoll/pipe create failed"); + g_pipe_wr = pipefd[1]; + EXPECT_TRUE(expect_ready_edge(epfd, pipefd[0], poke_pipe, 0), + "pipe registration lost across epoll_ctl"); + close(pipefd[0]); + close(pipefd[1]); + close(epfd); + } + + /* Release the sibling and join via the CLONE_CHILD_CLEARTID futex. */ + child_should_exit = 1; + for (int i = 0; i < 200 && child_tid != 0; i++) { + struct { + long tv_sec, tv_nsec; + } ts = {0, 10000000}; /* 10ms */ + raw_syscall6(__NR_futex, (long) &child_tid, 0 /* FUTEX_WAIT */, + child_tid, (long) &ts, 0, 0); + } + + SUMMARY("test-epoll-mt"); + return fails > 0 ? 1 : 0; +} diff --git a/tests/test-ioctl-cloexec.c b/tests/test-ioctl-cloexec.c new file mode 100644 index 0000000..e25ca40 --- /dev/null +++ b/tests/test-ioctl-cloexec.c @@ -0,0 +1,83 @@ +/* FIOCLEX/FIONCLEX ioctl close-on-exec regression test + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * FIOCLEX/FIONCLEX are the ioctl form of fcntl(F_SETFD): they set/clear the + * close-on-exec flag and take no real host fd. Two properties are covered: + * + * 1. Toggling round-trips: after ioctl(FIOCLEX) the flag reads back via + * fcntl(F_GETFD), and ioctl(FIONCLEX) clears it again. + * + * 2. O_PATH descriptors are accepted. Linux permits FIOCLEX/FIONCLEX (like + * fcntl(F_SETFD)) on O_PATH fds; elfuse used to route every ioctl through + * the regular-IO host-fd open, which rejects O_PATH with EBADF, so the two + * cloexec ioctls now dispatch before that gate. 
+ * + * Syscalls exercised: openat(56), ioctl(29), fcntl(25), close(57) + */ + +#include +#include +#include + +#include "test-harness.h" + +#ifndef O_PATH +#define O_PATH 010000000 +#endif +#ifndef FIOCLEX +#define FIOCLEX 0x5451 +#endif +#ifndef FIONCLEX +#define FIONCLEX 0x5450 +#endif + +int passes = 0, fails = 0; + +/* Run the FIOCLEX -> FIONCLEX toggle round-trip on an already-open fd. */ +static void check_cloexec_toggle(int fd, const char *what) +{ + char label[64]; + + snprintf(label, sizeof(label), "%s: starts without cloexec", what); + TEST(label); + EXPECT_EQ(fcntl(fd, F_GETFD) & FD_CLOEXEC, 0, "expected cloexec clear"); + + snprintf(label, sizeof(label), "%s: ioctl(FIOCLEX) sets cloexec", what); + TEST(label); + EXPECT_TRUE(ioctl(fd, FIOCLEX) == 0 && (fcntl(fd, F_GETFD) & FD_CLOEXEC), + "FIOCLEX did not set cloexec"); + + snprintf(label, sizeof(label), "%s: ioctl(FIONCLEX) clears cloexec", what); + TEST(label); + EXPECT_TRUE( + ioctl(fd, FIONCLEX) == 0 && (fcntl(fd, F_GETFD) & FD_CLOEXEC) == 0, + "FIONCLEX did not clear cloexec"); +} + +int main(void) +{ + printf("test-ioctl-cloexec: FIOCLEX/FIONCLEX close-on-exec\n"); + + /* A plain readable fd: the ordinary path. */ + int rfd = open("/", O_RDONLY | O_DIRECTORY); + TEST("open(/) O_RDONLY"); + EXPECT_TRUE(rfd >= 0, "open failed"); + if (rfd >= 0) { + check_cloexec_toggle(rfd, "regular fd"); + close(rfd); + } + + /* An O_PATH fd: must be accepted, not rejected with EBADF. */ + int pfd = open("/", O_PATH | O_DIRECTORY); + TEST("open(/) O_PATH"); + EXPECT_TRUE(pfd >= 0, "open O_PATH failed"); + if (pfd >= 0) { + check_cloexec_toggle(pfd, "O_PATH fd"); + close(pfd); + } + + SUMMARY("test-ioctl-cloexec"); + return fails > 0 ? 1 : 0; +}