Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/syscall/abi.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,9 @@ typedef struct {
#define LINUX_TIOCSCTTY 0x540E /* -> macOS TIOCSCTTY (same semantics) */
#define LINUX_TIOCGWINSZ 0x5413 /* -> macOS TIOCGWINSZ (same struct) */
#define LINUX_FIONREAD 0x541B /* -> macOS FIONREAD (same semantics) */
#define LINUX_FIONBIO 0x5421 /* set/clear O_NONBLOCK (arg: int *) */
#define LINUX_FIONCLEX 0x5450 /* clear close-on-exec on fd */
#define LINUX_FIOCLEX 0x5451 /* set close-on-exec on fd */
#define LINUX_TIOCNOTTY 0x5422 /* -> macOS TIOCNOTTY (same semantics) */
#define LINUX_TIOCGSID 0x5429 /* -> macOS TIOCGSID (same semantics) */
/* termios2 variant (adds c_ispeed/c_ospeed) */
Expand Down Expand Up @@ -705,7 +708,10 @@ typedef struct {
typedef struct {
int type; /* FD_CLOSED, FD_STDIO, FD_REGULAR, FD_DIR */
int host_fd; /* Underlying macOS file descriptor */
uint64_t generation; /* Bumped each time this guest fd slot is reused. */
uint64_t generation; /* Bumped each time this guest fd slot is reused. Lets
* long-lived references (e.g. epoll registrations)
* detect a close+reopen ABA where the slot now holds a
* different open file. */
int linux_flags; /* Linux open flags (for CLOEXEC tracking) */
void *dir; /* DIR* for FD_DIR entries (NULL otherwise) */
char proc_path[FD_VIRTUAL_PATH_MAX]; /* Virtual /proc dir root for *at */
Expand Down
50 changes: 50 additions & 0 deletions src/syscall/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -1476,6 +1476,33 @@ int64_t sys_pwritev2(guest_t *g,

int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
{
/* FIOCLEX/FIONCLEX are the ioctl form of fcntl(F_SETFD): they set/clear the
* guest close-on-exec flag. That flag lives in fd_table linux_flags, not in
* the host fd's FD_CLOEXEC: the host flag is per-descriptor and would be lost
* on the dup that host_fd_ref hands multi-threaded callers. This mirrors the
* F_SETFD path in sys_fcntl.
*
* They need no host fd, so dispatch them before
* host_fd_ref_open_regular_io(): that helper rejects O_PATH (FD_PATH) fds
* with EBADF, but Linux allows these ioctls -- like fcntl(F_SETFD) -- on
* O_PATH descriptors.
*
* Validate the slot and mutate the flag in a single fd_lock section so there
* is no validate-then-mutate window in which a concurrent close/reuse could
* flip CLOEXEC on a different file that took the slot. The arg is ignored. */
if (request == LINUX_FIOCLEX || request == LINUX_FIONCLEX) {
if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
return -LINUX_EBADF;
pthread_mutex_lock(&fd_lock);
if (fd_table[fd].type == FD_CLOSED) {
pthread_mutex_unlock(&fd_lock);
return -LINUX_EBADF;
}
if (request == LINUX_FIOCLEX)
fd_table[fd].linux_flags |= LINUX_O_CLOEXEC;
else
fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC;
pthread_mutex_unlock(&fd_lock);
return 0;
}

host_fd_ref_t host_ref;
int64_t err = host_fd_ref_open_regular_io(fd, &host_ref);
if (err < 0)
Expand Down Expand Up @@ -1688,6 +1715,29 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
return 0;
}

case LINUX_FIONBIO: {
/* Set/clear O_NONBLOCK on the fd. Linux FIONBIO takes an int* arg:
* nonzero enables non-blocking, zero disables it. libuv's
* uv__nonblock_ioctl() (its default on Linux) issues this on pipe and
* socket fds at setup; without it the guest's uv_pipe_open() fails with
* ENOTTY and Node's stdio stream construction throws.
*/
int32_t on = 0;
if (guest_read_small(g, arg, &on, sizeof(on)) < 0) {
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
int flags = fcntl(host_fd, F_GETFL);
if (flags < 0) {
host_fd_ref_close(&host_ref);
return linux_errno();
}
flags = on ? (flags | O_NONBLOCK) : (flags & ~O_NONBLOCK);
int r = fcntl(host_fd, F_SETFL, flags);
host_fd_ref_close(&host_ref);
return r < 0 ? linux_errno() : 0;
}

default:
host_fd_ref_close(&host_ref);
return -LINUX_ENOTTY;
Expand Down
75 changes: 51 additions & 24 deletions src/syscall/poll.c
Original file line number Diff line number Diff line change
Expand Up @@ -692,14 +692,19 @@ typedef struct {

/* Per-fd registration entry within an epoll instance. Entries are looked up
 * by guest fd number (inst->regs[fd]). */
typedef struct {
uint32_t events; /* Registered EPOLL* events mask */
uint64_t data; /* User data to return in epoll_wait */
bool active; /* Registered in this instance */
bool oneshot_armed; /* EPOLLONESHOT and event already fired,
* waiting for EPOLL_CTL_MOD re-arm.
* kqueue removed the event, so poll emulation prevents
* reporting but allows MOD.
*/
uint32_t events; /* Registered EPOLL* events mask */
uint64_t data; /* User data to return in epoll_wait */
uint64_t generation; /* fd_entry_t.generation captured at ADD/MOD. Detects a
* close+reopen ABA: if the guest fd's current
* generation no longer matches, the registered open
* file is gone and this stale entry must not drive
* kevent against the reused host fd. sys_epoll_ctl
* compares it with the fd-table snapshot and clears
* active/oneshot_armed on a mismatch. */
bool active; /* Registered in this instance */
bool oneshot_armed; /* EPOLLONESHOT and event already fired,
* waiting for EPOLL_CTL_MOD re-arm.
* kqueue removed the event, so poll emulation prevents
* reporting but allows MOD.
*/
} epoll_reg_t;

/* Per-epoll-instance data, stored in fd_table[epfd].dir. Each instance
Expand Down Expand Up @@ -781,18 +786,43 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
return -LINUX_EINVAL;
}

host_fd_ref_t target_ref;
if (host_fd_ref_open(fd, &target_ref) < 0) {
/* Validate the target fd and read its persistent host fd in a single
* fd_lock snapshot, so the kqueue knote ident is taken from the same entry
* that was validated. A kqueue knote is keyed by the fd number and the
* kernel drops it the moment that fd is closed, so the ident must be the
* persistent host fd from the fd table -- not the dup that
* host_fd_ref_open() hands multi-threaded callers, which
* host_fd_ref_close() closes when the syscall returns (silently tearing the
* registration down). Snapshotting (rather than host_fd_ref_open() + a
* separate fd_to_host()) keeps the validate and the ident read atomic under
* one fd_lock. The snapshot's generation then guards the cross-call ABA
* below. Result mapping uses udata (the guest fd), so the ident only needs
* to stay open and refer to the same open file description. */
fd_entry_t target_snap;
if (!fd_snapshot(fd, &target_snap)) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After fd_snapshot() releases fd_lock, target_snap.host_fd is no longer guaranteed to match fd_table[fd] by the time kevent() runs. A sibling can close(fd) and the host can reuse that fd number before the EV_ADD is applied -- the kqueue then holds a knote keyed on a different open file than the one the guest meant to register. A later EPOLL_CTL_DEL after a close+reopen ABA would EV_DELETE the wrong knote.

fd_entry_t already carries a generation counter for this class of race (abi.h:708; used in net.c:311,323). Consider stamping the generation into epoll_reg_t at ADD and rejecting DEL/MOD when fd_table[fd].generation no longer matches.

Adjacent (pre-existing, but more observable after this change): sys_close does not clear inst->regs[fd].active for epoll instances holding the fd, so a closed-then-reopened guest fd still looks active to epoll. Worth filing separately.

host_fd_ref_close(&epoll_ref);
return -LINUX_EBADF;
}
int target_host_fd = target_snap.host_fd;

epoll_reg_t *reg = &inst->regs[fd];

/* Cross-call ABA guard. If the guest closed this fd and reopened it (or the
* slot was reused) since the registration was stamped, the kernel already
* dropped the original knote when the old host fd closed, yet the guest fd
* number -- and thus reg->active -- still looks live. Acting on it would
* EV_DELETE/EV_MOD the wrong knote on the reused host fd. A mismatched
* generation means the registration is gone: drop it so DEL/MOD report
* ENOENT (matching Linux's auto-removal on close) and ADD starts fresh. */
if ((reg->active || reg->oneshot_armed) &&
reg->generation != target_snap.generation) {
reg->active = false;
reg->oneshot_armed = false;
}

if (op == LINUX_EPOLL_CTL_DEL) {
/* Linux returns ENOENT when removing an unregistered fd */
if (!reg->active) {
host_fd_ref_close(&target_ref);
host_fd_ref_close(&epoll_ref);
return -LINUX_ENOENT;
}
Expand All @@ -804,12 +834,12 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
int nchanges = 0;
{
if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ,
EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ,
EV_DELETE, 0, 0, NULL);
nchanges++;
}
if (reg->events & LINUX_EPOLLOUT) {
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE,
EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE,
EV_DELETE, 0, 0, NULL);
nchanges++;
}
Expand All @@ -819,7 +849,6 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
/* Clear stale state for potential re-add */
reg->oneshot_armed = false;
}
host_fd_ref_close(&target_ref);
host_fd_ref_close(&epoll_ref);
return 0;
}
Expand All @@ -829,20 +858,17 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
* (EPOLLONESHOT fired, waiting for re-arm) are still valid for MOD.
*/
if (op == LINUX_EPOLL_CTL_ADD && reg->active) {
host_fd_ref_close(&target_ref);
host_fd_ref_close(&epoll_ref);
return -LINUX_EEXIST;
}
if (op == LINUX_EPOLL_CTL_MOD && !reg->active && !reg->oneshot_armed) {
host_fd_ref_close(&target_ref);
host_fd_ref_close(&epoll_ref);
return -LINUX_ENOENT;
}

/* ADD or MOD: read the epoll_event from guest */
linux_epoll_event_t ev;
if (guest_read_small(g, event_gva, &ev, sizeof(ev)) < 0) {
host_fd_ref_close(&target_ref);
host_fd_ref_close(&epoll_ref);
return -LINUX_EFAULT;
}
Expand All @@ -860,11 +886,11 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
if (op == LINUX_EPOLL_CTL_MOD && reg->active) {
struct kevent del;
if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
EV_SET(&del, target_ref.fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
EV_SET(&del, target_host_fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL);
}
if (reg->events & LINUX_EPOLLOUT) {
EV_SET(&del, target_ref.fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
EV_SET(&del, target_host_fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL);
}
}
Expand Down Expand Up @@ -894,33 +920,34 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
void *udata = (void *) (uintptr_t) fd;

if (ev.events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ, kflags, 0, 0,
EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ, kflags, 0, 0,
udata);
nchanges++;
}
if (ev.events & LINUX_EPOLLOUT) {
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE, kflags, 0, 0,
EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE, kflags, 0, 0,
udata);
nchanges++;
}

if (nchanges > 0) {
if (kevent(epoll_ref.fd, changes, nchanges, NULL, 0, NULL) < 0) {
host_fd_ref_close(&target_ref);
host_fd_ref_close(&epoll_ref);
return linux_errno();
}
}

/* Store registration data in per-instance table.
* Clear oneshot_armed when MOD successfully re-arms.
* Clear oneshot_armed when MOD successfully re-arms. Stamp the snapshot's
* generation so a later close+reopen of this guest fd is detected as a
* stale registration by the ABA guard above.
*/
reg->events = ev.events;
reg->data = ev.data;
reg->generation = target_snap.generation;
reg->active = true;
reg->oneshot_armed = false;

host_fd_ref_close(&target_ref);
host_fd_ref_close(&epoll_ref);
return 0;
}
Expand Down
3 changes: 3 additions & 0 deletions tests/manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,11 @@ test-signalfd
test-signalfd-hardening
test-epoll
test-epoll-edge
test-epoll-mt
test-epoll-aba
test-timerfd
test-large-io-boundary
test-ioctl-cloexec

[section] /proc and /dev emulation tests
test-proc
Expand Down
Loading
Loading