From ace1dd640d367c93a7eb599baded4fc48671d2cf Mon Sep 17 00:00:00 2001 From: Max042004 Date: Sat, 6 Jun 2026 20:36:42 +0800 Subject: [PATCH] Back read-only MAP_SHARED file mappings with MAP_PRIVATE A MAP_SHARED, PROT_READ mapping of a file opened O_RDONLY could never be installed. hvf_apply_file_overlay_quiesced() always mmap'd the host page PROT_READ|PROT_WRITE and mapped the HVF segment RWX. On a read-only fd the host mmap fails with EACCES (writable mapping of an O_RDONLY fd); forcing PROT_READ then trips hv_vm_map(), because a MAP_SHARED mapping of an O_RDONLY fd has macOS max_protection=READ and HVF cannot grant stage-2 rights (RWX) beyond the host region's max_protection (HV_ERROR). This blocked every workload that maps a read-only file MAP_SHARED -- most visibly the JVM, which maps its ~135 MiB lib/modules image exactly this way and crashed on startup. Choose the host backing from what the fd and the guest actually need: - guest wants PROT_WRITE: MAP_SHARED PROT_READ|PROT_WRITE (writes reach the file; an O_RDONLY fd still yields EACCES, matching Linux). - guest read-only on a writable fd: MAP_SHARED PROT_READ (max_protection is RWX, so the segment maps and cross-mapping coherence is preserved). - guest read-only on an O_RDONLY fd: MAP_PRIVATE PROT_READ. Its max_protection is RWX so the segment maps; the pages still show file content, and the guest's stage-1 tables keep the region read-only so the private copy is never dirtied -- no observable MAP_SHARED divergence for a read-only mapping. The guest-requested prot is threaded through hvf_apply_file_overlay(), hvf_apply_file_overlay_quiesced(), and restore_file_overlay_range() so every overlay install/restore site picks the correct backing. Add test-mmap-shared-ro covering the O_RDONLY read path, a second concurrent read-only mapping, EACCES on a writable request, and the read-only-mapping-on-O_RDWR-fd branch. 
(cherry picked from commit 337d39a4313109884112a86a0c4147bddfe18fa1) --- src/syscall/mem.c | 81 ++++++++++---- tests/manifest.txt | 3 + tests/test-mmap-shared-ro.c | 212 ++++++++++++++++++++++++++++++++++++ 3 files changed, 273 insertions(+), 23 deletions(-) create mode 100644 tests/test-mmap-shared-ro.c diff --git a/src/syscall/mem.c b/src/syscall/mem.c index bcdd48d..0c7bead 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -968,12 +968,14 @@ static int hvf_apply_file_overlay(guest_t *g, uint64_t ipa, uint64_t len, int fd, - off_t file_off); + off_t file_off, + int prot); static int hvf_apply_file_overlay_quiesced(guest_t *g, uint64_t ipa, uint64_t len, int fd, - off_t file_off); + off_t file_off, + int prot); static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len); static int read_file_range_to_guest(guest_t *g, @@ -1008,10 +1010,12 @@ static int restore_file_overlay_range(guest_t *g, uint64_t overlay_start, uint64_t overlay_end, int fd, - uint64_t file_off) + uint64_t file_off, + int prot) { int err = hvf_apply_file_overlay( - g, overlay_start, overlay_end - overlay_start, fd, (off_t) file_off); + g, overlay_start, overlay_end - overlay_start, fd, (off_t) file_off, + prot); if (err < 0) return err; mark_overlay_metadata_range(g, start, end, overlay_start, overlay_end); @@ -1133,7 +1137,7 @@ static int restore_snapshot_overlays_in_place(guest_t *g, if (first) { int err = restore_file_overlay_range( g, snap->start, snap->end, snap->overlay_start, - snap->overlay_end, snap->backing_fd, snap_file_off); + snap->overlay_end, snap->backing_fd, snap_file_off, snap->prot); if (err < 0) return err; continue; @@ -1240,7 +1244,7 @@ static int restore_region_snapshots(guest_t *g, region_snapshot_t *snaps, int n) return -LINUX_EFAULT; int err = restore_file_overlay_range( g, snap->start, snap->end, snap->overlay_start, - snap->overlay_end, r->backing_fd, snap_file_off); + snap->overlay_end, r->backing_fd, snap_file_off, r->prot); if (err < 0) 
return err; continue; @@ -1429,18 +1433,19 @@ static int hvf_segment_split(guest_t *g, return 0; } -/* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd, - * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB - * segment is split out first if it is not already isolated. Caller - * holds mmap_lock and has already quiesced sibling vCPUs (or has none). - * The fork pre-snapshot path quiesces siblings before calling this so - * the overlay install does not trigger a nested quiesce. +/* Apply a real file overlay at [ipa, ipa+len) backed by [fd, file_off). The + * IPA range may be sub-2 MiB; the containing 2 MiB segment is split out first + * if it is not already isolated. Caller holds mmap_lock and has already + * quiesced sibling vCPUs (or has none). The fork pre-snapshot path quiesces + * siblings before calling this so the overlay install does not trigger a + * nested quiesce. */ static int hvf_apply_file_overlay_quiesced(guest_t *g, uint64_t ipa, uint64_t len, int fd, - off_t file_off) + off_t file_off, + int prot) { uint64_t aligned_start = ALIGN_2MIB_DOWN(ipa); uint64_t aligned_end = ALIGN_2MIB_UP(ipa + len); @@ -1459,8 +1464,32 @@ static int hvf_apply_file_overlay_quiesced(guest_t *g, return -LINUX_EIO; void *target = (uint8_t *) g->host_base + ipa; - void *p = mmap(target, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, - fd, file_off); + /* hv_vm_map can only grant stage-2 rights (RWX) that the host VA backing's + * max_protection permits. A MAP_SHARED mapping of an O_RDONLY fd has + * max_protection READ only, so the following hv_vm_map of the segment + * fails with HV_ERROR -- this is why the JVM's read-only mmap of + * lib/modules (an O_RDONLY fd) could never be installed. + * + * Pick the host backing from what the fd and the guest actually need: + * - guest wants PROT_WRITE: MAP_SHARED with PROT_READ|PROT_WRITE so the + * guest's writes reach the file. On an O_RDONLY fd the host mmap + * returns EACCES, matching Linux. 
+ * - guest read-only on a writable fd: MAP_SHARED PROT_READ keeps + * cross-mapping coherence; max_protection is RWX because the fd is + * writable, so the segment maps. + * - guest read-only on an O_RDONLY fd: MAP_PRIVATE PROT_READ. Its + * max_protection is RWX so the segment maps; the pages still show file + * content, and the guest's stage-1 tables keep the region read-only so + * the private copy is never dirtied (no observable MAP_SHARED + * divergence for a read-only mapping). + */ + bool want_write = (prot & LINUX_PROT_WRITE) != 0; + int acc = fcntl(fd, F_GETFL); + bool fd_writable = acc >= 0 && ((acc & O_ACCMODE) == O_RDWR || + (acc & O_ACCMODE) == O_WRONLY); + int host_prot = want_write ? (PROT_READ | PROT_WRITE) : PROT_READ; + int share = (want_write || fd_writable) ? MAP_SHARED : MAP_PRIVATE; + void *p = mmap(target, len, host_prot, share | MAP_FIXED, fd, file_off); if (p == MAP_FAILED) { int saved = linux_errno(); /* The overlay failed; restore the segment to slab backing so the @@ -1501,10 +1530,11 @@ static int hvf_apply_file_overlay(guest_t *g, uint64_t ipa, uint64_t len, int fd, - off_t file_off) + off_t file_off, + int prot) { thread_quiesce_siblings(); - int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off); + int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off, prot); thread_resume_siblings(); return err; } @@ -1976,7 +2006,7 @@ int64_t sys_mmap(guest_t *g, ALIGN_UP(length, host_page_size_cached()); int oerr = hvf_apply_file_overlay(g, result_off, fixed_overlay_len, - host_backing_fd, (off_t) offset); + host_backing_fd, (off_t) offset, prot); if (oerr < 0) { int restore_err = restore_region_snapshots( g, replaced_snaps, replaced_nsnaps); @@ -2285,7 +2315,8 @@ int64_t sys_mmap(guest_t *g, if (overlay_aligned) { uint64_t nf_overlay_len = ALIGN_UP(length, hps); int oerr = hvf_apply_file_overlay(g, result_off, nf_overlay_len, - host_backing_fd, (off_t) offset); + host_backing_fd, (off_t) offset, + prot); if (oerr < 0) 
{ int rollback_err = rollback_fresh_mmap_allocation( g, result_off, length, false, 0, 0, saved_mmap_next, @@ -2871,7 +2902,7 @@ int64_t sys_mremap(guest_t *g, int restore_err = restore_file_overlay_range( g, old_off, old_off + old_size, source_overlay_start, source_overlay_end, track_backing_fd, - source_overlay_file_off); + source_overlay_file_off, prot); if (restore_err < 0) { if (track_backing_fd >= 0) close(track_backing_fd); @@ -2904,7 +2935,7 @@ int64_t sys_mremap(guest_t *g, (void) restore_file_overlay_range( g, old_off, old_off + old_size, source_overlay_start, source_overlay_end, track_backing_fd, - source_overlay_file_off); + source_overlay_file_off, prot); guest_invalidate_ptes(g, new_off, new_off + new_size); if (track_backing_fd >= 0) close(track_backing_fd); @@ -3761,7 +3792,11 @@ int mmap_fork_prepare_anon_shared(guest_t *g, return nsnaps; } - int err = hvf_apply_file_overlay_quiesced(g, start, aligned_len, fd, 0); + /* Anonymous-shared fork backing: the temp file is writable and the + * region is read-write, so request RW (preserves prior behavior). 
*/ + int err = hvf_apply_file_overlay_quiesced(g, start, aligned_len, fd, 0, + LINUX_PROT_READ | + LINUX_PROT_WRITE); if (err < 0) { log_warn("fork-prep: overlay install [0x%llx, 0x%llx) failed: %d", (unsigned long long) start, @@ -3947,7 +3982,7 @@ int mmap_fork_restore_overlays(guest_t *g, } int err = hvf_apply_file_overlay(g, ovl_s, ovl_e - ovl_s, r->backing_fd, - (off_t) file_off); + (off_t) file_off, r->prot); if (err < 0) { log_warn( "fork-child: overlay re-install [0x%llx, 0x%llx) failed: %d", diff --git a/tests/manifest.txt b/tests/manifest.txt index 7273505..fff2151 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -119,6 +119,9 @@ test-shim-cred-race [section] msync MAP_SHARED tests test-msync +[section] Read-only MAP_SHARED file overlay tests +test-mmap-shared-ro + [section] Cross-fork MAP_SHARED coherence tests test-cross-fork-mapshared # diff=skip diff --git a/tests/test-mmap-shared-ro.c b/tests/test-mmap-shared-ro.c new file mode 100644 index 0000000..ffd1184 --- /dev/null +++ b/tests/test-mmap-shared-ro.c @@ -0,0 +1,212 @@ +/* Read-only MAP_SHARED file overlay tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Regression lock-in for the file-overlay path in src/syscall/mem.c. + * + * A MAP_SHARED, PROT_READ mapping of a file opened O_RDONLY is extremely + * common -- the JVM maps its ~135 MiB lib/modules image this way, and so do + * loaders that map read-only data segments. The original overlay code always + * mmap'd the host page PROT_READ|PROT_WRITE and mapped the HVF segment RWX, + * which fails twice for a read-only fd: the host mmap returns EACCES (writable + * mapping of an O_RDONLY fd) and, even forced to PROT_READ, hv_vm_map then + * fails because a MAP_SHARED-of-O_RDONLY region has macOS max_protection=READ. 
+ * + * Syscalls exercised: openat, pwrite64, mmap, munmap + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "test-harness.h" + +int passes = 0, fails = 0; + +/* Several guest pages so the overlay spans more than one host page and the + * containing 2 MiB segment is split and remapped over a realistic range. */ +#define NPAGES 64 +#define PGSZ ((size_t) 4096) +#define FILE_LEN (NPAGES * PGSZ) + +/* Distinct byte per 4 KiB page so a partial or misaligned overlay is caught. */ +static unsigned char page_marker(int page) +{ + return (unsigned char) (0x40 + (page % 64)); +} + +/* Create a file seeded with a per-page marker pattern, then close it. Returns + * the path in `out` (caller-sized buffer). Returns 0 on success, -1 on error. */ +static int make_seed_file(char *out, size_t out_sz) +{ + snprintf(out, out_sz, "/tmp/elfuse-mmap-ro-%ld", (long) getpid()); + int fd = open(out, O_CREAT | O_TRUNC | O_RDWR, 0600); + if (fd < 0) + return -1; + for (int p = 0; p < NPAGES; p++) { + unsigned char buf[PGSZ]; + memset(buf, page_marker(p), sizeof(buf)); + off_t foff = (off_t) p * (off_t) PGSZ; + if (pwrite(fd, buf, sizeof(buf), foff) != (ssize_t) sizeof(buf)) { + close(fd); + unlink(out); + return -1; + } + } + close(fd); + return 0; +} + +/* The headline case: O_RDONLY fd + MAP_SHARED + PROT_READ must map and expose + * the full file contents. This is exactly the JVM lib/modules pattern. 
*/ +static void test_rdonly_shared_read(const char *path) +{ + TEST("MAP_SHARED PROT_READ on O_RDONLY fd maps"); + + int fd = open(path, O_RDONLY); + if (fd < 0) { + FAIL("open O_RDONLY failed"); + return; + } + + unsigned char *p = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap MAP_SHARED PROT_READ failed"); + close(fd); + return; + } + + bool ok = true; + for (int pg = 0; pg < NPAGES && ok; pg++) { + unsigned char want = page_marker(pg); + for (int off = 0; off < PGSZ; off += 512) { + if (p[pg * PGSZ + off] != want) { + ok = false; + break; + } + } + } + if (ok) + PASS(); + else + FAIL("mapped contents did not match file across pages"); + + munmap(p, FILE_LEN); + close(fd); +} + +/* The same content must be readable back-to-back through a fresh mapping, and + * a second concurrent read-only mapping of the same fd must also work. */ +static void test_rdonly_shared_second_mapping(const char *path) +{ + TEST("second MAP_SHARED PROT_READ mapping maps"); + + int fd = open(path, O_RDONLY); + if (fd < 0) { + FAIL("open O_RDONLY failed"); + return; + } + + unsigned char *a = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + unsigned char *b = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + if (a == MAP_FAILED || b == MAP_FAILED) { + FAIL("one of two MAP_SHARED PROT_READ mappings failed"); + if (a != MAP_FAILED) + munmap(a, FILE_LEN); + if (b != MAP_FAILED) + munmap(b, FILE_LEN); + close(fd); + return; + } + + if (a[0] == page_marker(0) && b[0] == page_marker(0) && + a[(NPAGES - 1) * PGSZ] == page_marker(NPAGES - 1) && + b[(NPAGES - 1) * PGSZ] == page_marker(NPAGES - 1)) + PASS(); + else + FAIL("two concurrent read-only mappings disagree with file"); + + munmap(a, FILE_LEN); + munmap(b, FILE_LEN); + close(fd); +} + +/* A read-only mapping must stay read-only: requesting PROT_WRITE | MAP_SHARED on + * an O_RDONLY fd is EACCES on Linux, and elfuse must surface the same errno + * rather than silently succeeding (which the 
MAP_PRIVATE backing must not do). */ +static void test_rdonly_shared_write_rejected(const char *path) +{ + TEST("MAP_SHARED PROT_WRITE on O_RDONLY fd is EACCES"); + + int fd = open(path, O_RDONLY); + if (fd < 0) { + FAIL("open O_RDONLY failed"); + return; + } + + void *p = mmap(NULL, FILE_LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED && errno == EACCES) { + PASS(); + } else { + FAIL("writable shared mapping of O_RDONLY fd was not rejected"); + if (p != MAP_FAILED) + munmap(p, FILE_LEN); + } + close(fd); +} + +/* A read-only mapping taken from a writable (O_RDWR) fd must also work; this + * exercises the MAP_SHARED-PROT_READ-on-writable-fd branch (max_protection RWX + * so the segment maps without dropping to MAP_PRIVATE). */ +static void test_rdwr_fd_readonly_mapping(const char *path) +{ + TEST("MAP_SHARED PROT_READ on O_RDWR fd maps"); + + int fd = open(path, O_RDWR); + if (fd < 0) { + FAIL("open O_RDWR failed"); + return; + } + + unsigned char *p = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap MAP_SHARED PROT_READ on O_RDWR fd failed"); + close(fd); + return; + } + + if (p[0] == page_marker(0) && p[(NPAGES - 1) * PGSZ] == page_marker(NPAGES - 1)) + PASS(); + else + FAIL("read-only mapping of O_RDWR fd did not match file"); + + munmap(p, FILE_LEN); + close(fd); +} + +int main(void) +{ + printf("test-mmap-shared-ro: read-only MAP_SHARED file overlay tests\n\n"); + + char path[64]; + if (make_seed_file(path, sizeof(path)) != 0) { + printf(" %-30s FAIL: could not create seed file (errno=%d)\n", + "setup", errno); + return 1; + } + + test_rdonly_shared_read(path); + test_rdonly_shared_second_mapping(path); + test_rdonly_shared_write_rejected(path); + test_rdwr_fd_readonly_mapping(path); + + unlink(path); + + SUMMARY("test-mmap-shared-ro"); + return fails ? 1 : 0; +}