diff --git a/mk/tests.mk b/mk/tests.mk index fd412ff..4f4dff2 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -6,7 +6,7 @@ test-glibc-coreutils test-perf \ test-rosetta-cli test-rosetta-statics test-rosetta-failure-modes \ test-rosetta-alpine test-rosetta-audit test-rosetta-jit \ - test-rosetta-glibc test-rosetta-all bench-rosetta \ + test-rosetta-glibc test-rosetta-madvise test-rosetta-all bench-rosetta \ test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \ test-full test-multi-vcpu test-rwx test-sysroot-rename \ test-case-collision test-case-collision-fallback test-sysroot-create-paths \ @@ -187,10 +187,14 @@ test-rosetta-jit: $(ELFUSE_BIN) test-rosetta-glibc: $(ELFUSE_BIN) $(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-glibc.sh $(ELFUSE_BIN),test-rosetta-glibc) +test-rosetta-madvise: $(ELFUSE_BIN) + $(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-madvise.sh $(ELFUSE_BIN),test-rosetta-madvise) + ## Run every Rosetta-specific test target in sequence. test-rosetta-all: test-rosetta-cli test-rosetta-failure-modes \ test-rosetta-statics test-rosetta-alpine \ - test-rosetta-audit test-rosetta-jit test-rosetta-glibc + test-rosetta-audit test-rosetta-jit test-rosetta-glibc \ + test-rosetta-madvise ## Wall-clock bench harness for x86_64-via-Rosetta workloads. Prints ## best-of-N samples plus the aarch64 reference where available. Set diff --git a/src/syscall/mem.c b/src/syscall/mem.c index bcdd48d..41615b5 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -2997,7 +2997,13 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) * is updated. */ uint64_t off = addr - g->ipa_base; - if (off > g->guest_size || length > g->guest_size - off) + /* Accept ranges in the primary IPA window, and also high-VA mmap regions + * (gpa_base != start) that the tracker records as mapped. Rosetta's own + * slab/JIT and guest JITs (e.g. 
V8) decommit pages in the high-VA window + * via mprotect(PROT_NONE)+madvise(MADV_DONTNEED); rejecting those with + * ENOMEM trips the guest's CHECK_EQ(0, ret) on the madvise return. */ + bool in_primary = (off <= g->guest_size && length <= g->guest_size - off); + if (!in_primary && !madvise_range_mapped(g, off, length)) return -LINUX_ENOMEM; /* Defensive guard against destructive advice on infrastructure @@ -3050,11 +3056,18 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) uint64_t zstart = (r->start > off) ? r->start : off; uint64_t zend = (r->end < end) ? r->end : end; - memset((uint8_t *) g->host_base + zstart, 0, zend - zstart); - if (!(r->flags & LINUX_MAP_ANONYMOUS)) { + /* High-VA regions back their pages at gpa_base, not at the VA; + * resolve the host pointer through the GPA so the reset hits the + * real backing (host_ptr_for_gpa also follows live overlays). For + * identity regions gpa_base == start, so this is unchanged. */ + uint64_t rgpa = r->gpa_base + (zstart - r->start); + memset(host_ptr_for_gpa(g, rgpa), 0, zend - zstart); + if (!(r->flags & LINUX_MAP_ANONYMOUS) && r->gpa_base == r->start) { /* EOF leaves the tail zero per mmap rules; the helper * already returns 0 in that case after stopping the - * read loop. + * read loop. File-backed restore via host_base+off is only + * correct for identity regions; high-VA file mappings keep + * the zero-fill above (not exercised by current JIT guests). */ int err = read_file_range_to_guest( g, zstart, r->backing_fd, r->offset + (zstart - r->start), diff --git a/tests/fixtures/rosetta/README.md b/tests/fixtures/rosetta/README.md index 8a73e1a..8aef31f 100644 --- a/tests/fixtures/rosetta/README.md +++ b/tests/fixtures/rosetta/README.md @@ -4,6 +4,9 @@ Rosetta x86_64 test fixtures vendored for self-contained matrix coverage. 
- static x86_64 Linux ELF built from `tests/x86_64-rosetta-audit.c` - `x86_64-rosetta-tls0` - static x86_64 Linux ELF built from `tests/x86_64-rosetta-tls0.c` +- `x86_64-rosetta-madvise` + - static x86_64 Linux ELF built from `tests/x86_64-rosetta-madvise.c` + - used by `tests/test-rosetta-madvise.sh` - `x86_64-glibc-rootfs.tar.gz` - minimal x86_64 glibc rootfs used by `tests/test-rosetta-glibc.sh` - contains `hello-dynamic`, `dlopen-probe`, `tls-probe`, @@ -34,8 +37,9 @@ gcc -O2 -o tls-probe tests/x86_64-glibc-tls.c gcc -O2 -fPIC -shared -o libgdtls.so tests/x86_64-glibc-gdtls-lib.c gcc -O2 -ldl -o gdtls-probe tests/x86_64-glibc-gdtls.c gcc -O2 -pthread -o pthread-tls-probe tests/x86_64-glibc-pthread-tls.c -gcc -O2 -static -o x86_64-rosetta-audit tests/x86_64-rosetta-audit.c -gcc -O2 -static -o x86_64-rosetta-tls0 tests/x86_64-rosetta-tls0.c +gcc -O2 -static -o x86_64-rosetta-audit tests/x86_64-rosetta-audit.c +gcc -O2 -static -o x86_64-rosetta-tls0 tests/x86_64-rosetta-tls0.c +gcc -O2 -static -o x86_64-rosetta-madvise tests/x86_64-rosetta-madvise.c # Stage the matching ld.so / libc.so.6 / libm.so.6 from the same host # into a rootfs/ tree alongside libgdtls.so under lib/x86_64-linux-gnu/, # then tar -czf x86_64-glibc-rootfs.tar.gz rootfs/. 
diff --git a/tests/fixtures/rosetta/x86_64-rosetta-madvise b/tests/fixtures/rosetta/x86_64-rosetta-madvise new file mode 100755 index 0000000..5dbb61c Binary files /dev/null and b/tests/fixtures/rosetta/x86_64-rosetta-madvise differ diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh index 9b0d8b6..291b18b 100755 --- a/tests/test-matrix.sh +++ b/tests/test-matrix.sh @@ -612,10 +612,14 @@ run_rosetta_x86_64_suites() printf "\nRosetta glibc dynamic\n" run_summary_suite "rosetta-glibc" \ bash "${REPO_ROOT}/tests/test-rosetta-glibc.sh" "$ELFUSE" || rc=1 + + printf "\nRosetta high-VA madvise\n" + run_summary_suite "rosetta-madvise" \ + bash "${REPO_ROOT}/tests/test-rosetta-madvise.sh" "$ELFUSE" || rc=1 else local suite for suite in rosetta-statics rosetta-alpine rosetta-audit rosetta-jit \ - rosetta-glibc; do + rosetta-glibc rosetta-madvise; do skip_suite "$suite" "Rosetta translator not installed" done fi diff --git a/tests/test-rosetta-madvise.sh b/tests/test-rosetta-madvise.sh new file mode 100644 index 0000000..281a6f1 --- /dev/null +++ b/tests/test-rosetta-madvise.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# test-rosetta-madvise.sh - madvise(MADV_DONTNEED) on high-VA regions via Rosetta +# +# Copyright 2026 elfuse contributors +# SPDX-License-Identifier: Apache-2.0 +# +# Regression for elfuse sys_madvise rejecting high-VA mmap regions with ENOMEM. +# Under Rosetta, anonymous mmap(NULL) lands in the high-VA window where +# sys_madvise was primary-window-only and returned ENOMEM for every +# MADV_DONTNEED. V8's page allocator decommits guard/code pages with +# mprotect(PROT_NONE)+madvise(MADV_DONTNEED) and CHECK_EQ(0, ret)s the result, +# so the spurious ENOMEM aborted x86_64 Node.js the moment its JIT initialized. +# +# Fixture: tests/fixtures/rosetta/x86_64-rosetta-madvise (vendored x86_64 ELF). 
+# +# Usage: tests/test-rosetta-madvise.sh [path/to/elfuse] + +set -euo pipefail + +ELFUSE_INPUT="${1:-build/elfuse}" +case "$ELFUSE_INPUT" in + /*) ELFUSE="$ELFUSE_INPUT" ;; + *) ELFUSE="$(pwd)/$ELFUSE_INPUT" ;; +esac + +ROSETTA_PATH="${MATRIX_ROSETTA_TRANSLATOR:-/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta}" +MADV_BIN="$(pwd)/tests/fixtures/rosetta/x86_64-rosetta-madvise" + +# shellcheck source=tests/lib/rosetta-test.sh +. "$(dirname "$0")/lib/rosetta-test.sh" + +pass=0 +fail=0 +skip=0 +total=0 + +if [ ! -x "$ROSETTA_PATH" ]; then + printf 'rosetta translator not found at %s\n' "$ROSETTA_PATH" >&2 + exit 77 +fi +if [ ! -x "$ELFUSE" ]; then + printf 'elfuse binary not found: %s\n' "$ELFUSE" >&2 + exit 1 +fi + +require_timeout + +if [ ! -x "$MADV_BIN" ]; then + printf 'vendored Rosetta madvise fixture missing under tests/fixtures/rosetta/\n' >&2 + exit 77 +fi + +total=$((total + 1)) +set +e +madv_out="$("$TIMEOUT" 30 "$ELFUSE" "$MADV_BIN" 2>&1)" +madv_rc=$? +set -e +if [ "$madv_rc" -eq 0 ] && + printf '%s\n' "$madv_out" | grep -q 'madvise high-VA: all subtests passed'; then + report_pass "madvise-high-va-dontneed" +else + report_fail "madvise-high-va-dontneed: rc=$madv_rc" + printf '%s\n' "$madv_out" >&2 +fi + +report_summary "$total" diff --git a/tests/x86_64-rosetta-madvise.c b/tests/x86_64-rosetta-madvise.c new file mode 100644 index 0000000..f9b98df --- /dev/null +++ b/tests/x86_64-rosetta-madvise.c @@ -0,0 +1,185 @@ +/* x86_64-rosetta-madvise.c - madvise(MADV_DONTNEED) on high-VA regions + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Regression for elfuse sys_madvise rejecting high-VA mmap regions with + * ENOMEM. 
Under Rosetta, anonymous mmap(NULL) lands in the high-VA window + * (the region's gpa_base diverges from its VA start), where sys_madvise was + * primary-window-only: it computed off = addr - ipa_base and rejected any + * range past guest_size with ENOMEM, even though sys_mprotect already handles + * the same high-VA range. V8's page allocator decommits guard/code pages with + * mprotect(PROT_NONE)+madvise(MADV_DONTNEED) and CHECK_EQ(0, ret)s the madvise + * return, so the spurious ENOMEM aborted x86_64 Node.js the moment its JIT + * initialized. + * + * Each subtest prints "PASS <name>" / "FAIL <name>"; main() exits non-zero on + * any failure so the shell harness can gate on the exit code. + * + * This is an x86_64 Linux static ELF, run through elfuse + Rosetta. It is not + * built in-tree (the Makefile builds aarch64 hosts); rebuild out of tree and + * re-vendor per tests/fixtures/rosetta/README.md. + */ + +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> + +#ifndef MADV_DONTNEED +#define MADV_DONTNEED 4 +#endif + +#define PAGE ((size_t) 4096) + +static int fails; + +/* The primary IPA window is a handful of GiB; Rosetta places guest mappings at + * their native x86_64 VAs far above it. Anything past 4 GiB is the high-VA + * window that exercises the regression. */ +static int is_high_va(const void *p) +{ + return (uint64_t) (uintptr_t) p > 0x100000000ULL; +} + +/* MADV_DONTNEED on a writable high-VA page returns 0 and zero-fills. 
*/ +static void test_dontneed_rw(void) +{ + void *p = mmap(NULL, PAGE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + printf("FAIL dontneed-rw: mmap errno=%d\n", errno); + fails++; + return; + } + if (!is_high_va(p)) { + printf("FAIL dontneed-rw: mapping not in high-VA window (%p)\n", p); + fails++; + munmap(p, PAGE); + return; + } + memset(p, 0xAA, PAGE); + errno = 0; + if (madvise(p, PAGE, MADV_DONTNEED) != 0) { + printf("FAIL dontneed-rw: madvise rc=-1 errno=%d\n", errno); + fails++; + munmap(p, PAGE); + return; + } + for (unsigned i = 0; i < PAGE; i++) { + if (((unsigned char *) p)[i] != 0) { + printf("FAIL dontneed-rw: byte %u not zeroed\n", i); + fails++; + munmap(p, PAGE); + return; + } + } + printf("PASS dontneed-rw\n"); + munmap(p, PAGE); +} + +/* The exact V8 decommit pattern: a guard page is set PROT_NONE and then + * MADV_DONTNEED'd. Linux returns 0 for a mapped-but-PROT_NONE page; after + * re-granting RW the page reads back as zero. 
*/ +static void test_dontneed_protnone(void) +{ + size_t sz = 2u * PAGE; + void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + printf("FAIL dontneed-protnone: mmap errno=%d\n", errno); + fails++; + return; + } + if (!is_high_va(p)) { + printf("FAIL dontneed-protnone: mapping not in high-VA window (%p)\n", + p); + fails++; + munmap(p, sz); + return; + } + void *guard = (char *) p + PAGE; + memset(p, 0xBB, sz); + if (mprotect(guard, PAGE, PROT_NONE) != 0) { + printf("FAIL dontneed-protnone: mprotect PROT_NONE errno=%d\n", errno); + fails++; + munmap(p, sz); + return; + } + errno = 0; + if (madvise(guard, PAGE, MADV_DONTNEED) != 0) { + printf("FAIL dontneed-protnone: madvise rc=-1 errno=%d\n", errno); + fails++; + munmap(p, sz); + return; + } + if (mprotect(guard, PAGE, PROT_READ | PROT_WRITE) != 0) { + printf("FAIL dontneed-protnone: re-grant RW errno=%d\n", errno); + fails++; + munmap(p, sz); + return; + } + for (unsigned i = 0; i < PAGE; i++) { + if (((unsigned char *) guard)[i] != 0) { + printf("FAIL dontneed-protnone: guard byte %u not zeroed\n", i); + fails++; + munmap(p, sz); + return; + } + } + printf("PASS dontneed-protnone\n"); + munmap(p, sz); +} + +/* Multi-page MADV_DONTNEED across a high-VA span returns 0 and zero-fills. 
*/ +static void test_dontneed_multi(void) +{ + size_t sz = 16u * PAGE; + void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + printf("FAIL dontneed-multi: mmap errno=%d\n", errno); + fails++; + return; + } + if (!is_high_va(p)) { + printf("FAIL dontneed-multi: mapping not in high-VA window (%p)\n", p); + fails++; + munmap(p, sz); + return; + } + memset(p, 0xCC, sz); + errno = 0; + if (madvise(p, sz, MADV_DONTNEED) != 0) { + printf("FAIL dontneed-multi: madvise rc=-1 errno=%d\n", errno); + fails++; + munmap(p, sz); + return; + } + for (size_t i = 0; i < sz; i++) { + if (((unsigned char *) p)[i] != 0) { + printf("FAIL dontneed-multi: byte %zu not zeroed\n", i); + fails++; + munmap(p, sz); + return; + } + } + printf("PASS dontneed-multi\n"); + munmap(p, sz); +} + +int main(void) +{ + test_dontneed_rw(); + test_dontneed_protnone(); + test_dontneed_multi(); + + if (fails) { + printf("madvise high-VA: %d subtest(s) failed\n", fails); + return 1; + } + printf("madvise high-VA: all subtests passed\n"); + return 0; +}