-
Notifications
You must be signed in to change notification settings - Fork 9
Fix madvise(MADV_DONTNEED) on high-VA mmap regions #83
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2997,7 +2997,13 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) | |
| * is updated. | ||
| */ | ||
| uint64_t off = addr - g->ipa_base; | ||
| if (off > g->guest_size || length > g->guest_size - off) | ||
| /* Accept ranges in the primary IPA window, and also high-VA mmap regions | ||
| * (gpa_base != start) that the tracker records as mapped. Rosetta's own | ||
| * slab/JIT and guest JITs (e.g. V8) decommit pages in the high-VA window | ||
| * via mprotect(PROT_NONE)+madvise(MADV_DONTNEED); rejecting those with | ||
| * ENOMEM trips the guest's CHECK_EQ(0, ret) on the madvise return. */ | ||
| bool in_primary = (off <= g->guest_size && length <= g->guest_size - off); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. P2: The high-VA admission check is too loose: it can approve ranges based only on region coverage even though the rest of [comment truncated]. Prompt for AI agents |
||
| if (!in_primary && !madvise_range_mapped(g, off, length)) | ||
| return -LINUX_ENOMEM; | ||
|
|
||
| /* Defensive guard against destructive advice on infrastructure | ||
|
|
@@ -3050,11 +3056,18 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) | |
|
|
||
| uint64_t zstart = (r->start > off) ? r->start : off; | ||
| uint64_t zend = (r->end < end) ? r->end : end; | ||
| memset((uint8_t *) g->host_base + zstart, 0, zend - zstart); | ||
| if (!(r->flags & LINUX_MAP_ANONYMOUS)) { | ||
| /* High-VA regions back their pages at gpa_base, not at the VA; | ||
| * resolve the host pointer through the GPA so the reset hits the | ||
| * real backing (host_ptr_for_gpa also follows live overlays). For | ||
| * identity regions gpa_base == start, so this is unchanged. */ | ||
| uint64_t rgpa = r->gpa_base + (zstart - r->start); | ||
| memset(host_ptr_for_gpa(g, rgpa), 0, zend - zstart); | ||
| if (!(r->flags & LINUX_MAP_ANONYMOUS) && r->gpa_base == r->start) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. P2: [comment truncated]. Prompt for AI agents |
||
| /* EOF leaves the tail zero per mmap rules; the helper | ||
| * already returns 0 in that case after stopping the | ||
| * read loop. | ||
| * read loop. File-backed restore via host_base+off is only | ||
| * correct for identity regions; high-VA file mappings keep | ||
| * the zero-fill above (not exercised by current JIT guests). | ||
| */ | ||
| int err = read_file_range_to_guest( | ||
| g, zstart, r->backing_fd, r->offset + (zstart - r->start), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| #!/usr/bin/env bash | ||
| # test-rosetta-madvise.sh - madvise(MADV_DONTNEED) on high-VA regions via Rosetta | ||
| # | ||
| # Copyright 2026 elfuse contributors | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Regression for elfuse sys_madvise rejecting high-VA mmap regions with ENOMEM. | ||
| # Under Rosetta, anonymous mmap(NULL) lands in the high-VA window where | ||
| # sys_madvise was primary-window-only and returned ENOMEM for every | ||
| # MADV_DONTNEED. V8's page allocator decommits guard/code pages with | ||
| # mprotect(PROT_NONE)+madvise(MADV_DONTNEED) and CHECK_EQ(0, ret)s the result, | ||
| # so the spurious ENOMEM aborted x86_64 Node.js the moment its JIT initialized. | ||
| # | ||
| # Fixture: tests/fixtures/rosetta/x86_64-rosetta-madvise (vendored x86_64 ELF). | ||
| # | ||
| # Usage: tests/test-rosetta-madvise.sh [path/to/elfuse] | ||
|
|
||
set -euo pipefail

# Resolve the elfuse binary to an absolute path. The first argument is used
# when given; otherwise the in-tree build output is assumed. Relative paths are
# anchored at the current working directory.
ELFUSE_INPUT="${1:-build/elfuse}"
if [[ "$ELFUSE_INPUT" == /* ]]; then
    ELFUSE="$ELFUSE_INPUT"
else
    ELFUSE="$(pwd)/$ELFUSE_INPUT"
fi

# The translator location can be overridden through MATRIX_ROSETTA_TRANSLATOR.
ROSETTA_PATH="${MATRIX_ROSETTA_TRANSLATOR:-/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta}"
MADV_BIN="$(pwd)/tests/fixtures/rosetta/x86_64-rosetta-madvise"

# shellcheck source=tests/lib/rosetta-test.sh
. "$(dirname "$0")/lib/rosetta-test.sh"

# Result counters; initialised after the library is sourced, as before.
pass=0 fail=0 skip=0 total=0

# A missing translator exits 77 (presumably the harness's "skipped" status --
# confirm against the test runner); a missing elfuse binary is a hard error.
[[ -x "$ROSETTA_PATH" ]] || {
    printf 'rosetta translator not found at %s\n' "$ROSETTA_PATH" >&2
    exit 77
}
[[ -x "$ELFUSE" ]] || {
    printf 'elfuse binary not found: %s\n' "$ELFUSE" >&2
    exit 1
}

# Not defined in this script; expected to come from the sourced test library.
require_timeout

# The fixture is a vendored binary; without it the test exits 77 like the
# missing-translator case above.
[[ -x "$MADV_BIN" ]] || {
    printf 'vendored Rosetta madvise fixture missing under tests/fixtures/rosetta/\n' >&2
    exit 77
}
|
|
||
# Run the fixture under elfuse through "$TIMEOUT" (limit argument: 30),
# capturing stdout and stderr together. errexit is suspended around the run so
# a failing fixture is reported below instead of aborting the script.
total=$((total + 1))
madv_ok=0
set +e
madv_out="$("$TIMEOUT" 30 "$ELFUSE" "$MADV_BIN" 2>&1)"
madv_rc=$?
set -e

# Pass only when the fixture exits 0 AND printed its all-passed banner. The
# banner is matched in-shell rather than via `printf | grep -q`: under
# `pipefail` an early-exiting grep can make the pipeline report failure even
# though the banner was found.
if [ "$madv_rc" -eq 0 ] &&
    [[ "$madv_out" == *'madvise high-VA: all subtests passed'* ]]; then
    madv_ok=1
    report_pass "madvise-high-va-dontneed"
else
    report_fail "madvise-high-va-dontneed: rc=$madv_rc"
    printf '%s\n' "$madv_out" >&2
fi

report_summary "$total"

# Gate the script's exit status on the check itself instead of relying on
# whatever report_summary returns, so a failed run can never exit 0.
[ "$madv_ok" -eq 1 ] || exit 1
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. P2: Script exits 0 even when tests fail — missing [comment truncated]. Prompt for AI agents |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,185 @@ | ||
| /* x86_64-rosetta-madvise.c - madvise(MADV_DONTNEED) on high-VA regions | ||
| * | ||
| * Copyright 2026 elfuse contributors | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| * | ||
| * Regression for elfuse sys_madvise rejecting high-VA mmap regions with | ||
| * ENOMEM. Under Rosetta, anonymous mmap(NULL) lands in the high-VA window | ||
| * (the region's gpa_base diverges from its VA start), where sys_madvise was | ||
| * primary-window-only: it computed off = addr - ipa_base and rejected any | ||
| * range past guest_size with ENOMEM, even though sys_mprotect already handles | ||
| * the same high-VA range. V8's page allocator decommits guard/code pages with | ||
| * mprotect(PROT_NONE)+madvise(MADV_DONTNEED) and CHECK_EQ(0, ret)s the madvise | ||
| * return, so the spurious ENOMEM aborted x86_64 Node.js the moment its JIT | ||
| * initialized. | ||
| * | ||
| * Each subtest prints "PASS <name>" / "FAIL <name>"; main() exits non-zero on | ||
| * any failure so the shell harness can gate on the exit code. | ||
| * | ||
| * This is an x86_64 Linux static ELF, run through elfuse + Rosetta. It is not | ||
| * built in-tree (the Makefile builds aarch64 hosts); rebuild out of tree and | ||
| * re-vendor per tests/fixtures/rosetta/README.md. | ||
| */ | ||
|
|
||
| #include <errno.h> | ||
| #include <stdint.h> | ||
| #include <stdio.h> | ||
| #include <string.h> | ||
| #include <sys/mman.h> | ||
| #include <unistd.h> | ||
|
|
||
| #ifndef MADV_DONTNEED | ||
| #define MADV_DONTNEED 4 | ||
| #endif | ||
|
|
||
| #define PAGE ((size_t) 4096) | ||
|
|
||
| static int fails; | ||
|
|
||
/* Report whether a pointer lies in the high-VA window.
 *
 * The primary IPA window spans only a handful of GiB, whereas Rosetta keeps
 * guest mappings at their native x86_64 VAs far above it. Any address
 * strictly greater than 4 GiB therefore counts as high-VA, which is the
 * window that exercises the regression.
 *
 * Returns 1 when p is above 4 GiB, 0 otherwise.
 */
static int is_high_va(const void *p)
{
    const uint64_t four_gib = 0x100000000ULL;
    const uint64_t addr = (uint64_t) (uintptr_t) p;

    return addr > four_gib;
}
|
|
||
| /* MADV_DONTNEED on a writable high-VA page returns 0 and zero-fills. */ | ||
| static void test_dontneed_rw(void) | ||
| { | ||
| void *p = mmap(NULL, PAGE, PROT_READ | PROT_WRITE, | ||
| MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | ||
| if (p == MAP_FAILED) { | ||
| printf("FAIL dontneed-rw: mmap errno=%d\n", errno); | ||
| fails++; | ||
| return; | ||
| } | ||
| if (!is_high_va(p)) { | ||
| printf("FAIL dontneed-rw: mapping not in high-VA window (%p)\n", p); | ||
| fails++; | ||
| munmap(p, PAGE); | ||
| return; | ||
| } | ||
| memset(p, 0xAA, PAGE); | ||
| errno = 0; | ||
| if (madvise(p, PAGE, MADV_DONTNEED) != 0) { | ||
| printf("FAIL dontneed-rw: madvise rc=-1 errno=%d\n", errno); | ||
| fails++; | ||
| munmap(p, PAGE); | ||
| return; | ||
| } | ||
| for (unsigned i = 0; i < PAGE; i++) { | ||
| if (((unsigned char *) p)[i] != 0) { | ||
| printf("FAIL dontneed-rw: byte %u not zeroed\n", i); | ||
| fails++; | ||
| munmap(p, PAGE); | ||
| return; | ||
| } | ||
| } | ||
| printf("PASS dontneed-rw\n"); | ||
| munmap(p, PAGE); | ||
| } | ||
|
|
||
| /* The exact V8 decommit pattern: a guard page is set PROT_NONE and then | ||
| * MADV_DONTNEED'd. Linux returns 0 for a mapped-but-PROT_NONE page; after | ||
| * re-granting RW the page reads back as zero. */ | ||
| static void test_dontneed_protnone(void) | ||
| { | ||
| size_t sz = 2u * PAGE; | ||
| void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, | ||
| MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | ||
| if (p == MAP_FAILED) { | ||
| printf("FAIL dontneed-protnone: mmap errno=%d\n", errno); | ||
| fails++; | ||
| return; | ||
| } | ||
| if (!is_high_va(p)) { | ||
| printf("FAIL dontneed-protnone: mapping not in high-VA window (%p)\n", | ||
| p); | ||
| fails++; | ||
| munmap(p, sz); | ||
| return; | ||
| } | ||
| void *guard = (char *) p + PAGE; | ||
| memset(p, 0xBB, sz); | ||
| if (mprotect(guard, PAGE, PROT_NONE) != 0) { | ||
| printf("FAIL dontneed-protnone: mprotect PROT_NONE errno=%d\n", errno); | ||
| fails++; | ||
| munmap(p, sz); | ||
| return; | ||
| } | ||
| errno = 0; | ||
| if (madvise(guard, PAGE, MADV_DONTNEED) != 0) { | ||
| printf("FAIL dontneed-protnone: madvise rc=-1 errno=%d\n", errno); | ||
| fails++; | ||
| munmap(p, sz); | ||
| return; | ||
| } | ||
| if (mprotect(guard, PAGE, PROT_READ | PROT_WRITE) != 0) { | ||
| printf("FAIL dontneed-protnone: re-grant RW errno=%d\n", errno); | ||
| fails++; | ||
| munmap(p, sz); | ||
| return; | ||
| } | ||
| for (unsigned i = 0; i < PAGE; i++) { | ||
| if (((unsigned char *) guard)[i] != 0) { | ||
| printf("FAIL dontneed-protnone: guard byte %u not zeroed\n", i); | ||
| fails++; | ||
| munmap(p, sz); | ||
| return; | ||
| } | ||
| } | ||
| printf("PASS dontneed-protnone\n"); | ||
| munmap(p, sz); | ||
| } | ||
|
|
||
| /* Multi-page MADV_DONTNEED across a high-VA span returns 0 and zero-fills. */ | ||
| static void test_dontneed_multi(void) | ||
| { | ||
| size_t sz = 16u * PAGE; | ||
| void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, | ||
| MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | ||
| if (p == MAP_FAILED) { | ||
| printf("FAIL dontneed-multi: mmap errno=%d\n", errno); | ||
| fails++; | ||
| return; | ||
| } | ||
| if (!is_high_va(p)) { | ||
| printf("FAIL dontneed-multi: mapping not in high-VA window (%p)\n", p); | ||
| fails++; | ||
| munmap(p, sz); | ||
| return; | ||
| } | ||
| memset(p, 0xCC, sz); | ||
| errno = 0; | ||
| if (madvise(p, sz, MADV_DONTNEED) != 0) { | ||
| printf("FAIL dontneed-multi: madvise rc=-1 errno=%d\n", errno); | ||
| fails++; | ||
| munmap(p, sz); | ||
| return; | ||
| } | ||
| for (size_t i = 0; i < sz; i++) { | ||
| if (((unsigned char *) p)[i] != 0) { | ||
| printf("FAIL dontneed-multi: byte %zu not zeroed\n", i); | ||
| fails++; | ||
| munmap(p, sz); | ||
| return; | ||
| } | ||
| } | ||
| printf("PASS dontneed-multi\n"); | ||
| munmap(p, sz); | ||
| } | ||
|
|
||
| int main(void) | ||
| { | ||
| test_dontneed_rw(); | ||
| test_dontneed_protnone(); | ||
| test_dontneed_multi(); | ||
|
|
||
| if (fails) { | ||
| printf("madvise high-VA: %d subtest(s) failed\n", fails); | ||
| return 1; | ||
| } | ||
| printf("madvise high-VA: all subtests passed\n"); | ||
| return 0; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
P1: Missing `off + length` overflow guard can make invalid high-VA madvise ranges pass mapping checks and return success.
Prompt for AI agents