Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 120 additions & 16 deletions mypyc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,22 +566,114 @@ def construct_groups(
return groups


def get_header_deps(cfiles: list[tuple[str, str]]) -> list[str]:
"""Find all the headers used by a group of cfiles.
# Single regex that captures both `#include "foo"` and `#include <foo>`. The
# alternation lets us tell the two forms apart: the quoted-form match populates
# group 1 and the angle-form match populates group 2. The C preprocessor
# applies different search rules to each kind (see `_extract_includes`), so we
# carry the kind through resolution rather than collapsing them up front.
_INCLUDE_RE = re.compile(r'#\s*include\s+(?:"([^"]+)"|<([^>]+)>)')


def _extract_includes(contents: str) -> list[tuple[bool, str]]:
"""Return each `#include` directive's (is_angled, name) from `contents`.

is_angled=False for `#include "foo"`, True for `#include <foo>`.
"""
out: list[tuple[bool, str]] = []
for quoted, angled in _INCLUDE_RE.findall(contents):
if quoted:
out.append((False, quoted))
else:
out.append((True, angled))
return out


def get_header_deps(cfiles: list[tuple[str, str]]) -> list[tuple[bool, str]]:
"""Find all the headers directly included by a group of cfiles.

Returns a sorted, deduplicated list of `(is_angled, header_name)` pairs.
Callers that only need the names can ignore the bool, but it is needed by
`resolve_cfile_deps` to apply the correct preprocessor search order.

We do this by just regexping the source, which is a bit simpler than
properly plumbing the data through.
properly plumbing the data through. Transitive header-to-header includes
are picked up by `resolve_cfile_deps` in `mypyc_build`, which can read
the on-disk headers after every group has written its files.

Arguments:
cfiles: A list of (file name, file contents) pairs.
"""
headers: set[str] = set()
headers: set[tuple[bool, str]] = set()
for _, contents in cfiles:
headers.update(re.findall(r'#include [<"]([^>"]+)[>"]', contents))
headers.update(_extract_includes(contents))

return sorted(headers)


def resolve_cfile_deps(
cfile_dir: str, direct_includes: list[tuple[bool, str]], target_dir: str
) -> set[str]:
"""Resolve a .c file's `#include` directives to on-disk paths, walking
transitively through resolved headers.

The C preprocessor resolves `#include "foo"` against the includer's
directory first, then via -I, while `#include <foo>` only uses -I. We
mirror that exactly: quoted includes are searched in (includer_dir,
target_dir) order, and angled includes are searched in target_dir only.
`target_dir` is the only -I path that holds files we generate; anything
we cannot resolve under it (or, for quoted form, the includer's dir) is
dropped — lib-rt headers like `<Python.h>` and `<CPy.h>` live elsewhere
and do not change between builds, so they are not real deps for
incremental purposes.

The walk is transitive: each resolved header is opened and scanned for
its own `#include` directives. Without this, cross-group export-table
headers reached via `__native_internal_<mod>.h` (which includes
`<other_group/__native_other.h>`) would be missed, and edits that shift
struct offsets in `other_group` would not trigger a recompile of the
consumer's .o file. Its baked-in offsets would then resolve to whatever
class/function now occupies that slot — silent runtime corruption.

Returns a set of resolved paths suitable for use as an Extension.depends
list.
"""
resolved: set[str] = set()
# Worklist of (search_dir, is_angled, header_name). search_dir is the
# includer's directory — for the initial cfile it is the cfile's dir, for
# a transitively-included header it is that header's dir. It is only
# consulted for quoted-form includes.
worklist: list[tuple[str, bool, str]] = [
(cfile_dir, is_angled, dep) for is_angled, dep in direct_includes
]
while worklist:
search_dir, is_angled, dep = worklist.pop()
# Quoted form: includer's dir first, then -I (target_dir).
# Angled form: -I only (skips the includer's dir).
search_bases = (target_dir,) if is_angled else (search_dir, target_dir)
for base in search_bases:
candidate = os.path.normpath(os.path.join(base, dep))
if not os.path.exists(candidate):
continue
if candidate in resolved:
break
resolved.add(candidate)
# Recurse only into headers. Some lib-rt sources are pulled in
# as `#include "init.c"` etc.; those do not resolve under
# target_dir so they get filtered out before we would try to scan
# them, but the .h guard is a cheap belt-and-braces.
if candidate.endswith(".h"):
try:
with open(candidate, encoding="utf-8", errors="replace") as f:
header_contents = f.read()
except OSError:
header_contents = ""
sub_dir = os.path.dirname(candidate)
for sub_angled, sub in _extract_includes(header_contents):
worklist.append((sub_dir, sub_angled, sub))
break
return resolved


def mypyc_build(
paths: list[str],
compiler_options: CompilerOptions,
Expand Down Expand Up @@ -630,11 +722,17 @@ def mypyc_build(
for (path, dirs, internal) in skip_cgen_input[1]
]

# Write out the generated C and collect the files for each group
# Write out the generated C and collect the files for each group.
# Should this be here??
group_cfilenames: list[tuple[list[str], list[str]]] = []
# Per-group list of (full_cfile_path, raw_include_targets). Resolution is
# deferred until every group has written its files: a header in one group
# may include a header generated by another group, so resolving immediately
# would miss cross-group deps for groups processed first.
pending: list[list[tuple[str, list[tuple[bool, str]]]]] = []
for cfiles in group_cfiles:
cfilenames = []
per_cfile_deps: list[tuple[str, list[tuple[bool, str]]]] = []
for cfile, ctext in cfiles:
cfile = os.path.join(compiler_options.target_dir, cfile)
# Empty contents marks a file the previous run already wrote
Expand All @@ -643,16 +741,22 @@ def mypyc_build(
write_file(cfile, ctext)
if os.path.splitext(cfile)[1] == ".c":
cfilenames.append(cfile)

# The header regex matches both quote styles, so the result can
# include system headers like `<Python.h>` that don't live under
# target_dir. Joining those produces non-existent paths which
# would force a full rebuild on every run via Extension.depends.
candidate_deps = (
os.path.join(compiler_options.target_dir, dep) for dep in get_header_deps(cfiles)
)
deps = [d for d in candidate_deps if os.path.exists(d)]
group_cfilenames.append((cfilenames, deps))
per_cfile_deps.append((cfile, get_header_deps([(cfile, ctext)])))
pending.append(per_cfile_deps)
group_cfilenames.append((cfilenames, []))

# Second pass: resolve each .c file's deps transitively now that every
# group's headers are on disk. See resolve_cfile_deps for the rules.
for i, per_cfile in enumerate(pending):
deps_set: set[str] = set()
for cfile_full, dep_names in per_cfile:
deps_set.update(
resolve_cfile_deps(
os.path.dirname(cfile_full), dep_names, compiler_options.target_dir
)
)
cfilenames, _ = group_cfilenames[i]
group_cfilenames[i] = (cfilenames, sorted(deps_set))

return groups, group_cfilenames, source_deps

Expand Down
151 changes: 151 additions & 0 deletions mypyc/test/test_misc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from __future__ import annotations

import os
import tempfile
import unittest

from mypyc.build import get_header_deps, resolve_cfile_deps
from mypyc.ir.ops import BasicBlock
from mypyc.ir.pprint import format_blocks, generate_names_for_ir
from mypyc.irbuild.ll_builder import LowLevelIRBuilder
Expand All @@ -20,3 +23,151 @@ def test_debug_op(self) -> None:
names = generate_names_for_ir([], [block])
code = format_blocks([block], names, {})
assert code[:-1] == ["L0:", " r0 = 'foo'", " CPyDebug_PrintObject(r0)"]


class TestHeaderDeps(unittest.TestCase):
"""Tests for the header-dependency tracking used to build
`Extension.depends`, which drives setuptools' `newer_group` decision
about whether to recompile a .o file on incremental builds.

The critical case is cross-group export-table headers: each module's
`__native_internal_<mod>.h` does `#include <other_group/__native_other.h>`,
and the consumer's compiled .o file bakes in byte offsets into that
header's `export_table_<group>` struct. If we miss this header in the
deps list, struct-layout changes in `other_group` won't trigger a
rebuild of the consumer, and its baked-in offsets will silently resolve
to whatever now occupies those slots.
"""

def test_get_header_deps_quoted_includes(self) -> None:
# Quoted includes — the historical form. Used by the .c file to
# reach its own __native_<mod>.h / __native_internal_<mod>.h. The
# `False` in each tuple marks the include as non-angled, which
# `resolve_cfile_deps` uses to search the includer's directory.
cfile = '#include "__native_caller.h"\n#include "__native_internal_caller.h"\n'
assert get_header_deps([("caller.c", cfile)]) == [
(False, "__native_caller.h"),
(False, "__native_internal_caller.h"),
]

def test_get_header_deps_angle_bracket_includes(self) -> None:
# Angle-bracket includes are also matched, and reported with
# is_angled=True so that the resolver skips the includer's dir
# for them (matching the C preprocessor). The cross-group export
# header is reached via `#include <other_group/__native_other.h>`
# in __native_internal_<mod>.h. Before this was matched the dep
# was missed entirely and the consumer's .o was never invalidated
# when the other group's struct layout shifted.
cfile = "#include <Python.h>\n#include <lib/__native_functions.h>\n"
assert get_header_deps([("caller.c", cfile)]) == [
(True, "Python.h"),
(True, "lib/__native_functions.h"),
]

def test_get_header_deps_mixed_and_whitespace(self) -> None:
# The preprocessor tolerates whitespace and the leading-hash form.
# `get_header_deps` returns sorted tuples — non-angled (False) sorts
# before angled (True), then alphabetical within each kind.
cfile = '# include "a.h"\n# include <b.h>\n#include\t"c.h"\n'
assert get_header_deps([("x.c", cfile)]) == [(False, "a.h"), (False, "c.h"), (True, "b.h")]

def test_resolve_walks_transitively_through_headers(self) -> None:
# Reproduces the bug scenario: caller's .c only directly includes
# caller's own headers, but caller's __native_internal_caller.h
# includes the cross-group export header. The resolver must follow
# that chain so setuptools sees the cross-group header as a dep.
with tempfile.TemporaryDirectory() as tmp:
build_dir = tmp
os.makedirs(os.path.join(build_dir, "lib"))
os.makedirs(os.path.join(build_dir, "other_group"))

# caller.c's directly-included headers — both live alongside
# caller.c under build/ (resolved via target_dir).
internal_h = os.path.join(build_dir, "__native_internal_caller.h")
caller_h = os.path.join(build_dir, "__native_caller.h")
cross_group_h = os.path.join(build_dir, "lib", "__native_functions.h")
unrelated_h = os.path.join(build_dir, "other_group", "__native_other.h")

with open(caller_h, "w") as f:
# lib-rt headers don't exist on disk under build/, so they
# get dropped during resolution and aren't recursed into.
f.write("#include <Python.h>\n#include <CPy.h>\n")
with open(internal_h, "w") as f:
# The smoking gun: this header includes a header in another
# group via angle brackets. Pre-fix, this dep was invisible
# to setuptools.
f.write(
"#include <Python.h>\n"
'#include "__native_caller.h"\n'
"#include <lib/__native_functions.h>\n"
)
with open(cross_group_h, "w") as f:
f.write("struct export_table_lib___functions { int x; };\n")
with open(unrelated_h, "w") as f:
# Sibling group not reached from caller's chain — must
# NOT appear in the resolved set.
f.write("struct unrelated { int x; };\n")

# caller.c is in build_dir, so its includer-dir is build_dir.
# Both directly-included headers are quoted (`False`); the
# cross-group header that __native_internal_caller.h reaches
# via `<lib/__native_functions.h>` is found by the recursive
# walk re-reading the on-disk header.
deps = resolve_cfile_deps(
cfile_dir=build_dir,
direct_includes=[
(False, "__native_caller.h"),
(False, "__native_internal_caller.h"),
],
target_dir=build_dir,
)

assert deps == {caller_h, internal_h, cross_group_h}, (
f"expected the cross-group header to be reached transitively; "
f"got {sorted(deps)!r}"
)

def test_resolve_drops_unresolvable_includes(self) -> None:
# `<Python.h>`, `<CPy.h>`, etc. don't live under target_dir, so
# they're dropped from depends. They never change between builds,
# so this is the right behavior — and crucially it stops
# setuptools' `missing="newer"` from treating them as always-newer
# and force-rebuilding every translation unit.
with tempfile.TemporaryDirectory() as tmp:
cfile_dir = tmp
deps = resolve_cfile_deps(
cfile_dir=cfile_dir,
direct_includes=[(True, "Python.h"), (True, "CPy.h"), (False, "init.c")],
target_dir=cfile_dir,
)
assert deps == set()

def test_resolve_search_order_matches_preprocessor(self) -> None:
# When the same header name exists both next to the includer and
# under target_dir, the C preprocessor picks the includer-dir copy
# for `#include "shared.h"` and the target_dir copy for
# `#include <shared.h>`. The resolver must record the same path
# the compiler will actually consume, otherwise mtimes of the
# wrong file drive incremental rebuild decisions.
with tempfile.TemporaryDirectory() as tmp:
includer = os.path.join(tmp, "groupA")
target = os.path.join(tmp, "build")
os.makedirs(includer)
os.makedirs(target)

local_h = os.path.join(includer, "shared.h")
global_h = os.path.join(target, "shared.h")
with open(local_h, "w") as f:
f.write("/* local */\n")
with open(global_h, "w") as f:
f.write("/* global */\n")

# Quoted form: resolves to the includer-dir copy.
assert resolve_cfile_deps(
cfile_dir=includer, direct_includes=[(False, "shared.h")], target_dir=target
) == {local_h}

# Angled form: skips the includer-dir copy, resolves under -I.
assert resolve_cfile_deps(
cfile_dir=includer, direct_includes=[(True, "shared.h")], target_dir=target
) == {global_h}
Loading