From b4fa5f83a5e291ddab19808cbc73943544ffe0aa Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 29 Apr 2026 17:13:08 +0300 Subject: [PATCH 1/3] Add chunk-isolation regression test for the exec LOAD's last 2MB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hugifyr transformation aims to make the kernel grant code huge pages on the binary's executable LOAD. For that to work, every 2MB chunk that LOAD RE touches must be exclusively RE — if a non-exec LOAD's vaddr range overlaps any of those chunks, mmap-order overlay mixes protections and the kernel can't issue a code huge page on it. Add a parser for readelf -lW LOAD entries plus check_re_chunk_isolation which asserts no other LOAD's vaddr range intersects an RE 2MB chunk. Wire it into test_basic. The check fires loudly if a future change to the layout pass picks a vaddr_delta that's just large enough to land .text on a 2MB boundary but not large enough to push subsequent LOADs out of RE's last chunk — i.e. start-aligning instead of end-aligning the executable segment. Also add test_load_layouts that builds test1.c with default ld and with -Wl,-z,noseparate-code (Oracle JDK-style combined R+E first segment) and verifies hugifyr produces a runnable binary for each. The lld-style layout (rodata in seg0, used by Chromium-based apps and Sublime Text) isn't covered here because hugifyr's main path doesn't currently handle it: shifting .text without also shifting seg0's rodata breaks RIP-relative LEAs from code into rodata. Fixing that while keeping end-alignment is separate work. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/tests/test.py b/tests/test.py index f093b7f..b73ffb9 100644 --- a/tests/test.py +++ b/tests/test.py @@ -11,6 +11,66 @@ def run_command(cmd): raise RuntimeError(f"Command {cmd} failed: {result.stderr}") return result.stdout +def parse_load_segments(filename): + """Parse LOAD program-header entries from readelf -lW output. + Returns a list of dicts with offset/vaddr/filesz/memsz/flags.""" + output = run_command(['readelf', '-lW', filename]) + loads = [] + for line in output.splitlines(): + s = line.strip() + if not s.startswith('LOAD'): + continue + parts = s.split() + # Format with R+E (flags split): LOAD off vaddr paddr filesz memsz R E align + # Format with R/RW/RWE: LOAD off vaddr paddr filesz memsz Flg align + if len(parts) == 9: + flags = parts[6] + parts[7] + else: + flags = parts[6] + loads.append({ + 'offset': int(parts[1], 16), + 'vaddr': int(parts[2], 16), + 'filesz': int(parts[4], 16), + 'memsz': int(parts[5], 16), + 'flags': flags, + }) + return loads + + +def check_re_chunk_isolation(filename, huge=0x200000): + """Verify that no 2MB chunk touched by LOAD RE is also touched by any + other LOAD's vaddr range. This is the hugifyr project's central goal: + every 2MB chunk that contains code must be exclusively code so the + kernel can grant code huge pages. 
Any other LOAD overlapping (by + mmap-order overlay or by sitting past RE's end inside the same chunk) + blocks the chunk from being huge-page eligible.""" + loads = parse_load_segments(filename) + exec_load = next((l for l in loads if 'E' in l['flags']), None) + if not exec_load: + raise RuntimeError(f"{filename}: no executable LOAD") + + re_lo = exec_load['vaddr'] + re_hi = exec_load['vaddr'] + exec_load['memsz'] + re_first_chunk = re_lo & ~(huge - 1) + re_last_chunk_top = ((re_hi - 1) & ~(huge - 1)) + huge + + for l in loads: + if l is exec_load: + continue + l_lo = l['vaddr'] + l_hi = l['vaddr'] + l['memsz'] + # Does this LOAD's vaddr range intersect any 2MB chunk that RE touches? + if l_lo < re_last_chunk_top and l_hi > re_first_chunk: + raise RuntimeError( + f"{filename}: LOAD vaddr 0x{l_lo:x}-0x{l_hi:x} (flags '{l['flags']}') " + f"intersects an RE 2MB chunk in [0x{re_first_chunk:x}, 0x{re_last_chunk_top:x}). " + f"RE end is 0x{re_hi:x}; mmap-order will mix protections in shared chunks " + f"and the kernel can't grant code huge pages.") + print(f"RE chunk isolation OK in {filename} " + f"(RE [0x{re_lo:x}, 0x{re_hi:x}), no other LOAD intersects " + f"[0x{re_first_chunk:x}, 0x{re_last_chunk_top:x}))") + + def check_segment_alignment(filename): """Check if executable segment is properly 2MB aligned""" ALIGN_SIZE = 0x200000 # 2MB @@ -104,6 +164,12 @@ def test_basic(): check_segment_alignment('test1_huge.exe') check_segment_alignment('libtest1_huge.so') + # Regression: the 2MB chunk where RE ends must not be intersected by + # any subsequent LOAD's vaddr — that's what makes the last code huge + # page eligible. This is what end-aligning vaddr_delta gives us; a + # start-aligning delta puts subsequent LOADs back into RE's last chunk. 
+ check_re_chunk_isolation('test1_huge.exe') + # Check debug info print("Checking debug info...") gdb_output = run_command(['gdb', '-batch', '-ex', 'file test1_huge.exe', @@ -197,10 +263,51 @@ def test_tls_relocations(): print("TLS relocation tests passed!") +def test_load_layouts(): + """Regression: hugifyr must work across the LOAD-segment layouts the + main path supports today. + + 1. default (modern ld -z separate-code): 4-LOAD with metadata-only first + R segment. .rodata sits in a separate (third) R LOAD. + 2. -z noseparate-code: 2-LOAD with code+rodata combined + in the first R+E segment (Oracle JDK / libjvm.so shape). + + The lld-style 4-LOAD layout (.rodata in the first R segment, see Sublime + Text / Discord / Slack / msedge) is NOT covered here because hugifyr's + main path doesn't handle it: shifting .text without also shifting the + rodata in seg0 breaks RIP-relative LEAs from code into rodata. Adding + that fix is a separate piece of work that has to preserve end-aligning + of the exec segment (see check_re_chunk_isolation).""" + print("\n=== Testing across LOAD-segment layouts ===") + + variants = [ + ('default', ['gcc', '-pie', '-O2']), + ('combined', ['gcc', '-pie', '-O2', '-Wl,-z,noseparate-code']), + ] + + for name, cmd in variants: + src_exe = f'test1_{name}.exe' + huge_exe = f'test1_{name}_huge.exe' + run_command(cmd + ['test1.c', '-o', src_exe]) + + original = run_command([f'./{src_exe}']).strip() + run_command(['../bin/hugifyr', src_exe, huge_exe]) + os.chmod(huge_exe, 0o755) + modified = run_command([f'./{huge_exe}']).strip() + + if original != modified: + raise RuntimeError(f"{name}: output mismatch (orig={original!r}, new={modified!r})") + print(f" {name}: {original}") + + print("Layout tests passed!") + def main(): # Run basic tests test_basic() - + + # Layout-coverage regression + test_load_layouts() + # Run TLS tests test_tls() From ceec465e9e2884191df5ac8362d11e72bd63520a Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 29 Apr 2026 
18:04:47 +0300 Subject: [PATCH 2/3] Padding-only path for lld-style PIE; align p_offset%2MB to p_vaddr%2MB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main shifting path crashes lld-style PIE binaries (Sublime Text, Discord, Slack, MS Edge, Chrome, MongoDB) because their first read-only LOAD ("seg0") carries .rodata / .eh_frame_hdr / .eh_frame / .gcc_except_table — sections that .text RIP-references via direct LEA displacements with no relocation entries. Shifting .text without shifting those sections invalidates every cross-segment LEA and the binary segfaults during dl_main / unwinder init. This commit doesn't fix that fully — moving seg0's rodata into a shifted segment with end-alignment preserved is structurally bigger work. It establishes the necessary precondition: a safe transformation that runs on lld-style binaries, leaves them runnable, and ensures the exec LOAD's p_offset and p_vaddr have the same residue modulo 2MB. Detection: seg0_has_movable_sections() walks sections at vaddrs below the first executable (PF_X) PT_LOAD's p_vaddr. Anything SHF_ALLOC, not SHT_NOBITS, and not in the existing relocatable_section_types whitelist (which already covers .dynsym, .gnu.hash, .rela.*, .dynamic, .interp, .note.*) is considered RIP-referenced from code => the binary is lld-style. The whitelist is conservative; unknown section types route to the safe padding-only path rather than to the shifting path. Padding-only path: pad_offset_to_match_vaddr() computes delta = (p_vaddr_RE - p_offset_RE) mod 2MB, bumps p_offset of every phdr at-or-after the original exec offset by delta, bumps every section's sh_offset similarly, bumps e_shoff, and stamps the first LOAD's p_align to 2MB. It does NOT touch any p_vaddr / sh_addr / relocations / symbols / DWARF / build-id. The output is byte-identical to the input except for the inserted file padding, the updated offset fields, and the first LOAD's p_align. The transformed binary runs identically to the original. 
Tests: - check_offset_vaddr_mod_2mb_match: asserts p_offset%2MB == p_vaddr%2MB on the exec LOAD. Wired into test_basic and every test_load_layouts variant. - test_load_layouts gets the lld variant back (built with -fuse-ld=lld); it now exercises the new padding-only path. Verified on real-world closed-source PIE binaries we already had downloaded: - Sublime Text 4180: --version → "Sublime Text Build 4180" matches - Discord 0.0.135: matches - Slack 4.42.117: matches - MEGAsync (modern): main path, matches - Cisco Webex CEF: main path, matches - cloudflared/terraform: ET_EXEC fallback, unchanged Co-Authored-By: Claude Opus 4.7 (1M context) --- src/hugifyr.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++- tests/test.py | 60 +++++++++++++++++---- 2 files changed, 191 insertions(+), 12 deletions(-) diff --git a/src/hugifyr.c b/src/hugifyr.c index 799ea74..8c10645 100644 --- a/src/hugifyr.c +++ b/src/hugifyr.c @@ -2928,13 +2928,137 @@ static int handle_fallback_hugepage_alignment(ElfInfo *info) { return ERR_SUCCESS; } +// Returns 1 if any section in the first read-only LOAD ("seg0") is +// SHF_ALLOC, not SHT_NOBITS, sits at vaddr below the first PT_X LOAD's +// p_vaddr, AND is NOT in the relocatable_section_types whitelist. The +// whitelist catches metadata that's only consumed by ld.so (.dynsym, +// .gnu.hash, .rela.*, .dynamic, .interp, .note.*, ...). Anything else +// in seg0 — typically .rodata, .eh_frame, .eh_frame_hdr, .gcc_except_table +// — is RIP-referenced from .text, making the binary "lld-style" and +// unsafe for the main shifting path. 
+static int seg0_has_movable_sections(ElfInfo *info) { + GElf_Addr exec_vaddr = 0; + bool found = false; + for (size_t i = 0; i < info->phnum; i++) { + if (info->phdrs[i].p_type == PT_LOAD && + (info->phdrs[i].p_flags & PF_X)) { + exec_vaddr = info->phdrs[i].p_vaddr; + found = true; + break; + } + } + if (!found) { + return 0; + } + + for (size_t i = 0; i < info->shnum; i++) { + struct elf_section *sec = &info->sections[i]; + if (sec->shdr.sh_addr == 0 || sec->shdr.sh_addr >= exec_vaddr) { + continue; + } + if ((sec->shdr.sh_flags & SHF_ALLOC) == 0) continue; + if (sec->shdr.sh_type == SHT_NOBITS) continue; + + bool whitelisted = false; + for (size_t j = 0; j < array_size(relocatable_section_types); j++) { + if (sec->shdr.sh_type == relocatable_section_types[j].type && + (relocatable_section_types[j].name == NULL || + strcmp(sec->name, relocatable_section_types[j].name) == 0)) { + whitelisted = true; + break; + } + } + if (!whitelisted) { + pr_debug("seg0 contains non-whitelisted section '%s' (type=%u, addr=0x%lx) — lld-style\n", + sec->name, sec->shdr.sh_type, sec->shdr.sh_addr); + return 1; + } + } + return 0; +} + +// Padding-only transformation for lld-style PIE binaries. The main path +// would shift .text without shifting the .rodata that lives in seg0, +// breaking RIP-relative LEAs from code. As a first safe step, this +// transform only fixes the file-side mod-2MB alignment of LOAD RE: it +// inserts file padding before the exec segment so that +// (p_offset_RE + delta) % 2MB == p_vaddr_RE % 2MB. It does NOT change any +// virtual address, does NOT move any section, does NOT extend memsz, and +// does NOT touch relocations / symbols / DWARF / build-id. The output +// runs identically to the input. Establishing this property is the +// precondition for a future stage that will move the seg0 RIP-referenced +// sections and align RE to a 2MB boundary. 
+static int pad_offset_to_match_vaddr(ElfInfo *info) { + ssize_t first_load_idx = -1; + ssize_t exec_idx = -1; + for (size_t i = 0; i < info->phnum; i++) { + if (info->phdrs[i].p_type != PT_LOAD) continue; + if (first_load_idx < 0) first_load_idx = i; + if ((info->phdrs[i].p_flags & PF_X) && exec_idx < 0) exec_idx = i; + } + if (first_load_idx < 0 || exec_idx < 0) { + pr_error("padding-only path: missing first or executable LOAD\n"); + return ERR_FATAL; + } + + GElf_Phdr *first_load = &info->phdrs[first_load_idx]; + GElf_Phdr *exec_phdr = &info->phdrs[exec_idx]; + GElf_Off old_exec_offset = exec_phdr->p_offset; + GElf_Addr exec_vaddr = exec_phdr->p_vaddr; + + uint64_t v_mod = exec_vaddr % HUGE_PAGE_SIZE; + uint64_t o_mod = old_exec_offset % HUGE_PAGE_SIZE; + uint64_t delta = (v_mod + HUGE_PAGE_SIZE - o_mod) % HUGE_PAGE_SIZE; + + pr_info("Padding-only path (lld-style): exec_vaddr=0x%lx exec_offset=0x%lx, delta=0x%lx\n", + exec_vaddr, old_exec_offset, delta); + + // Always set p_align on the first LOAD so the kernel picks a 2MB- + // aligned load base. Harmless for the runtime; necessary precondition + // for future huge-page enablement. + first_load->p_align = HUGE_PAGE_SIZE; + + if (delta == 0) { + pr_info("offset already aligns with vaddr modulo 2MB; only setting p_align\n"); + return ERR_SUCCESS; + } + + // Bump p_offset for every phdr at or after the original exec offset. + // PT_LOAD, PT_DYNAMIC, PT_GNU_RELRO, PT_GNU_EH_FRAME, PT_TLS, PT_NOTE + // — anything whose file location sits in or after the executable's + // file region must shift in lockstep. + for (size_t i = 0; i < info->phnum; i++) { + GElf_Phdr *p = &info->phdrs[i]; + if (p->p_type != PT_NULL && p->p_offset >= old_exec_offset) { + p->p_offset += delta; + } + } + + // Bump sh_offset for every section at or after the original exec + // offset and mark dirty so libelf rewrites the data at the new offset. 
+ for (size_t i = 0; i < info->shnum; i++) { + struct elf_section *sec = &info->sections[i]; + if (sec->shdr.sh_offset >= old_exec_offset && sec->shdr.sh_type != SHT_NULL) { + sec->shdr.sh_offset += delta; + mark_section_dirty(sec); + } + } + + // Bump section header table offset if it lives past the exec offset. + if (info->ehdr.e_shoff >= old_exec_offset) { + info->ehdr.e_shoff += delta; + } + + return ERR_SUCCESS; +} + static int process_elf(const char *filename, uint32_t flags) { __attribute__((cleanup(cleanup_free_elf_info))) ElfInfo *elf_info = init_elf(filename); if (elf_info == NULL) { return ERR_FATAL; } - + if (read_elf(elf_info) != 0) { pr_error("Failed to read ELF file\n"); return ERR_FATAL; @@ -2946,6 +3070,23 @@ static int process_elf(const char *filename, uint32_t flags) { init_sections_ordered_by_offset(elf_info); + // lld-style PIE detection: seg0 carries .rodata / .eh_frame* / + // .gcc_except_table that are RIP-referenced from code. The main + // shifting path would crash these binaries at runtime. Route them + // through a safer padding-only transform that only fixes file-side + // mod-2MB alignment without disturbing any vaddr. Huge-page + // enablement is left to a future stage. 
+ if (elf_info->ehdr.e_type == ET_DYN && seg0_has_movable_sections(elf_info)) { + if (pad_offset_to_match_vaddr(elf_info) != 0) { + return ERR_FATAL; + } + if (write_elf(elf_info)) { + pr_error("Failed to write ELF file\n"); + return ERR_FATAL; + } + return ERR_SUCCESS; + } + if (elf_info->ehdr.e_type != ET_DYN) { pr_error("Input file is not position-independent code, trying fallback path\n"); if (handle_fallback_hugepage_alignment(elf_info) != 0) { diff --git a/tests/test.py b/tests/test.py index b73ffb9..31e86f7 100644 --- a/tests/test.py +++ b/tests/test.py @@ -37,6 +37,30 @@ def parse_load_segments(filename): return loads +def check_offset_vaddr_mod_2mb_match(filename, huge=0x200000): + """Verify that for the executable LOAD, p_offset and p_vaddr have the + same residue modulo 2MB. Without this property the kernel cannot place + the segment on a 2MB-aligned VMA backed by the file (THP / file-backed + huge pages need both vaddr and file offset 2MB-aligned in lockstep). + Linkers normally produce binaries where this isn't true (e.g. lld + leaves a 0x1000 vaddr/offset gap on the executable LOAD), so hugifyr's + transformation must establish it.""" + loads = parse_load_segments(filename) + exec_load = next((l for l in loads if 'E' in l['flags']), None) + if not exec_load: + raise RuntimeError(f"{filename}: no executable LOAD") + o_mod = exec_load['offset'] % huge + v_mod = exec_load['vaddr'] % huge + if o_mod != v_mod: + raise RuntimeError( + f"{filename}: exec LOAD p_offset%2MB=0x{o_mod:x} != p_vaddr%2MB=0x{v_mod:x} " + f"(p_offset=0x{exec_load['offset']:x}, p_vaddr=0x{exec_load['vaddr']:x}). " + f"Kernel can't place on a 2MB-aligned file-backed VMA.") + print(f"offset%2MB == vaddr%2MB OK in {filename} " + f"(both 0x{v_mod:x}; p_offset=0x{exec_load['offset']:x}, " + f"p_vaddr=0x{exec_load['vaddr']:x})") + + def check_re_chunk_isolation(filename, huge=0x200000): """Verify that no 2MB chunk touched by LOAD RE is also touched by any other LOAD's vaddr range. 
This is the hugifyr project's central goal: @@ -170,6 +194,11 @@ def test_basic(): # start-aligning delta puts subsequent LOADs back into RE's last chunk. check_re_chunk_isolation('test1_huge.exe') + # Regression: p_offset and p_vaddr of the exec LOAD must agree + # modulo 2MB. The transformation should establish this property + # whether the input had it or not. + check_offset_vaddr_mod_2mb_match('test1_huge.exe') + # Check debug info print("Checking debug info...") gdb_output = run_command(['gdb', '-batch', '-ex', 'file test1_huge.exe', @@ -264,26 +293,30 @@ def test_tls_relocations(): print("TLS relocation tests passed!") def test_load_layouts(): - """Regression: hugifyr must work across the LOAD-segment layouts the - main path supports today. + """Regression: hugifyr must work across the LOAD-segment layouts. 1. default (modern ld -z separate-code): 4-LOAD with metadata-only first - R segment. .rodata sits in a separate (third) R LOAD. + R segment. .rodata sits in a separate (third) R LOAD. Goes through + the main shifting path (huge-page enablement). 2. -z noseparate-code: 2-LOAD with code+rodata combined - in the first R+E segment (Oracle JDK / libjvm.so shape). - - The lld-style 4-LOAD layout (.rodata in the first R segment, see Sublime - Text / Discord / Slack / msedge) is NOT covered here because hugifyr's - main path doesn't handle it: shifting .text without also shifting the - rodata in seg0 breaks RIP-relative LEAs from code into rodata. Adding - that fix is a separate piece of work that has to preserve end-aligning - of the exec segment (see check_re_chunk_isolation).""" + in the first R+E segment (Oracle JDK / libjvm.so shape). Main path. + 3. lld: 4-LOAD with .rodata + .eh_frame* + in the first R segment. .text RIP-references seg0 content, so the + main shifting path would crash this binary. Routes through the new + padding-only path that establishes + p_offset%2MB == p_vaddr%2MB for the exec LOAD without changing any + vaddrs. 
Output runs identically to the original.""" print("\n=== Testing across LOAD-segment layouts ===") variants = [ ('default', ['gcc', '-pie', '-O2']), ('combined', ['gcc', '-pie', '-O2', '-Wl,-z,noseparate-code']), ] + if subprocess.run(['gcc', '-fuse-ld=lld', '-x', 'c', '-', '-o', '/dev/null'], + input='int main(){}', capture_output=True, text=True).returncode == 0: + variants.append(('lld', ['gcc', '-pie', '-O2', '-fuse-ld=lld'])) + else: + print(" (lld not available — skipping rodata-in-seg0 variant)") for name, cmd in variants: src_exe = f'test1_{name}.exe' @@ -297,6 +330,11 @@ def test_load_layouts(): if original != modified: raise RuntimeError(f"{name}: output mismatch (orig={original!r}, new={modified!r})") + + # Every variant must establish p_offset%2MB == p_vaddr%2MB after + # the transform. The main path achieves this by shifting; the + # padding-only path achieves it by inserting file padding. + check_offset_vaddr_mod_2mb_match(huge_exe) print(f" {name}: {original}") print("Layout tests passed!") From 943d2800d6ac09aeac844716b36dcc3a11bc1d9d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 29 Apr 2026 20:46:01 +0300 Subject: [PATCH 3/3] lld-style PIE: end-aligned section-aware shift (replaces padding-only path) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The padding-only path added in ceec465 only fixed the file-side mod-2MB alignment of LOAD RE without changing any vaddr — so lld-style binaries became correct but never huge-page-eligible. This commit replaces it with a transformation that does enable code huge pages on lld-style PIE. What's new: - AdjInfo carries a list of "movable seg0" vaddr ranges: sections in seg0 that are SHF_ALLOC, non-NOBITS, and NOT in relocatable_section_types (.rodata, .eh_frame, .eh_frame_hdr, .gcc_except_table). 
calc_adjusted_addr remaps addresses inside those ranges by the same vaddr_delta as everything at-or-after old_exec_vaddr, so RIP-relative LEAs from .text into .rodata stay valid after the shift. Empty for non-lld binaries (the existing behavior). - adjust_program_headers extends seg0 LOAD R's filesz/memsz to cover the shifted seg0 contents, clamps LOAD RE's p_vaddr to max(round_down(p_vaddr,2MB), seg0_end_after_shift) so seg0 LOAD R and LOAD RE never overlap in vaddr space, and shifts PT_GNU_EH_FRAME (which targets a movable .eh_frame_hdr). - adjust_section_headers shifts sh_offset for movable seg0 sections; seg0 has p_vaddr == p_offset == 0, so the file delta equals vaddr_delta. - segment_offset_delta for lld-style is exec_p_vaddr_clamped - old_p_offset (LOAD RE's file region starts where extended seg0 ends); section_offset_delta accounts for the clamp so every section in LOAD RE has sh_offset_new - sh_addr_new == p_offset_new - p_vaddr_clamped (kernel constraint for a single LOAD's file mapping). - pad_segment_start now fills the gap between the last non-exec section and the first executable section in LOAD RE — never below p_vaddr or over metadata. This avoids clobbering ELF header / PHDR / .interp / .note in the 2-LOAD R+E first ("combined" -z noseparate-code) layout. - pad_offset_to_match_vaddr removed. For modern PIE (4-LOAD with metadata-only seg0) and 2-LOAD R+E first ("combined") the new code is a no-op via the seg0_end_after_shift = 0 clamp degeneration. Tests: - check_segment_alignment unchanged for the modern path. - New check_exec_load_end_aligned: every variant must have LOAD RE's end on a 2MB boundary. - check_re_chunk_isolation relaxed to require only that fully-covered 2MB chunks be exclusive code (partial chunks at the start/end of LOAD RE can legitimately share their range with adjacent LOAD R / LOAD RW). - All three checks (offset/vaddr-mod, end-aligned, chunk-isolation) wired into every test_load_layouts variant including lld. 
Verification: - test_basic + test_load_layouts (default, combined, lld) + TLS + TLS-relocs all pass. - Real-world smoke test on lld-style PIE: sublime_text (Build 4180), Discord (0.0.135), slack (4.42.117) all run identically; LOAD RE ends at 2MB, full chunks isolated. - libjvm.so (Oracle JDK 21.0.11, 2-LOAD R+E first / 20MB code) runs the full Java workload (JIT, GC, Streams, ConcurrentHashMap, Executors, recursion) bit-identical to the original. - Booted under /boot/vmlinuz-6.14.11rothp (READ_ONLY_THP_FOR_FS=y) with the hugified libjvm.so: THPeligible=1 (was 0 on host), khugepaged collapsed 16384 kB into 8 file-PMD-mapped 2MB pages on the libjvm.so r-xp mapping after running the workload. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/hugifyr.c | 350 ++++++++++++++++++++++++++++++++------------------ tests/test.py | 84 ++++++++---- 2 files changed, 280 insertions(+), 154 deletions(-) diff --git a/src/hugifyr.c b/src/hugifyr.c index 8c10645..1088ccb 100644 --- a/src/hugifyr.c +++ b/src/hugifyr.c @@ -129,6 +129,11 @@ typedef struct { offset_entry *loclist_offsets; } ElfInfo; +typedef struct { + GElf_Addr old_lo; // inclusive + GElf_Addr old_hi; // exclusive +} AddrRange; + typedef struct { GElf_Addr old_exec_vaddr; GElf_Addr vaddr_delta; @@ -138,6 +143,18 @@ typedef struct { bool adjust_offsets; bool sections_adjusted; bool adjust_debug; + + // lld-style support: vaddr ranges in seg0 that aren't in the + // relocatable_section_types whitelist. They sit below old_exec_vaddr but + // are RIP-referenced from .text (.rodata, .eh_frame*, .gcc_except_table), + // so they must shift by vaddr_delta along with the rest of the binary. + // Empty (n_movable_seg0 == 0) for non-lld-style binaries; in that case + // calc_adjusted_addr falls through to the legacy "shift if addr >= + // old_exec_vaddr" logic. 
+ AddrRange *movable_seg0_ranges; + size_t n_movable_seg0; + GElf_Addr seg0_end_after_shift; // post-shift highest end of any seg0 section + GElf_Addr exec_p_vaddr_clamped; // LOAD RE's final p_vaddr (max(round_down, seg0_end)) } AdjInfo; typedef struct { @@ -208,6 +225,10 @@ static void cleanup_free_uchar(unsigned char **ptr) { cleanup_free((void **)ptr); } +static void cleanup_free_addrrange(AddrRange **ptr) { + cleanup_free((void **)ptr); +} + static void cleanup_close(int *fd) { if (*fd != -1) { close(*fd); @@ -408,11 +429,27 @@ static void write_address(void *p, uint64_t value, uint8_t address_size) { *(uint32_t *)p = value; } +static bool addr_in_movable_seg0(AdjInfo *adj_info, GElf_Addr addr) { + for (size_t i = 0; i < adj_info->n_movable_seg0; i++) { + if (addr >= adj_info->movable_seg0_ranges[i].old_lo && + addr < adj_info->movable_seg0_ranges[i].old_hi) { + return true; + } + } + return false; +} + static GElf_Addr calc_adjusted_addr(AdjInfo *adj_info, GElf_Addr addr) { if (addr >= adj_info->old_exec_vaddr) { return addr + adj_info->vaddr_delta; } + // lld-style: addresses in movable seg0 sections shift by the same delta + // (relocation addends, symbol values, dynamic-section pointers, DWARF + // references that target .rodata / .eh_frame* etc.). + if (addr_in_movable_seg0(adj_info, addr)) { + return addr + adj_info->vaddr_delta; + } return addr; } @@ -1382,7 +1419,23 @@ static void adjust_program_headers(ElfInfo *info, AdjInfo *adj_info) { if ( i == (size_t)info->first_load_index) { phdr->p_align = HUGE_PAGE_SIZE; } - + + // lld-style: seg0 LOAD covers movable sections that shift forward. + // Extend its filesz/memsz to cover the post-shift end. Don't change + // its p_vaddr/p_offset (still 0). 
+ if (i == (size_t)info->first_load_index && + info->first_load_index != info->exec_index && + adj_info->n_movable_seg0 > 0) { + GElf_Addr new_end = adj_info->seg0_end_after_shift; + if (new_end > phdr->p_vaddr + phdr->p_memsz) { + GElf_Xword new_size = new_end - phdr->p_vaddr; + phdr->p_memsz = new_size; + if (adj_info->adjust_offsets) { + phdr->p_filesz = new_size; + } + } + } + if (phdr->p_vaddr >= (size_t)adj_info->old_exec_vaddr) { uint64_t old_end_addr = phdr->p_vaddr + phdr->p_memsz; @@ -1393,18 +1446,29 @@ static void adjust_program_headers(ElfInfo *info, AdjInfo *adj_info) { if (i == (size_t)info->exec_index) { uint64_t old_end_offset_in_page = old_end_addr % PAGE_SIZE; uint64_t offset_in_page = phdr->p_vaddr % PAGE_SIZE; - uint64_t memsz = round_up(phdr->p_memsz, HUGE_PAGE_SIZE) - - offset_in_page - + uint64_t memsz = round_up(phdr->p_memsz, HUGE_PAGE_SIZE) - + offset_in_page - (PAGE_SIZE - old_end_offset_in_page); - // calculate the + // calculate the phdr->p_memsz = memsz; if (adj_info->adjust_offsets) { phdr->p_filesz = phdr->p_memsz; phdr->p_align = HUGE_PAGE_SIZE; } - } + } + } else if (addr_in_movable_seg0(adj_info, phdr->p_vaddr)) { + // lld-style: phdr (e.g. PT_GNU_EH_FRAME) targets a section + // that's been remapped into the new vaddr range. Shift in + // lockstep. seg0 has p_vaddr == p_offset, so the file delta + // matches vaddr_delta. + phdr->p_vaddr += adj_info->vaddr_delta; + phdr->p_paddr += adj_info->vaddr_delta; + if (adj_info->adjust_offsets) { + phdr->p_offset += adj_info->vaddr_delta; + } + continue; } if (adj_info->adjust_offsets) { @@ -1413,8 +1477,21 @@ static void adjust_program_headers(ElfInfo *info, AdjInfo *adj_info) { // that is then added at the beginning of the executable segment. 
if (phdr->p_offset == adj_info->old_exec_offset) { phdr->p_offset += adj_info->segment_offset_delta; - phdr->p_vaddr = round_down(phdr->p_vaddr, HUGE_PAGE_SIZE); - phdr->p_paddr = round_down(phdr->p_paddr, HUGE_PAGE_SIZE); + // adj_info->exec_p_vaddr_clamped == max(round_down(p_vaddr, + // 2MB), seg0_end_after_shift). For non-lld binaries this + // equals round_down (a no-op, since new_exec_sec_vaddr was + // chosen so the LOAD RE rounds down cleanly). For lld-style + // it pushes LOAD RE's start up to seg0_end_after_shift so + // seg0 LOAD R and LOAD RE don't overlap in vaddr space. + GElf_Addr rounded = round_down(phdr->p_vaddr, HUGE_PAGE_SIZE); + GElf_Addr new_pvaddr = adj_info->exec_p_vaddr_clamped; + GElf_Xword shrink = new_pvaddr - rounded; + phdr->p_vaddr = new_pvaddr; + phdr->p_paddr = new_pvaddr; + if (shrink > 0 && i == (size_t)info->exec_index) { + if (phdr->p_memsz > shrink) phdr->p_memsz -= shrink; + if (phdr->p_filesz > shrink) phdr->p_filesz -= shrink; + } } else if (phdr->p_offset > adj_info->old_exec_offset) { // Note that section_offset_delta is also for next sections phdr->p_offset += adj_info->section_offset_delta; @@ -1440,6 +1517,8 @@ static void adjust_section_headers(ElfInfo *info, AdjInfo *adj_info) { struct elf_section *sec = &info->sections[i]; uint64_t old_addr = sec->shdr.sh_addr; uint64_t new_addr = calc_adjusted_addr(adj_info, old_addr); + bool is_movable_seg0 = old_addr < adj_info->old_exec_vaddr && + addr_in_movable_seg0(adj_info, old_addr); if (old_addr != new_addr) { n_updated++; @@ -1447,8 +1526,15 @@ static void adjust_section_headers(ElfInfo *info, AdjInfo *adj_info) { sec->shdr.sh_addr = new_addr; } - if (adj_info->adjust_offsets && sec->shdr.sh_offset >= adj_info->old_exec_offset) { - sec->shdr.sh_offset += adj_info->section_offset_delta; + if (adj_info->adjust_offsets) { + if (sec->shdr.sh_offset >= adj_info->old_exec_offset) { + sec->shdr.sh_offset += adj_info->section_offset_delta; + } else if (is_movable_seg0) { + // 
lld-style: this section sits in seg0 (file offset below + // exec) but shifts forward in vaddr. seg0 has p_vaddr == + // p_offset == 0, so its file delta equals vaddr_delta. + sec->shdr.sh_offset += adj_info->vaddr_delta; + } } } pr_info("Adjusted %zu section headers\n", n_updated); @@ -1462,7 +1548,7 @@ static void adjust_section_headers(ElfInfo *info, AdjInfo *adj_info) { if (adj_info->adjust_offsets) { if (info->ehdr.e_shoff > adj_info->old_exec_offset) { info->ehdr.e_shoff = round_up(info->ehdr.e_shoff + adj_info->section_offset_delta, 8); - } + } } } @@ -2683,40 +2769,53 @@ static int pad_segment_start(ElfInfo *info, AdjInfo *adj_info) { } GElf_Phdr *exec_phdr = &info->phdrs[info->exec_index]; - - // Find the first section in the executable segment - struct elf_section *first_sec = NULL; - GElf_Addr lowest_addr = UINT64_MAX; - + + // Pad the gap between the last non-executable byte and the first + // executable section in LOAD RE. "Last non-executable byte" = max end + // of any non-SHF_EXECINSTR section in the segment, or the segment's + // p_vaddr if there are no such sections. This gives: + // - modern path (LOAD RE has only code): pad [p_vaddr, .text). Zero + // bytes when p_vaddr is 2MB-aligned and .text starts there. + // - 2-LOAD R+E first / "combined": LOAD RE holds metadata (.interp, + // .note, .dynsym, ...) at low vaddrs; the gap from last metadata + // to first code is small or zero; we don't clobber metadata or + // ELF/PHDR which sit in the same range. + // - lld-style: seg0 movable sections live in seg0 LOAD R; LOAD RE + // starts at seg0_end_after_shift. We pad [LOAD_RE.p_vaddr, + // .text) — typically a sub-page sliver. 
+ GElf_Addr seg_lo = exec_phdr->p_vaddr; + GElf_Addr seg_hi = exec_phdr->p_vaddr + exec_phdr->p_memsz; + GElf_Addr first_exec_addr = UINT64_MAX; + GElf_Addr non_exec_end = seg_lo; + for (size_t i = 0; i < info->shnum; i++) { struct elf_section *sec = &info->sections[i]; - - // Check if section is in this segment - if (sec->shdr.sh_addr >= exec_phdr->p_vaddr && - sec->shdr.sh_addr < exec_phdr->p_vaddr + exec_phdr->p_memsz) { - - if (sec->shdr.sh_addr < lowest_addr) { - first_sec = sec; - lowest_addr = sec->shdr.sh_addr; + if (sec->shdr.sh_addr < seg_lo || sec->shdr.sh_addr >= seg_hi) + continue; + if (sec->shdr.sh_flags & SHF_EXECINSTR) { + if (sec->shdr.sh_addr < first_exec_addr) { + first_exec_addr = sec->shdr.sh_addr; + } + } else { + GElf_Addr end = sec->shdr.sh_addr + sec->shdr.sh_size; + if (end > non_exec_end) { + non_exec_end = end; } } } - if (!first_sec) { - pr_error("Could not find first section in executable segment\n"); + if (first_exec_addr == UINT64_MAX) { + pr_error("Could not find first executable section in executable segment\n"); return -1; } - // Calculate padding needed between segment start and first section - GElf_Addr segment_start_addr = round_up(exec_phdr->p_vaddr, HUGE_PAGE_SIZE); - size_t padding_size = first_sec->shdr.sh_addr - segment_start_addr; - - if (padding_size == 0) { - return 0; // No padding needed + if (first_exec_addr <= non_exec_end) { + return 0; // metadata abuts code — nothing to pad } - // Calculate file offset where padding should go - GElf_Off padding_offset = exec_phdr->p_offset + (segment_start_addr - exec_phdr->p_vaddr); + GElf_Addr pad_start_addr = non_exec_end; + size_t padding_size = first_exec_addr - pad_start_addr; + GElf_Off padding_offset = exec_phdr->p_offset + (pad_start_addr - exec_phdr->p_vaddr); // Create a buffer for the int3 instructions __attribute__((cleanup(cleanup_free))) void *padding = malloc(padding_size); @@ -2977,81 +3076,6 @@ static int seg0_has_movable_sections(ElfInfo *info) { return 0; } 
-// Padding-only transformation for lld-style PIE binaries. The main path -// would shift .text without shifting the .rodata that lives in seg0, -// breaking RIP-relative LEAs from code. As a first safe step, this -// transform only fixes the file-side mod-2MB alignment of LOAD RE: it -// inserts file padding before the exec segment so that -// (p_offset_RE + delta) % 2MB == p_vaddr_RE % 2MB. It does NOT change any -// virtual address, does NOT move any section, does NOT extend memsz, and -// does NOT touch relocations / symbols / DWARF / build-id. The output -// runs identically to the input. Establishing this property is the -// precondition for a future stage that will move the seg0 RIP-referenced -// sections and align RE to a 2MB boundary. -static int pad_offset_to_match_vaddr(ElfInfo *info) { - ssize_t first_load_idx = -1; - ssize_t exec_idx = -1; - for (size_t i = 0; i < info->phnum; i++) { - if (info->phdrs[i].p_type != PT_LOAD) continue; - if (first_load_idx < 0) first_load_idx = i; - if ((info->phdrs[i].p_flags & PF_X) && exec_idx < 0) exec_idx = i; - } - if (first_load_idx < 0 || exec_idx < 0) { - pr_error("padding-only path: missing first or executable LOAD\n"); - return ERR_FATAL; - } - - GElf_Phdr *first_load = &info->phdrs[first_load_idx]; - GElf_Phdr *exec_phdr = &info->phdrs[exec_idx]; - GElf_Off old_exec_offset = exec_phdr->p_offset; - GElf_Addr exec_vaddr = exec_phdr->p_vaddr; - - uint64_t v_mod = exec_vaddr % HUGE_PAGE_SIZE; - uint64_t o_mod = old_exec_offset % HUGE_PAGE_SIZE; - uint64_t delta = (v_mod + HUGE_PAGE_SIZE - o_mod) % HUGE_PAGE_SIZE; - - pr_info("Padding-only path (lld-style): exec_vaddr=0x%lx exec_offset=0x%lx, delta=0x%lx\n", - exec_vaddr, old_exec_offset, delta); - - // Always set p_align on the first LOAD so the kernel picks a 2MB- - // aligned load base. Harmless for the runtime; necessary precondition - // for future huge-page enablement. 
- first_load->p_align = HUGE_PAGE_SIZE; - - if (delta == 0) { - pr_info("offset already aligns with vaddr modulo 2MB; only setting p_align\n"); - return ERR_SUCCESS; - } - - // Bump p_offset for every phdr at or after the original exec offset. - // PT_LOAD, PT_DYNAMIC, PT_GNU_RELRO, PT_GNU_EH_FRAME, PT_TLS, PT_NOTE - // — anything whose file location sits in or after the executable's - // file region must shift in lockstep. - for (size_t i = 0; i < info->phnum; i++) { - GElf_Phdr *p = &info->phdrs[i]; - if (p->p_type != PT_NULL && p->p_offset >= old_exec_offset) { - p->p_offset += delta; - } - } - - // Bump sh_offset for every section at or after the original exec - // offset and mark dirty so libelf rewrites the data at the new offset. - for (size_t i = 0; i < info->shnum; i++) { - struct elf_section *sec = &info->sections[i]; - if (sec->shdr.sh_offset >= old_exec_offset && sec->shdr.sh_type != SHT_NULL) { - sec->shdr.sh_offset += delta; - mark_section_dirty(sec); - } - } - - // Bump section header table offset if it lives past the exec offset. - if (info->ehdr.e_shoff >= old_exec_offset) { - info->ehdr.e_shoff += delta; - } - - return ERR_SUCCESS; -} - static int process_elf(const char *filename, uint32_t flags) { __attribute__((cleanup(cleanup_free_elf_info))) ElfInfo *elf_info = init_elf(filename); @@ -3070,23 +3094,6 @@ static int process_elf(const char *filename, uint32_t flags) { init_sections_ordered_by_offset(elf_info); - // lld-style PIE detection: seg0 carries .rodata / .eh_frame* / - // .gcc_except_table that are RIP-referenced from code. The main - // shifting path would crash these binaries at runtime. Route them - // through a safer padding-only transform that only fixes file-side - // mod-2MB alignment without disturbing any vaddr. Huge-page - // enablement is left to a future stage. 
- if (elf_info->ehdr.e_type == ET_DYN && seg0_has_movable_sections(elf_info)) { - if (pad_offset_to_match_vaddr(elf_info) != 0) { - return ERR_FATAL; - } - if (write_elf(elf_info)) { - pr_error("Failed to write ELF file\n"); - return ERR_FATAL; - } - return ERR_SUCCESS; - } - if (elf_info->ehdr.e_type != ET_DYN) { pr_error("Input file is not position-independent code, trying fallback path\n"); if (handle_fallback_hugepage_alignment(elf_info) != 0) { @@ -3148,12 +3155,95 @@ static int process_elf(const char *filename, uint32_t flags) { uint64_t new_exec_sec_vaddr = round_down(new_p_vaddr_end - old_aligned_p_memsz, exec_phdr.p_align); uint64_t vaddr_delta = new_exec_sec_vaddr - exec_phdr.p_vaddr; - uint64_t segment_offset_delta = round_up_delta(exec_phdr.p_offset, HUGE_PAGE_SIZE); - uint64_t section_offset_delta = segment_offset_delta + (new_exec_sec_vaddr % HUGE_PAGE_SIZE); + // lld-style PIE detection: seg0 carries .rodata / .eh_frame* / + // .gcc_except_table that are RIP-referenced from .text. We can't leave + // them at their original vaddrs (the shift would break the RIP-relative + // LEAs), so collect their ranges here so calc_adjusted_addr can shift + // them by the same vaddr_delta as everything at-or-after the exec + // segment. Whitelisted seg0 metadata (.dynsym, .gnu.hash, .rela.*, ...) + // stays put. seg0_end_after_shift is the post-shift highest end of any + // seg0 section — used as the lower clamp on LOAD RE's p_vaddr so seg0 + // LOAD R and LOAD RE never overlap in vaddr space. 
+ AddrRange *movable_seg0_ranges __attribute__((cleanup(cleanup_free_addrrange))) = NULL; + size_t n_movable_seg0 = 0; + GElf_Addr seg0_end_after_shift = 0; + + if (seg0_has_movable_sections(elf_info)) { + movable_seg0_ranges = malloc(elf_info->shnum * sizeof(*movable_seg0_ranges)); + if (movable_seg0_ranges == NULL) { + pr_error("Failed to allocate movable seg0 ranges\n"); + return ERR_FATAL; + } + + for (size_t i = 0; i < elf_info->shnum; i++) { + struct elf_section *sec = &elf_info->sections[i]; + if (sec->shdr.sh_addr == 0 || sec->shdr.sh_addr >= exec_phdr.p_vaddr) continue; + if ((sec->shdr.sh_flags & SHF_ALLOC) == 0) continue; + if (sec->shdr.sh_type == SHT_NOBITS) continue; + + bool whitelisted = false; + for (size_t j = 0; j < array_size(relocatable_section_types); j++) { + if (sec->shdr.sh_type == relocatable_section_types[j].type && + (relocatable_section_types[j].name == NULL || + strcmp(sec->name, relocatable_section_types[j].name) == 0)) { + whitelisted = true; + break; + } + } + + GElf_Addr sec_end = sec->shdr.sh_addr + sec->shdr.sh_size; + GElf_Addr post_end = whitelisted ? sec_end : sec_end + vaddr_delta; + if (post_end > seg0_end_after_shift) { + seg0_end_after_shift = post_end; + } + if (!whitelisted) { + movable_seg0_ranges[n_movable_seg0].old_lo = sec->shdr.sh_addr; + movable_seg0_ranges[n_movable_seg0].old_hi = sec_end; + n_movable_seg0++; + pr_debug(" movable seg0: '%s' [0x%lx, 0x%lx) -> [0x%lx, 0x%lx)\n", + sec->name, (uint64_t)sec->shdr.sh_addr, (uint64_t)sec_end, + (uint64_t)(sec->shdr.sh_addr + vaddr_delta), + (uint64_t)(sec_end + vaddr_delta)); + } + } + pr_info("lld-style seg0: %zu movable section(s); seg0_end_after_shift=0x%lx\n", + n_movable_seg0, (uint64_t)seg0_end_after_shift); + } + + // Compute the LOAD RE p_vaddr that adjust_program_headers will end up + // setting (max(round_down(new_exec_sec_vaddr, 2MB), seg0_end_after_shift)). 
+ // For modern paths this equals round_down(new_exec_sec_vaddr, 2MB); + // for lld-style with seg0_end_after_shift > round_down it equals + // seg0_end_after_shift. section_offset_delta must be derived from this + // clamped p_vaddr so that for every section in LOAD RE, + // sh_offset_new - sh_addr_new == p_offset_new - p_vaddr_clamped. + // Without the clamp factored in, .text's sh_offset would point to a + // file location outside LOAD RE's file region, and the kernel would + // load garbage at .text's vaddr. + GElf_Addr exec_p_vaddr_round_down = round_down(new_exec_sec_vaddr, HUGE_PAGE_SIZE); + GElf_Addr exec_p_vaddr_clamped = exec_p_vaddr_round_down > seg0_end_after_shift + ? exec_p_vaddr_round_down + : seg0_end_after_shift; + // segment_offset_delta brings p_offset up to a place where + // (a) it doesn't overlap the extended seg0 LOAD R file region, and + // (b) p_offset_new % 2MB == p_vaddr_clamped % 2MB (kernel constraint + // for placing LOAD RE on a 2MB-aligned VMA). + // For lld-style we extended seg0 LOAD R's filesz to exec_p_vaddr_clamped + // (since seg0 has p_offset == p_vaddr == 0); LOAD RE's file region must + // start at exec_p_vaddr_clamped, which automatically gives us the + // matching modulo. For modern paths where seg0 wasn't extended, the + // existing round-up-to-2MB formula already aligns things. 
+ uint64_t segment_offset_delta; + if (n_movable_seg0 > 0) { + segment_offset_delta = exec_p_vaddr_clamped - exec_phdr.p_offset; + } else { + segment_offset_delta = round_up_delta(exec_phdr.p_offset, HUGE_PAGE_SIZE); + } + uint64_t section_offset_delta = segment_offset_delta + (new_exec_sec_vaddr - exec_p_vaddr_clamped); pr_info("Original vaddr: 0x%lx, size: 0x%lx\n", exec_phdr.p_vaddr, exec_phdr.p_memsz); pr_info("New vaddr: 0x%lx, new size: 0x%lx\n", new_exec_sec_vaddr, huge_aligned_size); - pr_info("vaddr delta: 0x%lx\n", vaddr_delta); + pr_info("vaddr delta: 0x%lx, exec p_vaddr clamped: 0x%lx\n", vaddr_delta, (uint64_t)exec_p_vaddr_clamped); if (flags & FLAG_FILE_PADDING) { pr_info("Segment offset delta: 0x%lx, section offset delta: 0x%lx\n", segment_offset_delta, section_offset_delta); pr_info("New offset: 0x%lx\n", new_exec_p_offset); @@ -3167,6 +3257,10 @@ static int process_elf(const char *filename, uint32_t flags) { .section_offset_delta = section_offset_delta, .adjust_offsets = flags & FLAG_FILE_PADDING, .adjust_debug = flags & FLAG_DEBUG_UPDATE, + .movable_seg0_ranges = movable_seg0_ranges, + .n_movable_seg0 = n_movable_seg0, + .seg0_end_after_shift = seg0_end_after_shift, + .exec_p_vaddr_clamped = exec_p_vaddr_clamped, }; // Adjust the ELF structure diff --git a/tests/test.py b/tests/test.py index 31e86f7..77b8408 100644 --- a/tests/test.py +++ b/tests/test.py @@ -62,44 +62,68 @@ def check_offset_vaddr_mod_2mb_match(filename, huge=0x200000): def check_re_chunk_isolation(filename, huge=0x200000): - """Verify that no 2MB chunk touched by LOAD RE is also touched by any - other LOAD's vaddr range. This is the hugifyr project's central goal: - every 2MB chunk that contains code must be exclusively code so the - kernel can grant code huge pages. 
Any other LOAD overlapping (by - mmap-order overlay or by sitting past RE's end inside the same chunk) - blocks the chunk from being huge-page eligible.""" + """Verify that every 2MB chunk *fully covered* by LOAD RE is not + touched by any other LOAD's vaddr range. The fully-covered chunks + are the ones the kernel can promote to a code huge page; partial + chunks at the start/end of LOAD RE can legitimately share their + range with the adjacent LOAD R / LOAD RW (e.g. lld-style binaries + place the first partial RE chunk inside seg0's vaddr range, which + is fine because a partial chunk is never huge-page eligible).""" loads = parse_load_segments(filename) exec_load = next((l for l in loads if 'E' in l['flags']), None) if not exec_load: raise RuntimeError(f"{filename}: no executable LOAD") re_lo = exec_load['vaddr'] - re_hi = exec_load['vaddr'] + exec_load['memsz'] - re_first_chunk = re_lo & ~(huge - 1) - re_last_chunk_top = ((re_hi - 1) & ~(huge - 1)) + huge + re_hi = re_lo + exec_load['memsz'] + full_lo = (re_lo + huge - 1) & ~(huge - 1) # first fully-covered chunk start + full_hi = re_hi & ~(huge - 1) # one-past last fully-covered chunk start + + if full_hi <= full_lo: + print(f"RE chunk isolation: no fully-covered 2MB chunks in {filename} " + f"(RE [0x{re_lo:x}, 0x{re_hi:x}); too small)") + return for l in loads: if l is exec_load: continue l_lo = l['vaddr'] - l_hi = l['vaddr'] + l['memsz'] - # Does this LOAD's vaddr range intersect any 2MB chunk that RE touches? - if l_lo < re_last_chunk_top and l_hi > re_first_chunk: + l_hi = l_lo + l['memsz'] + if l_lo < full_hi and l_hi > full_lo: raise RuntimeError( f"{filename}: LOAD vaddr 0x{l_lo:x}-0x{l_hi:x} (flags '{l['flags']}') " - f"intersects an RE 2MB chunk in [0x{re_first_chunk:x}, 0x{re_last_chunk_top:x}). " - f"RE end is 0x{re_hi:x}; mmap-order will mix protections in shared chunks " + f"intersects a fully-covered RE 2MB chunk in [0x{full_lo:x}, 0x{full_hi:x}). 
" + f"RE [0x{re_lo:x}, 0x{re_hi:x}); mmap-order will mix protections in shared chunks " f"and the kernel can't grant code huge pages.") print(f"RE chunk isolation OK in {filename} " - f"(RE [0x{re_lo:x}, 0x{re_hi:x}), no other LOAD intersects " - f"[0x{re_first_chunk:x}, 0x{re_last_chunk_top:x}))") + f"(RE [0x{re_lo:x}, 0x{re_hi:x}), full chunks [0x{full_lo:x}, 0x{full_hi:x}) clear)") + + +def check_exec_load_end_aligned(filename, huge=0x200000): + """Verify that for the executable LOAD, p_vaddr + p_memsz lands on a + 2MB boundary AND p_align is 2MB. The END being aligned is what makes + the last code huge page eligible — it's required regardless of + whether the START is also 2MB-aligned. (lld-style transformed + binaries have a non-aligned p_vaddr clamped to seg0_end_after_shift, + but the end is still extended to a 2MB boundary.)""" + loads = parse_load_segments(filename) + exec_load = next((l for l in loads if 'E' in l['flags']), None) + if not exec_load: + raise RuntimeError(f"{filename}: no executable LOAD") + end = exec_load['vaddr'] + exec_load['memsz'] + if end % huge != 0: + raise RuntimeError( + f"{filename}: exec LOAD end 0x{end:x} (vaddr=0x{exec_load['vaddr']:x} + " + f"memsz=0x{exec_load['memsz']:x}) is not 2MB-aligned") + print(f"exec LOAD end 2MB-aligned OK in {filename} " + f"(end=0x{end:x}, vaddr=0x{exec_load['vaddr']:x}, memsz=0x{exec_load['memsz']:x})") def check_segment_alignment(filename): """Check if executable segment is properly 2MB aligned""" ALIGN_SIZE = 0x200000 # 2MB output = run_command(['readelf', '-Wl', filename]) - + # Parse readelf output looking for LOAD segments with execute permission lines = output.splitlines() i = 0 @@ -109,7 +133,7 @@ def check_segment_alignment(filename): # Check if this line has flags (ARM64 format) or next line has flags (x86_64 format) has_exec = False flags_line = line - + # Check current line for flags if ' R E ' in line or ' RWE ' in line: has_exec = True @@ -118,26 +142,26 @@ def 
check_segment_alignment(filename): elif i + 1 < len(lines) and (' R E ' in lines[i + 1] or ' RWE ' in lines[i + 1]): has_exec = True flags_line = lines[i + 1] - + if has_exec: # Parse LOAD segment values load_match = re.search(r'LOAD\s+0x([0-9a-f]+)\s+0x([0-9a-f]+)', line) if not load_match: raise RuntimeError(f"Failed to parse LOAD segment in {filename}") - + offset = int(load_match.group(1), 16) vaddr = int(load_match.group(2), 16) - + # Parse alignment - could be on same line or flags line align_match = re.search(r'0x([0-9a-f]+)\s*$', flags_line) if not align_match: align_match = re.search(r'0x([0-9a-f]+)\s*$', line) - + if not align_match: print(f"Line: {line}") print(f"Flags line: {flags_line}") raise RuntimeError(f"Failed to parse alignment in {filename}") - + align = int(align_match.group(1), 16) if align != ALIGN_SIZE: @@ -149,7 +173,7 @@ def check_segment_alignment(filename): print(f"Segment alignment OK in {filename}") return True i += 1 - + raise RuntimeError(f"No executable segment found in {filename}") def test_basic(): @@ -332,9 +356,17 @@ def test_load_layouts(): raise RuntimeError(f"{name}: output mismatch (orig={original!r}, new={modified!r})") # Every variant must establish p_offset%2MB == p_vaddr%2MB after - # the transform. The main path achieves this by shifting; the - # padding-only path achieves it by inserting file padding. + # the transform. modern paths shift to a 2MB-aligned p_vaddr; + # lld-style clamps p_vaddr to seg0_end_after_shift and pads the + # file to match. check_offset_vaddr_mod_2mb_match(huge_exe) + # Every variant must end-align LOAD RE so the kernel can grant a + # huge page on the last 2MB chunk of code. + check_exec_load_end_aligned(huge_exe) + # Every variant must keep LOAD RE's last 2MB chunk free of any + # other LOAD's vaddr range. This is the central goal: code chunks + # must be exclusively code. + check_re_chunk_isolation(huge_exe) print(f" {name}: {original}") print("Layout tests passed!")